In [194]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from lightgbm import LGBMClassifier

import warnings
import pickle

warnings.filterwarnings("ignore")

In [195]:
# Load data
excel_file_path = "./train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")
df = pd.DataFrame(df)

In [196]:
# Define features and target
def get_X_Y(df):
    X = df.drop(columns=["PassengerId", "Name", "Transported", "Cabin"])
    Y = df["Transported"]
    return X, Y

X, Y = get_X_Y(df)

In [197]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

# Check columns
print(X_train.columns, X_train.shape)

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
      dtype='object') (6954, 10)


In [198]:
# Define preprocessing pipeline with both ordinal and one-hot encoding
categorical_features = ["HomePlanet", "CryoSleep", "Destination"]
numerical_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

In [199]:
# Get unique elements for each column
for x in categorical_features:
    print(x, X_train[x].unique())

HomePlanet ['Europa' 'Earth' 'Mars' nan]
CryoSleep [False True nan]
Destination ['55 Cancri e' 'TRAPPIST-1e' nan 'PSO J318.5-22']


In [200]:
# Separate transformers for categorical and numerical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
    ('power', PowerTransformer())
])

In [201]:
# Combine into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define model
model = RandomForestClassifier()

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [202]:
# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

In [203]:
# Save the fitted pipeline as a .pkl file
filename_pkl = "model.pkl"
pickle.dump(pipeline, open(filename_pkl, "wb"))
print(f"Model saved as {filename_pkl}")

Model saved as model.pkl


In [204]:
# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.780333525014376


In [205]:
from sklearn.model_selection import cross_val_score

# cross_val_score(pipeline, X_test, Y_test, cv=5, scoring="accuracy").mean()

In [208]:
import csv

loaded_model = pickle.load(open(str('model.pkl'), "rb"))
column_names = X_train.columns
print(column_names)
def generate_submission(test_file):
    with open(test_file, mode='r', newline='') as file:
        # Create a CSV reader object
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip the header
        # Iterate through the rows of the CSV file
        for row in csv_reader:
            print("Original Row:", row)
            # Select columns to keep (2nd, 3rd and from 5th to second last)
            row = [row[1], row[2]] + row[4:-1]
            print("Filtered Row:", row, len(row))
            # Convert the row to a DataFrame
            row_df = pd.DataFrame([row], columns=column_names)
            # Predict using the model
            ans = loaded_model.predict(row_df)
            print("Prediction:", ans)

test_file = 'test.csv'
generate_submission(test_file)

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
      dtype='object')
Original Row: ['0013_01', 'Earth', 'True', 'G/3/S', 'TRAPPIST-1e', '27.0', 'False', '0.0', '0.0', '0.0', '0.0', '0.0', 'Nelly Carsoning']
Filtered Row: ['Earth', 'True', 'TRAPPIST-1e', '27.0', 'False', '0.0', '0.0', '0.0', '0.0', '0.0'] 10
Prediction: [False]
Original Row: ['0018_01', 'Earth', 'False', 'F/4/S', 'TRAPPIST-1e', '19.0', 'False', '0.0', '9.0', '0.0', '2823.0', '0.0', 'Lerome Peckers']
Filtered Row: ['Earth', 'False', 'TRAPPIST-1e', '19.0', 'False', '0.0', '9.0', '0.0', '2823.0', '0.0'] 10
Prediction: [False]
Original Row: ['0019_01', 'Europa', 'True', 'C/0/S', '55 Cancri e', '31.0', 'False', '0.0', '0.0', '0.0', '0.0', '0.0', 'Sabih Unhearfus']
Filtered Row: ['Europa', 'True', '55 Cancri e', '31.0', 'False', '0.0', '0.0', '0.0', '0.0', '0.0'] 10
Prediction: [ True]
Original Row: ['0021_01', 'Europa', 'False', 'C/1/S', 'TR

ValueError: could not convert string to float: ''