In [343]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from lightgbm import LGBMClassifier

import warnings
import pickle

warnings.filterwarnings("ignore")

In [344]:
# Load data
excel_file_path = "./train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")
df = pd.DataFrame(df)

In [345]:
def extract_first_last(s):
    if pd.isna(s):  # Check if the value is NaN
        return np.nan
    return s[0] + s[-1]

df["Cabin"] = df["Cabin"].apply(extract_first_last)

In [346]:
# Define features and target
def get_X_Y(df):
    X = df.drop(columns=["PassengerId", "Name", "Transported"])
    Y = df["Transported"]
    return X, Y

# Apply the function to the "Cabin" column

X, Y = get_X_Y(df)

In [347]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

# Check columns
print(X_train.columns, X_train.shape)

Index(['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
      dtype='object') (6954, 11)


In [348]:
# Define preprocessing pipeline with both ordinal and one-hot encoding
categorical_features = ["HomePlanet", "CryoSleep", "Destination", "Cabin"]
numerical_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

In [349]:
# Get unique elements for each column
for x in categorical_features:
    print(x, X_train[x].unique())

HomePlanet ['Europa' 'Earth' 'Mars' nan]
CryoSleep [False True nan]
Destination ['55 Cancri e' 'TRAPPIST-1e' nan 'PSO J318.5-22']
Cabin ['CP' 'FS' 'DP' 'EP' 'GS' 'BS' 'DS' 'FP' nan 'BP' 'GP' 'ES' 'CS' 'AP' 'AS'
 'TS' 'TP']


In [350]:
# Separate transformers for categorical and numerical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
    ('power', PowerTransformer())
])

In [351]:
# Combine into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)

# Define model
model = RandomForestClassifier()

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [352]:
# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

In [353]:
# Save the fitted pipeline as a .pkl file
filename_pkl = "model.pkl"
pickle.dump(pipeline, open(filename_pkl, "wb"))
print(f"Model saved as {filename_pkl}")

Model saved as model.pkl


In [354]:
# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.79700977573318


In [355]:
from sklearn.model_selection import cross_val_score

cross_val_score(pipeline, X_test, Y_test, cv=5, scoring="accuracy").mean()

0.7728725694789492

In [356]:
import pandas as pd
import numpy as np
import pickle

# Load the trained model
loaded_model = pickle.load(open('model.pkl', "rb"))

# Define the columns expected by the model
column_names = ['HomePlanet', 'CryoSleep', "Cabin", 'Destination', 'Age', 'VIP', 'RoomService',
                'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

def generate_submission(test_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(test_file)
    df = pd.DataFrame(df)
    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)
    df["Cabin"] = df["Cabin"].apply(extract_first_last)

    # Select the relevant columns
    filtered_df = df[['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'Cabin',
                      'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]

    # Iterate through the rows of the DataFrame and make predictions
    predictions = []
    for index, row in filtered_df.iterrows():
        row_df = pd.DataFrame([row], columns=column_names)
        ans = loaded_model.predict(row_df)
        predictions.append(ans[0])
    
    # Load the original test file to keep the PassengerId column
    original_df = pd.read_csv(test_file)
    original_df['Transported'] = predictions
    # Save the results to a new CSV file
    submission_df = original_df[['PassengerId', 'Transported']]
    submission_df.to_csv('submission.csv', index=False)
    print("Submission file saved as 'submission.csv'")


# Generate the submission
test_file = 'test.csv'
generate_submission(test_file)

Submission file saved as 'submission.csv'
