In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

### Model Description

#### train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
- PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
- CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- Destination - The planet the passenger will be debarking to.
- Age - The age of the passenger.
- VIP - Whether the passenger has paid for special VIP service during the voyage.
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- Name - The first and last names of the passenger.
- Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

### Data Cleaning & Encoding

In [6]:
def bool_to_int(df, cols):
    for col in cols:
        df[col] = df[col].apply(lambda x: False if pd.isna(x) else x)
        df[col] = df[col].astype(bool).astype(int)


def process_data(data):
    df = data.copy()
    
    # fill na Homeplanet and Destination with Other
    df['HomePlanet'] = df['HomePlanet'].apply(lambda x: 'Other' if pd.isna(x) else x)
    df['Destination'] = df['Destination'].apply(lambda x: 'Other' if pd.isna(x) else x)
    df['Cabin'] = df['Cabin'].apply(lambda x: 'Other/0/Other' if pd.isna(x) else x)

    # Split to Encode
    df[['Passenger_Group','Passenger_Number']] = df['PassengerId'].str.split('_', expand=True).astype(int)
    df[['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']] = df.Cabin.str.split('/', expand=True)

    # Mean filling Age
    mean_age = df.Age.mean()
    df['Age'] = df['Age'].apply(lambda x: mean_age if pd.isna(x) else x)

    # Convert to int
    bool_to_int(df, ['CryoSleep', 'VIP', 'Cabin_Num'])
    
    # Drop Columns
    df.drop(
        ['Cabin', 'PassengerId', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']
        , axis=1, inplace=True
    )

    categorical_columns = ['HomePlanet', 'Destination', 'Cabin_Deck', 'Cabin_Side']
    df = pd.get_dummies(df, categorical_columns, drop_first=True, dtype=int)

    return df

In [7]:
train_df = process_data(train_data)

x_train = train_df.drop(['Transported'], axis=1)
y_train = train_df['Transported']

x_test = process_data(test_data)

In [8]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   int32  
 1   Age                        8693 non-null   float64
 2   VIP                        8693 non-null   int32  
 3   Passenger_Group            8693 non-null   int32  
 4   Passenger_Number           8693 non-null   int32  
 5   Cabin_Num                  8693 non-null   int32  
 6   HomePlanet_Europa          8693 non-null   int32  
 7   HomePlanet_Mars            8693 non-null   int32  
 8   HomePlanet_Other           8693 non-null   int32  
 9   Destination_Other          8693 non-null   int32  
 10  Destination_PSO J318.5-22  8693 non-null   int32  
 11  Destination_TRAPPIST-1e    8693 non-null   int32  
 12  Cabin_Deck_B               8693 non-null   int32  
 13  Cabin_Deck_C               8693 non-null   int32

### Model Selection

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Dictionary with models and simplified parameter grids
models_params = {
    "LogisticRegression": {
        "model": LogisticRegression(max_iter=10000),
        "params": {
            "penalty": ["l1", "l2"],  # Regularization type
            "C": [0.1, 1, 10],  # Regularization strength
            "solver": ["liblinear"],  # Solver to use for optimization
        }
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "params": {
            "criterion": ["gini", "entropy"],  # Split quality measure
            "max_depth": [None, 10, 20],  # Maximum depth of the tree
            "min_samples_split": [2, 10],  # Minimum number of samples required to split a node
            "min_samples_leaf": [1, 2],  # Minimum number of samples required to be a leaf node
        }
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],  # Number of trees in the forest
            "criterion": ["gini", "entropy"],  # Split quality measure
            "max_depth": [None, 10, 20],  # Maximum depth of trees
            "min_samples_split": [2, 10],  # Minimum samples required to split a node
        }
    },
    "SVC": {
        "model": SVC(),
        "params": {
            "C": [1, 10],  # Regularization strength
            "kernel": ["rbf", "linear"],  # Kernel type
            "gamma": ["scale"],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
        }
    },
    "KNeighborsClassifier": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [3, 5, 10],  # Number of neighbors to use
            "weights": ["uniform", "distance"],  # Weight function used in prediction
            "metric": ["euclidean"],  # Distance metric to use
        }
    }
}

In [11]:
from sklearn.model_selection import GridSearchCV

scores = []

for model, params in models_params.items():
    print("\nStarted :", model)
    clf = GridSearchCV(estimator=params["model"],
                       param_grid=params["params"],
                       cv=5,
                       verbose=3,
                       scoring="accuracy")
    clf.fit(x_train, y_train)
    scores.append({
        'model': model,
        'params': clf.best_params_,
        'best_score': clf.best_score_
    })


Started : LogisticRegression
Fitting 5 folds for each of 6 candidates, totalling 30 fits

Started : DecisionTreeClassifier
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Started : RandomForestClassifier
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Started : SVC
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Started : KNeighborsClassifier
Fitting 5 folds for each of 6 candidates, totalling 30 fits


### Use the best model returned

In [13]:
best_model_info = max(scores, key=lambda x: x['best_score'])

print("Best Model:", best_model_info['model'])
print("Best Parameters:", best_model_info['params'])
print("Best Score:", best_model_info['best_score'])

best_model = models_params[best_model_info['model']]['model']
best_model.set_params(**best_model_info['params'])

best_model.fit(x_train, y_train)

Best Model: RandomForestClassifier
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.7241494291588554


In [14]:
best_model.score(x_train, y_train)

0.7925917404808467

### Predict and Output to CSV

In [32]:
predictions = best_model.predict(x_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Transported': predictions})
output.to_csv('output.csv', index=False)

### Export model to Pickle

In [30]:
import pickle

file_name = "spaceship_titanic_model_" + best_model_info['model'] + ".pickle"

with open(file_name, "wb") as op_file:
    pickle.dump(best_model, op_file)