In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb

In [28]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
df_example = pd.read_csv('./data/sample_submission.csv')

In [29]:
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Feature engineering

In [30]:
# Extract deck and side
df_train.Cabin = df_train.Cabin.fillna('none/none/none')
df_train[['Deck','Num','Side']] = df_train.Cabin.str.split('/', expand=True)
df_train.loc[df_train.Deck=='none',['Deck','Side']] = [None, None]

# Remove blanck spaces in Destination
df_train.Destination = df_train.Destination.str.replace(' ','_')

# Dummy variables
dummies = ['HomePlanet','Destination','Deck','Side']

# Fill missing values with mode
for col in dummies:
    df_train[col].fillna(df_train[col].mode()[0], inplace=True)

# Dummie variables from this columns
df_train = pd.get_dummies(df_train, columns=dummies)

# Convert CryoSleep to dummy
df_train.CryoSleep.fillna(df_train.CryoSleep.mode()[0], inplace=True)
df_train.CryoSleep = df_train.CryoSleep.astype(int)

# Fill missing values and scale Age
df_train.Age.fillna(round(df_train.Age.mean()), inplace=True)
df_train.Age = df_train.Age / df_train.Age.max()

# Classify spend variables
spend = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

for col in spend:
    mean_value = df_train[col].mean()
    df_train[col+'_class'] = 0
    df_train.loc[(df_train[col]>0)&(df_train[col]<mean_value),col+'_class']=0.5
    df_train.loc[(df_train[col]>=mean_value),col+'_class'] = 1

# Variable selection and train-test split

In [31]:
df_train.columns

Index(['PassengerId', 'CryoSleep', 'Cabin', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported',
       'Num', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Destination_55_Cancri_e', 'Destination_PSO_J318.5-22',
       'Destination_TRAPPIST-1e', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D',
       'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Side_P', 'Side_S',
       'RoomService_class', 'FoodCourt_class', 'ShoppingMall_class',
       'Spa_class', 'VRDeck_class'],
      dtype='object')

In [32]:
selected_columns = ['CryoSleep','Age','HomePlanet_Earth','HomePlanet_Europa',
                    'HomePlanet_Mars','Destination_55_Cancri_e',
                    'Destination_PSO_J318.5-22','Destination_TRAPPIST-1e',
                    'Deck_A','Deck_B','Deck_C','Deck_D','Deck_E','Deck_F',
                    'Deck_G','Deck_T','Side_P','Side_S','RoomService_class',
                    'FoodCourt_class','ShoppingMall_class','Spa_class',
                    'VRDeck_class']

X = df_train[selected_columns]
y = df_train['Transported']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=42)

# Models

## Random forest

In [49]:
# Define model
clf_forest = RandomForestClassifier(n_estimators=250, max_depth=10)

# Train model
clf_forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, n_estimators=250)

In [50]:
# Prediction in train set
pred = clf_forest.predict(X_train)
train_acc = accuracy_score(y_train, pred)

# Predictions in test set
pred_test = clf_forest.predict(X_test)
test_acc = accuracy_score(y_test, pred_test)

print(f"""Accuracy in train set: {train_acc}
Accuracy in test set: {test_acc}""")

Accuracy in train set: 0.8431118780557952
Accuracy in test set: 0.765382403680276


In [36]:
confusion_matrix(y_train, pred)

array([[2908,  546],
       [ 542, 2958]])

In [37]:
confusion_matrix(y_test, pred_test)

array([[662, 199],
       [209, 669]])

### Cross validation

In [38]:
parameters = {
    'n_estimators': [150,200,250,300],
    'max_depth': [5,10,15,20,25]
}
forest = RandomForestClassifier()
clf = GridSearchCV(estimator=forest, param_grid=parameters,
                   n_jobs=-1, cv=5)

In [39]:
clf.fit(X, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [5, 10, 15, 20, 25],
                         'n_estimators': [150, 200, 250, 300]})

In [40]:
clf.best_params_

{'max_depth': 10, 'n_estimators': 250}

In [41]:
clf.best_score_

0.7736139905544699

## XGBOOST

In [53]:
clf_xgb = xgb.XGBClassifier()

clf_xgb.fit(X_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [54]:
# Prediction in train set
pred = clf_xgb.predict(X_train)
train_acc = accuracy_score(y_train, pred)

# Predictions in test set
pred_test = clf_xgb.predict(X_test)
test_acc = accuracy_score(y_test, pred_test)

print(f"""Accuracy in train set: {train_acc}
Accuracy in test set: {test_acc}""")

Accuracy in train set: 0.8717285015818234
Accuracy in test set: 0.7521564117308798
