In [1]:
import pandas as pd
import seaborn as sns

In [2]:
train_data = pd.read_csv('train.csv')

### EDA

In [3]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train_data.shape

(8693, 14)

In [5]:
train_data.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [7]:
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [8]:
train_data.isna().sum().sort_values(ascending=False) / train_data.shape[0] * 100

CryoSleep       2.496261
ShoppingMall    2.392730
VIP             2.335212
HomePlanet      2.312205
Name            2.300702
Cabin           2.289198
VRDeck          2.162660
FoodCourt       2.105142
Spa             2.105142
Destination     2.093639
RoomService     2.082135
Age             2.059128
PassengerId     0.000000
Transported     0.000000
dtype: float64

### Pre-processing

In [9]:
def preprocess(data):
    data[["Deck", "Num", "Side"]] = data.Cabin.str.split('/', expand=True)
    data["Deck"] = data.Deck.fillna(data["Deck"].mode().values[0])
    data.loc[data.Side.isna(), "Side"] = data[data.Side.isna()].apply(lambda x: data[data.Deck == x.Deck].Side.mode().values[0], axis=1)
    data['CabinLocation'] = data['Deck'].astype(str) + '_' + data['Side']

    data["Age"] = data["Age"].fillna(data.Age.median())
    data["Age"] = data.Age.astype(int)
    
    data["RoomService"] = data["RoomService"].fillna(0)
    data["FoodCourt"] = data["FoodCourt"].fillna(0)
    data["ShoppingMall"] = data["ShoppingMall"].fillna(0)
    data["Spa"] = data["Spa"].fillna(0)
    data["VRDeck"] = data["VRDeck"].fillna(0)
    
    data["TotalSpending"] = data["RoomService"] + data["FoodCourt"] + data["ShoppingMall"] + data["Spa"] + data["VRDeck"]
    
    data["CryoSleep"] = data.CryoSleep.fillna(data.CryoSleep.mode().values[0])
    data["CryoSleep"] = data.CryoSleep.astype(int)
    
    
    data["VIP"] = data.VIP.fillna(data.VIP.mode().values[0])
    data["VIP"] = data.VIP.astype(int)
    
    data["HomePlanet"] = data.HomePlanet.fillna(data.HomePlanet.mode().values[0])
    data["Destination"] = data.Destination.fillna(data.Destination.mode().values[0])
    
    data['FamilyID'] = data['PassengerId'].str.split('_').str[0]
    data['FamilyGroupSize'] = data.groupby('FamilyID')['PassengerId'].transform('count')
    
    data.drop(columns=["Name", "Cabin", "Num", "PassengerId", "FamilyID"], axis=1, inplace=True)
    
    return data


In [10]:
train_data = preprocess(train_data)

### Feature Engineering

In [11]:
X_train = train_data.drop(columns=["Transported"], axis=1)
y_train = train_data['Transported']

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()

In [14]:
feature_data = scaler.fit_transform(X_train[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Age"]])

In [15]:
feature_matrix = pd.DataFrame(feature_data, columns=scaler.get_feature_names_out())

In [16]:
feature_matrix = pd.concat([feature_matrix, X_train[["CryoSleep", "VIP", "HomePlanet", "Destination", "Deck", "Side", "CabinLocation", "FamilyGroupSize"]].reset_index().drop(columns=["index"], axis=0)], axis=1)

In [17]:
X_train['CabinLocation'], fact_unique = X_train.CabinLocation.factorize()

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
ohc = OneHotEncoder(sparse=False, handle_unknown="ignore")

In [20]:
deck_side_features = ohc.fit_transform(feature_matrix[["HomePlanet", "Destination", "Deck", "Side", "CabinLocation", "FamilyGroupSize"]])

In [21]:
feature_matrix = pd.concat([feature_matrix, pd.DataFrame(deck_side_features, columns=ohc.get_feature_names_out())], axis=1)

In [22]:
feature_matrix.drop(columns=["HomePlanet", "Destination", "Deck", "Side", "CabinLocation", "FamilyGroupSize"], axis=1, inplace=True)

In [23]:
feature_matrix

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Age,CryoSleep,VIP,HomePlanet_Earth,HomePlanet_Europa,...,CabinLocation_T_P,CabinLocation_T_S,FamilyGroupSize_1,FamilyGroupSize_2,FamilyGroupSize_3,FamilyGroupSize_4,FamilyGroupSize_5,FamilyGroupSize_6,FamilyGroupSize_7,FamilyGroupSize_8
0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0.711945,0,0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,-0.334037,0,0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.268001,1.959998,-0.283579,5.695623,-0.219796,2.036857,0,1,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.333105,0.523010,0.336851,2.687176,-0.092818,0.293552,0,0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.125652,-0.237159,-0.031059,0.231374,-0.261240,-0.891895,0,0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,-0.333105,3.992336,-0.283579,1.189173,-0.197751,0.851410,0,1,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8689,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,-0.752431,1,0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8690,-0.333105,-0.281027,2.846999,-0.269737,-0.263003,-0.194573,0,0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8691,-0.333105,0.376365,-0.283579,0.043013,2.589576,0.223820,0,0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Model Training

In [48]:
from sklearn.metrics import accuracy_score

In [24]:
from sklearn.svm import SVC

In [214]:
svm_model = SVC()

In [215]:
from sklearn.model_selection import KFold, cross_val_score

In [216]:
scores = cross_val_score(svm_model, feature_matrix, y_train)

In [217]:
print(scores)

[0.7786084  0.79413456 0.79585969 0.80667434 0.80379747]


In [218]:
import numpy as np

In [219]:
np.mean(scores)

0.7958148903745457

In [220]:
svm_model.fit(feature_matrix, y_train)

In [223]:
svm_params = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [1, 3, 5, 7, 9],
    'gamma': [0.1, 0.3, 0.5, 1],
}

In [224]:
gs_svm = GridSearchCV(svm_model, svm_params, n_jobs=-1, cv=5, scoring='accuracy', verbose=3)

In [225]:
gs_svm.fit(feature_matrix, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [226]:
print("Best Parameters: ", gs_svm.best_params_)
print("Best Accuracy: {:.4f}".format(gs_svm.best_score_))

Best Parameters:  {'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
Best Accuracy: 0.7970


In [227]:
svm_model = gs_svm.best_estimator_

In [228]:
svm_pred = svm_model.predict(feature_matrix)

In [229]:
accuracy_score(y_train, svm_pred)

0.802829863108248

[CV 4/5] END ....................C=1, gamma=0.1;, score=0.795 total time=   1.3s
[CV 2/5] END ....................C=1, gamma=0.5;, score=0.799 total time=   1.3s
[CV 5/5] END ......................C=1, gamma=1;, score=0.797 total time=   1.3s
[CV 3/5] END ....................C=3, gamma=0.3;, score=0.805 total time=   1.6s
[CV 4/5] END ....................C=3, gamma=0.5;, score=0.793 total time=   1.5s
[CV 2/5] END ....................C=5, gamma=0.1;, score=0.798 total time=   1.8s
[CV 5/5] END ....................C=5, gamma=0.3;, score=0.798 total time=   1.9s
[CV 3/5] END ......................C=5, gamma=1;, score=0.806 total time=   1.9s
[CV 2/5] END ....................C=7, gamma=0.3;, score=0.796 total time=   2.2s
[CV 5/5] END ....................C=7, gamma=0.5;, score=0.800 total time=   2.3s
[CV 3/5] END ....................C=9, gamma=0.1;, score=0.804 total time=   2.3s
[CV 2/5] END ....................C=9, gamma=0.5;, score=0.798 total time=   2.3s
[CV 5/5] END ...............

### XGBoost

In [39]:
import xgboost as xgb

In [152]:
# {'colsample_bytree': 0.8, 'gamma': 3, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.5} #Best on test data
clf = xgb.XGBClassifier(objective='binary:logistic', random_state=42, n_jobs=-1, n_estimators=500)

In [153]:
scores = cross_val_score(clf, feature_matrix, y_train, cv=kf)

In [154]:
print(scores)

[0.77170788 0.78550891 0.77573318 0.79631761 0.78481013]


In [155]:
np.mean(scores)

0.7828155408548622

In [156]:
clf.fit(feature_matrix, y_train)

In [157]:
xgb_pred = clf.predict(feature_matrix)

In [158]:
accuracy_score(y_train, xgb_pred)

0.9497296675486023

#### XGBoost Grid Search

In [50]:
from sklearn.model_selection import GridSearchCV

In [51]:
from sklearn.metrics import accuracy_score

In [116]:
# {'colsample_bytree': 0.8, 'gamma': 3, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.5}
xgb_params = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 10],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

In [117]:
gs_xgb = GridSearchCV(clf, xgb_params, n_jobs=-1, cv=5, scoring='accuracy', verbose=3)

In [118]:
gs_xgb.fit(feature_matrix, y_train)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
[CV 5/5] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.5;, score=0.800 total time=   2.4s
[CV 1/5] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=5, subsample=0.5;, score=0.782 total time=   2.3s
[CV 4/5] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=5, subsample=0.8;, score=0.805 total time=   1.9s
[CV 2/5] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=10, subsample=0.5;, score=0.791 total time=   2.2s
[CV 5/5] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=10, subsample=0.8;, score=0.800 total time=   2.0s
[CV 3/5] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=1, subsample=0.5;, score=0.803 total time=   3.6s
[CV 1/5] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_de

In [119]:
print("Best Parameters: ", gs_xgb.best_params_)
print("Best Accuracy: {:.4f}".format(gs_xgb.best_score_))

Best Parameters:  {'colsample_bytree': 0.8, 'gamma': 3, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.5}
Best Accuracy: 0.8043


In [120]:
clf = gs_xgb.best_estimator_

In [121]:
xgb_pred = clf.predict(feature_matrix)

In [122]:
accuracy_score(y_train, xgb_pred)

0.8211204417347291

### Test Data Feature Engineering

In [139]:
test_data = pd.read_csv("test.csv")

In [140]:
pid_df = test_data[["PassengerId"]]

In [141]:
test_data = preprocess(test_data)

In [142]:
test_data["CabinLocation"] = pd.Categorical(test_data.CabinLocation, categories=fact_unique)

In [143]:
test_feature_data = scaler.transform(test_data[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Age"]])
test_feature_matrix = pd.DataFrame(test_feature_data, columns=scaler.get_feature_names_out())
test_feature_matrix = pd.concat([test_feature_matrix, test_data[["CryoSleep", "VIP", "HomePlanet", "Destination", "Deck", "Side", "CabinLocation", "FamilyGroupSize"]].reset_index().drop(columns=["index"], axis=0)], axis=1)
deck_side_features_test = ohc.transform(test_feature_matrix[["HomePlanet", "Destination", "Deck", "Side", "CabinLocation", "FamilyGroupSize"]])
test_feature_matrix = pd.concat([test_feature_matrix, pd.DataFrame(deck_side_features_test, columns=ohc.get_feature_names_out())], axis=1)
test_feature_matrix.drop(columns=["HomePlanet", "Destination", "Deck", "Side", "CabinLocation", "FamilyGroupSize"], axis=1, inplace=True)


#### SVM Output

In [230]:
svm_test_pred = svm_model.predict(test_feature_matrix)

In [231]:
output = pd.concat([pid_df, pd.DataFrame(svm_test_pred, columns=["Transported"])], axis=1)

In [232]:
output.to_csv("output_full_svm_gs.csv", index=False)

#### XGB Output

In [159]:
xgb_test_pred = clf.predict(test_feature_matrix)

In [160]:
output = pd.concat([pid_df, pd.DataFrame(xgb_test_pred, columns=["Transported"])], axis=1)

In [161]:
output['Transported'] = output.Transported.astype(bool)

In [162]:
output.to_csv("output_full_xgb1.csv", index=False)