In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, confusion_matrix
from xgboost import XGBClassifier
import pickle



In [30]:
# Load the CSV into a DataFrame and preview the first rows.
df = pd.read_csv('test.csv')
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [31]:
# Column dtypes and non-null counts for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [32]:
# Number of distinct Cabin values (missing values are not counted by default).
df['Cabin'].nunique()

76

In [33]:
# Rows for passengers younger than 10 (rows with a missing Age never match).
is_under_ten = df['Age'].lt(10)
df.loc[is_under_ten]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
21,913,3,"Olsen, Master. Artur Karl",male,9.0,0,1,C 17368,3.1708,,S
80,972,3,"Boulos, Master. Akar",male,6.0,1,1,2678,15.2458,,C
89,981,2,"Wells, Master. Ralph Lester",male,2.0,1,1,29103,23.0,,S
117,1009,3,"Sandstrom, Miss. Beatrice Irene",female,1.0,1,1,PP 9549,16.7,G6,S
161,1053,3,"Touma, Master. Georges Youssef",male,7.0,1,1,2650,15.2458,,C
194,1086,2,"Drew, Master. Marshall Brines",male,8.0,0,2,28220,32.5,,S
196,1088,1,"Spedden, Master. Robert Douglas",male,6.0,0,2,16966,134.5,E34,C
201,1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S
203,1095,2,"Quick, Miss. Winifred Vera",female,8.0,1,1,26360,26.0,,S
250,1142,2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S


In [34]:
# Remove the Cabin column from the working frame.
df = df.drop(columns='Cabin')

In [35]:
# Count of missing values in each column.
df.isna().sum()

PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Embarked        0
dtype: int64

In [36]:
# Independent copy of df so the name-derived columns added next do not touch it.
df_prefixes = df.copy(deep=True)

In [37]:
# Surname: the text before the first comma in Name.
name_comma_parts = df_prefixes['Name'].str.split(',')
df_prefixes['LastName'] = name_comma_parts.map(lambda parts: parts[0])

In [38]:
# Temporary column: the piece of Name that follows the first ', ' separator
# (it begins with the title, e.g. "Mr. James").
name_pieces = df_prefixes['Name'].str.split(', ')
df_prefixes['NamePrefixRaw'] = name_pieces.map(lambda pieces: pieces[1])

In [39]:
# Title: the text before the first '.' in the temporary NamePrefixRaw column.
raw_prefix_parts = df_prefixes['NamePrefixRaw'].str.split('.')
df_prefixes['NamePrefix'] = raw_prefix_parts.map(lambda parts: parts[0])
# The temporary column is no longer needed once the title is extracted.
del df_prefixes['NamePrefixRaw']

In [40]:
# Display the frame, now carrying the derived LastName and NamePrefix columns.
df_prefixes

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,LastName,NamePrefix
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,Kelly,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,S,Wilkes,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,Myles,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,Wirz,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,Hirvonen,Mrs
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,S,Spector,Mr
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C,Oliva y Ocana,Dona
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,S,Saether,Mr
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,S,Ware,Mr


In [41]:
# Distinct titles extracted from the names.
df_prefixes['NamePrefix'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
      dtype=object)

In [42]:
# pwa_df: passengers whose Age is present.
age_is_known = df_prefixes['Age'].notnull()
pwa_df = df_prefixes.loc[age_is_known].copy()

# Mean age per title among those passengers, as a one-column frame.
pwa_grouped = pwa_df.groupby('NamePrefix')['Age'].mean().to_frame()
pwa_grouped


Unnamed: 0_level_0,Age
NamePrefix,Unnamed: 1_level_1
Col,50.0
Dona,39.0
Dr,53.0
Master,7.406471
Miss,21.774844
Mr,32.0
Mrs,38.903226
Rev,35.5


In [43]:
# pwoa_df: passengers whose Age is missing.
age_is_missing = df_prefixes['Age'].isna()
pwoa_df = df_prefixes.loc[age_is_missing].copy()

# Titles that occur among them.
pwoa_df['NamePrefix'].unique()


array(['Mr', 'Mrs', 'Miss', 'Ms', 'Master'], dtype=object)

In [44]:
# Age assigned to each title for passengers whose Age is missing.
# NOTE(review): these constants differ from the per-title means computed from
# this file (e.g. 'Mr' is 32.0 there), so they presumably come from another
# dataset, possibly the training set. Confirm the source.
age_by_prefix = {
    'Mr': 32.368090,
    'Mrs': 35.898148,
    'Miss': 21.773973,
    'Ms': 28.000000,
    'Master': 4.574167,
}
for prefix, imputed_age in age_by_prefix.items():
    pwoa_df.loc[pwoa_df['NamePrefix'] == prefix, 'Age'] = imputed_age


# Index by PassengerId (the column is consumed by the index).
pwoa_df = pwoa_df.set_index('PassengerId')
pwoa_df

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,LastName,NamePrefix
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
902,3,"Ilieff, Mr. Ylio",male,32.368090,0,0,349220,7.8958,S,Ilieff,Mr
914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,35.898148,0,0,PC 17598,31.6833,S,Flegenheim,Mrs
921,3,"Samaan, Mr. Elias",male,32.368090,2,0,2662,21.6792,C,Samaan,Mr
925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,35.898148,1,2,W./C. 6607,23.4500,S,Johnston,Mrs
928,3,"Roth, Miss. Sarah A",female,21.773973,0,0,342712,8.0500,S,Roth,Miss
...,...,...,...,...,...,...,...,...,...,...,...
1300,3,"Riordan, Miss. Johanna Hannah""""",female,21.773973,0,0,334915,7.7208,Q,Riordan,Miss
1302,3,"Naughton, Miss. Hannah",female,21.773973,0,0,365237,7.7500,Q,Naughton,Miss
1305,3,"Spector, Mr. Woolf",male,32.368090,0,0,A.5. 3236,8.0500,S,Spector,Mr
1308,3,"Ware, Mr. Frederick",male,32.368090,0,0,359309,8.0500,S,Ware,Mr


In [45]:
# Index the known-age subset by PassengerId (the column is consumed by the index).
pwa_df = pwa_df.set_index('PassengerId')
pwa_df

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,LastName,NamePrefix
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,Kelly,Mr
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,S,Wilkes,Mrs
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,Myles,Mr
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,Wirz,Mr
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,Hirvonen,Mrs
...,...,...,...,...,...,...,...,...,...,...,...
1301,3,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.7750,S,Peacock,Miss
1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0000,Q,Minahan,Mrs
1304,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.7750,S,Henriksson,Miss
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C,Oliva y Ocana,Dona


In [46]:
# Stack the known-age and imputed-age subsets back into one frame and order
# the rows by index. pd.concat is used because DataFrame.append was deprecated
# in pandas 1.4 and removed in pandas 2.0.
df_clean = pd.concat([pwa_df, pwoa_df]).sort_index()

In [47]:
# Check dtypes and non-null counts of the recombined frame.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Name        418 non-null    object 
 2   Sex         418 non-null    object 
 3   Age         418 non-null    float64
 4   SibSp       418 non-null    int64  
 5   Parch       418 non-null    int64  
 6   Ticket      418 non-null    object 
 7   Fare        417 non-null    float64
 8   Embarked    418 non-null    object 
 9   LastName    418 non-null    object 
 10  NamePrefix  418 non-null    object 
dtypes: float64(2), int64(3), object(6)
memory usage: 39.2+ KB


In [48]:
# Fill the missing Fare. The fill is restricted to the Fare column so that a
# NaN in any other column (e.g. an Age left unimputed) is not silently
# replaced with a fare-sized number.
# NOTE(review): the origin of this constant is not shown in this notebook;
# presumably a fare statistic computed elsewhere. Confirm.
fare_fill_value = 13.675550101832993
df_clean = df_clean.fillna({'Fare': fare_fill_value})

In [50]:
# Re-check non-null counts after filling.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Name        418 non-null    object 
 2   Sex         418 non-null    object 
 3   Age         418 non-null    float64
 4   SibSp       418 non-null    int64  
 5   Parch       418 non-null    int64  
 6   Ticket      418 non-null    object 
 7   Fare        418 non-null    float64
 8   Embarked    418 non-null    object 
 9   LastName    418 non-null    object 
 10  NamePrefix  418 non-null    object 
dtypes: float64(2), int64(3), object(6)
memory usage: 39.2+ KB


In [51]:
# Remove the free-text columns from the cleaned frame.
text_columns = ['Name', 'Ticket', 'LastName']
df_clean = df_clean.drop(columns=text_columns)

In [52]:
# Confirm which columns remain after the drop.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Sex         418 non-null    object 
 2   Age         418 non-null    float64
 3   SibSp       418 non-null    int64  
 4   Parch       418 non-null    int64  
 5   Fare        418 non-null    float64
 6   Embarked    418 non-null    object 
 7   NamePrefix  418 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 29.4+ KB


In [58]:
# One-hot encode the categorical columns into a dense frame that keeps the
# PassengerId index.
cat_cols_df = df_clean[['Pclass', 'Sex', 'Embarked', 'NamePrefix']].copy()
ohe = OneHotEncoder()
ohe_cat_cols = ohe.fit_transform(cat_cols_df).toarray()

# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2; prefer
# get_feature_names_out and fall back only on versions that predate it.
feature_namer = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
ohe_column_names = feature_namer(list(cat_cols_df.columns))

ohe_cat_cols_df = pd.DataFrame(ohe_cat_cols, columns=ohe_column_names, index=cat_cols_df.index)

# NOTE(review): the encoder is fit on this file's own data, so the column set
# depends on which categories occur here. 'NamePrefix_Dona' is presumably
# dropped to match the columns a model was trained on. Confirm against the
# training pipeline.
ohe_cat_cols_df = ohe_cat_cols_df.drop(columns='NamePrefix_Dona')
ohe_cat_cols_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Pclass_1           418 non-null    float64
 1   Pclass_2           418 non-null    float64
 2   Pclass_3           418 non-null    float64
 3   Sex_female         418 non-null    float64
 4   Sex_male           418 non-null    float64
 5   Embarked_C         418 non-null    float64
 6   Embarked_Q         418 non-null    float64
 7   Embarked_S         418 non-null    float64
 8   NamePrefix_Col     418 non-null    float64
 9   NamePrefix_Dr      418 non-null    float64
 10  NamePrefix_Master  418 non-null    float64
 11  NamePrefix_Miss    418 non-null    float64
 12  NamePrefix_Mr      418 non-null    float64
 13  NamePrefix_Mrs     418 non-null    float64
 14  NamePrefix_Ms      418 non-null    float64
 15  NamePrefix_Rev     418 non-null    float64
dtypes: float64(16)
memory u

In [59]:
# Standardise the numeric columns, i.e. every column not named in cat_cols_list.
# Index.difference returns the remaining names in sorted order and ignores
# listed names that are absent from the frame.
cat_cols_list = ['Pclass', 'Sex', 'Embarked', 'NamePrefix', 'Survived']
numeric_column_names = df_clean.columns.difference(cat_cols_list)
num_cols_df = df_clean[numeric_column_names].copy()

# NOTE(review): the scaler is fit on this file's own data; confirm this matches
# how the features were scaled when the model was trained.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(num_cols_df)
num_cols_df = pd.DataFrame(
    scaled_values,
    index=num_cols_df.index,
    columns=num_cols_df.columns,
)
num_cols_df.head()

Unnamed: 0_level_0,Age,Fare,Parch,SibSp
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
892,0.333744,-0.497374,-0.400248,-0.49947
893,1.294308,-0.512238,-0.400248,0.616992
894,2.446984,-0.464061,-0.400248,-0.49947
895,-0.242594,-0.482436,-0.400248,-0.49947
896,-0.62682,-0.417453,0.619896,0.616992


In [61]:
# Combine the scaled numeric columns with the one-hot columns; join aligns the
# two frames on their index.
df_test_set = num_cols_df.join(ohe_cat_cols_df)
df_test_set.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                418 non-null    float64
 1   Fare               418 non-null    float64
 2   Parch              418 non-null    float64
 3   SibSp              418 non-null    float64
 4   Pclass_1           418 non-null    float64
 5   Pclass_2           418 non-null    float64
 6   Pclass_3           418 non-null    float64
 7   Sex_female         418 non-null    float64
 8   Sex_male           418 non-null    float64
 9   Embarked_C         418 non-null    float64
 10  Embarked_Q         418 non-null    float64
 11  Embarked_S         418 non-null    float64
 12  NamePrefix_Col     418 non-null    float64
 13  NamePrefix_Dr      418 non-null    float64
 14  NamePrefix_Master  418 non-null    float64
 15  NamePrefix_Miss    418 non-null    float64
 16  NamePrefix_Mr      418 

In [62]:
# Write the final feature frame to disk (the index is written as the first column).
df_test_set.to_csv('df_test_set.csv')