## Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv(r"data\train.csv", header=0,delimiter=",", quoting=1)
# train, val = train_test_split(data)
test = pd.read_csv(r"data\test.csv", header=0,delimiter=",", quoting=1)

## Data Exploration

In [None]:
data.describe(include="all")

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
survived = data[data["Survived"]==1]
not_survived = data[data["Survived"]==0]
print(len(survived),len(not_survived),len(data))
print(data.Survived.mean())

In [None]:
data.groupby("Pclass").Survived.mean()

In [None]:
pd.crosstab(data["Sex"],[data["Pclass"],data["Survived"]])

In [None]:
data[["Embarked","Survived"]].groupby("Embarked").mean()

In [None]:
data.Embarked.value_counts()

In [None]:
data.groupby("Pclass").Fare.describe(include="all")

## Title

In [2]:
train_test_data = [data, test]
for dataset in train_test_data:
    dataset["Title"]=dataset.Name.str.extract(r" ([A-z]\w+)\. ")

In [3]:
test_Titles = test["Title"].value_counts()
data_Titles = data["Title"].value_counts()
data_survprob = data.groupby("Title").Survived.mean()

In [4]:
comp = pd.DataFrame({"Data": data_Titles,"Test":test_Titles, "Survival Rate": data_survprob})
print(comp)

           Data   Test  Survival Rate
Capt        1.0    NaN       0.000000
Col         2.0    2.0       0.500000
Countess    1.0    NaN       1.000000
Don         1.0    NaN       0.000000
Dona        NaN    1.0            NaN
Dr          7.0    1.0       0.428571
Jonkheer    1.0    NaN       0.000000
Lady        1.0    NaN       1.000000
Major       2.0    NaN       0.500000
Master     40.0   21.0       0.575000
Miss      182.0   78.0       0.697802
Mlle        2.0    NaN       1.000000
Mme         1.0    NaN       1.000000
Mr        517.0  240.0       0.156673
Mrs       125.0   72.0       0.792000
Ms          1.0    1.0       1.000000
Rev         6.0    2.0       0.000000
Sir         1.0    NaN       1.000000


In [5]:
Titles = ["Mr", "Miss", "Mrs", "Master", "Dr", "Rev", "Others"]
TitleMap = {Titles[i]:i for i in range(len(Titles))}

In [6]:
for dataset in train_test_data:
    dataset["Title"] = dataset["Title"].replace(["Ms","Mlle","Mme"],"Miss")
    dataset["Title"] = dataset["Title"].apply(lambda title: title if title in Titles else "Others")
    dataset["Title"] = dataset["Title"].map(TitleMap)

## Missing Values

Fill missing data - Data Imputation, Drop columns

In [54]:
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Title          0
Emb            0
dtype: int64

In [53]:
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Title          0
Emb            0
dtype: int64

In [9]:
data.groupby("Pclass").Fare.describe(include="all")

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,216.0,84.154687,78.380373,0.0,30.92395,60.2875,93.5,512.3292
2,184.0,20.662183,13.417399,0.0,13.0,14.25,26.0,73.5
3,491.0,13.67555,11.778142,0.0,7.75,8.05,15.5,69.55


In [10]:
test.groupby("Pclass").Fare.describe(include="all")

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,107.0,94.280297,84.435858,0.0,30.1,60.0,134.5,512.3292
2,93.0,22.202104,13.991877,9.6875,13.0,15.75,26.0,73.5
3,217.0,12.459678,10.803698,3.1708,7.75,7.8958,14.4,69.55


In [16]:
test.loc[test.Fare.isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Emb


In [30]:
# Data Cleaning
train_test_data = [data, test]
for dataset in train_test_data:
    # dataset.drop("Cabin",axis=1,inplace=True)
    dataset["Embarked"] = dataset.Embarked.fillna("S")
    dataset["Fare"] = dataset.Fare.fillna(13.0)
    dataset["Emb"] = dataset["Embarked"].map({"C":1,"Q":2,"S":3}).astype("category")
    dataset["Sex"] = dataset["Sex"].map({"female":0, "male":1}).astype("category")

### Interpolate Age

In [31]:
train_test_data = [data, test]
for dataset in train_test_data:
    dataset["Age"] = np.ceil(dataset["Age"])

In [32]:
features = ["Pclass","Sex","SibSp","Parch","Fare","Title","Emb"]

In [33]:
data_null_idx = data.Age.isnull()
test_null_idx = test.Age.isnull()
data_null = data.loc[data_null_idx][features]
data_X = data.loc[~data_null_idx][features]
data_Y = data.loc[~data_null_idx]["Age"]
test_null = test.loc[test_null_idx][features]
test_X = test.loc[~test_null_idx][features]
test_Y = test.loc[~test_null_idx]["Age"]


In [34]:
train_X = pd.concat([data_X,test_X])
train_Y = pd.concat([data_Y,test_Y])

In [44]:
from sklearn.neural_network import MLPRegressor
regr = MLPRegressor(max_iter=1000,early_stopping=True).fit(train_X.values, train_Y.values)

In [51]:
data_age = regr.predict(data_null.values)
test_age = regr.predict(test_null.values)

In [52]:
data.loc[data_null_idx,"Age"] = np.ceil(data_age)
test.loc[test_null_idx,"Age"] = np.ceil(test_age)

## Age Bands

In [None]:
agebands = [0,1,2,3,4,5]
data["AgeBands"]=pd.cut(data["Age"],bins=[0,5,18,35,50,65,85],labels=agebands)
# data["AgeBands"]=pd.cut(data["Age"],bins=10)

In [None]:
data[["Survived","AgeBands"]].groupby("AgeBands").mean()

## Family Size

In [55]:
famlabels = [0,1,2]
train_test_data = [data, test]
for dataset in train_test_data:
    dataset["FamilySize"] = dataset.SibSp + dataset.Parch +1
    dataset["FamSize"] = pd.cut(dataset["FamilySize"],bins=[0,1,4,11],labels=famlabels)

## Final Data Manipulation

In [None]:
Titles = ["Mr", "Miss", "Mrs", "Master", "Dr", "Rev", "Others"]
TitleMap = {Titles[i]:i for i in range(len(Titles))}

In [None]:
famlabels = [0,1,2]
train_test_data = [data, test]
for dataset in train_test_data:
    dataset["Title"] = dataset.Name.str.extract(r" ([A-z]\w+)\. ")
    dataset["Sex"] = dataset["Sex"].map({"female":0, "male":1}).astype("category")
    dataset["Embarked"] = dataset.Embarked.fillna("S")
    dataset["Emb"] = dataset["Embarked"].map({"C":1,"Q":2,"S":3}).astype("category")
    # dataset["Age"] = dataset["Age"].fillna(30)
    dataset["FamilySize"] = dataset.SibSp + dataset.Parch +1
    dataset["FamSize"] = pd.cut(dataset["FamilySize"],bins=[0,1,4,11],labels=famlabels)
    agebands = [0,1,2,3,4,5]
    dataset["AgeBands"]=pd.cut(dataset["Age"],bins=[0,5,18,35,50,65,85],labels=agebands)
    dataset["Pclass"] = dataset["Pclass"].astype("category")
    dataset["Title"] = dataset["Title"].replace(["Ms","Mlle","Mme"],"Miss")
    dataset["Title"] = dataset["Title"].apply(lambda title: title if title in Titles else "Others")
    dataset["Title"] = dataset["Title"].map(TitleMap)
    

In [56]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Emb,FamilySize,FamSize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,S,0,3,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C,2,1,2,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,S,1,3,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,S,2,3,2,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,S,0,3,1,0


In [60]:
features = ["Pclass","Sex","Age","Title","Emb","FamSize"]

In [61]:
# features = ["AgeBands","Sex","Emb","Pclass","FamSize","Title"]
# features = ["Age","Emb","Pclass","FamSize","Title"]
test[features].isnull().sum()

Pclass     0
Sex        0
Age        0
Title      0
Emb        0
FamSize    0
dtype: int64

In [59]:
for col in features:
    print(data.groupby(col).Survived.mean())

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
Sex
0    0.742038
1    0.188908
Name: Survived, dtype: float64
Age
1.0     0.857143
2.0     0.300000
3.0     0.833333
4.0     0.700000
5.0     1.000000
          ...   
66.0    0.000000
70.0    0.000000
71.0    0.000000
74.0    0.000000
80.0    1.000000
Name: Survived, Length: 70, dtype: float64
Title
0    0.156673
1    0.704301
2    0.792000
3    0.575000
4    0.428571
5    0.000000
6    0.500000
Name: Survived, dtype: float64
Emb
1    0.553571
2    0.389610
3    0.339009
Name: Survived, dtype: float64
FamSize
0    0.303538
1    0.578767
2    0.161290
Name: Survived, dtype: float64


## Classification

In [70]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


In [71]:
train_x, val_x, train_y, val_y = train_test_split(data[features],data["Survived"])

In [72]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
clf_knn.fit(train_x,train_y)
val_acc_knn = round(clf_knn.score(val_x,val_y)*100,2)
train_acc_knn = round(clf_knn.score(train_x,train_y)*100,2)
print(val_acc_knn,train_acc_knn)

80.72 86.38


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [73]:
from sklearn.linear_model import LogisticRegression
clf_logreg = make_pipeline(StandardScaler(), LogisticRegression())
clf_logreg.fit(train_x,train_y)
val_acc_logreg = round(clf_logreg.score(val_x,val_y)*100,2)
train_acc_logreg = round(clf_logreg.score(train_x,train_y)*100,2)
print(val_acc_logreg,train_acc_logreg)

78.92 80.99


In [74]:
from sklearn.svm import SVC
clf_svc = make_pipeline(StandardScaler(),SVC())
clf_svc.fit(train_x,train_y)
val_acc_svc = round(clf_svc.score(val_x,val_y)*100,2)
train_acc_svc = round(clf_svc.score(train_x,train_y)*100,2)
print(val_acc_svc,train_acc_svc)

82.06 84.88


In [75]:
from sklearn.tree import DecisionTreeClassifier
clf_dt = make_pipeline(StandardScaler(),DecisionTreeClassifier())
clf_dt.fit(train_x,train_y)
val_acc_dt = round(clf_dt.score(val_x,val_y)*100,2)
train_acc_dt = round(clf_dt.score(train_x,train_y)*100,2)
print(val_acc_dt,train_acc_dt)

78.92 94.16


In [76]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = make_pipeline(StandardScaler(),RandomForestClassifier())
clf_rf.fit(train_x,train_y)
val_acc_rf = round(clf_rf.score(val_x,val_y)*100,2)
train_acc_rf = round(clf_rf.score(train_x,train_y)*100,2)
print(val_acc_rf,train_acc_rf)

79.82 94.16


In [84]:
import xgboost as xgb
clf_xgb = xgb.XGBClassifier(tree_method="hist",enable_categorical=True,early_stopping_rounds=10)
clf_xgb.fit(train_x,train_y, eval_set=[(val_x,val_y)])
val_acc_xgb = round(clf_xgb.score(val_x,val_y)*100,2)
train_acc_xgb = round(clf_xgb.score(train_x,train_y)*100,2)
print(val_acc_xgb,train_acc_xgb)


[0]	validation_0-logloss:0.57761
[1]	validation_0-logloss:0.52150
[2]	validation_0-logloss:0.49130
[3]	validation_0-logloss:0.46857
[4]	validation_0-logloss:0.46181
[5]	validation_0-logloss:0.45684
[6]	validation_0-logloss:0.45868
[7]	validation_0-logloss:0.46192
[8]	validation_0-logloss:0.46842
[9]	validation_0-logloss:0.47130
[10]	validation_0-logloss:0.47535
[11]	validation_0-logloss:0.47626
[12]	validation_0-logloss:0.47677
[13]	validation_0-logloss:0.47952
[14]	validation_0-logloss:0.47591
[15]	validation_0-logloss:0.47216
80.72 87.57


In [89]:
from sklearn.ensemble import GradientBoostingClassifier
clf_gbc =make_pipeline(StandardScaler(),GradientBoostingClassifier())
clf_gbc.fit(data[features],data["Survived"])
# clf_gbc.fit(train_x,train_y)
# clf_gbc =GradientBoostingClassifier().fit(train_x,train_y)
val_acc_gbc = round(clf_gbc.score(val_x,val_y)*100,2)
train_acc_gbc = round(clf_gbc.score(train_x,train_y)*100,2)
print(val_acc_gbc,train_acc_gbc)

85.65 88.32


In [93]:
# from sklearn.ensemble import GradientBoostingClassifier
# clf_gbc =GradientBoostingClassifier()
# # clf_gbc.fit(data[features],data["Survived"])
# clf_gbc.fit(train_x,train_y)
# # clf_gbc =GradientBoostingClassifier().fit(train_x,train_y)
# val_acc_gbc = round(clf_gbc.score(val_x,val_y)*100,2)
# train_acc_gbc = round(clf_gbc.score(train_x,train_y)*100,2)
# print(val_acc_gbc,train_acc_gbc)

80.27 89.52


## Output Result

In [90]:
# clf_knn = KNeighborsClassifier().fit(data[features],data["Survived"])
# clf = SVC().fit(data[features],data["Survived"])
# clf = clf_rf.fit(data[features],data["Survived"])
clf = clf_gbc
result = clf.predict(test[features])

In [91]:
output = pd.DataFrame( data={"PassengerId":test["PassengerId"], "Survived":result} )
output.to_csv("Output\\New\\"+"GBC.csv", index=False, quoting=3)

In [None]:
clf = RandomForestClassifier().fit(data[features],data["Survived"])
result = clf.predict(test[features])

In [None]:
train_x.info()

In [None]:
# candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# candidate_estimators = [5,10,25,50,100,250]
# t_b = np.inf
# b_e = 0
# for est in candidate_estimators:
#     m_n = 0
#     m_score = np.inf
#     scores = []
#     for nodes in candidate_max_leaf_nodes:
#         forest = RandomForestClassifier(max_leaf_nodes=nodes, n_estimators=est)
#         forest.fit(x,y)
#         score = forest.score(val_x,val_y)
#         scores.append(score)
#         if score<m_score:
#             m_n = nodes
#             m_score = score
#     if m_score<t_b:
#         t_b = m_score
#         b_e = est
#     print(est," estimators, ",m_n," nodes with score: ", m_score)
#     print(scores)
# print("Best config: ",b_e)

In [None]:
result = clf.predict(test[features])