In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv(r"data\train.csv", header=0,delimiter=",", quoting=1)
# train, val = train_test_split(data)
test = pd.read_csv(r"data\test.csv", header=0,delimiter=",", quoting=1)

In [None]:
data.describe(include="all")

In [None]:
data.info()

In [30]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
Emb              0
FamilySize       0
FamSize          0
AgeBands         0
dtype: int64

In [29]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Title            0
Emb              0
FamilySize       0
FamSize          0
AgeBands         0
dtype: int64

In [None]:
survived = data[data["Survived"]==1]
not_survived = data[data["Survived"]==0]
print(len(survived),len(not_survived),len(data))

In [None]:
data.groupby("Pclass").Survived.mean()

In [18]:
data["Title"]=data.Name.str.extract(r" ([A-z]\w+)\. ")
test["Title"]=test.Name.str.extract(r" ([A-z]\w+)\. ")

In [24]:
test_Titles = test["Title"].value_counts()
data_Titles = data["Title"].value_counts()
data_survprob = data.groupby("Title").Survived.mean()

In [25]:
comp = pd.DataFrame({"Data": data_Titles,"Test":test_Titles, "Survival Rate": data_survprob})
print(comp)

           Data   Test  Survival Rate
Capt        1.0    NaN       0.000000
Col         2.0    2.0       0.500000
Countess    1.0    NaN       1.000000
Don         1.0    NaN       0.000000
Dona        NaN    1.0            NaN
Dr          7.0    1.0       0.428571
Jonkheer    1.0    NaN       0.000000
Lady        1.0    NaN       1.000000
Major       2.0    NaN       0.500000
Master     40.0   21.0       0.575000
Miss      182.0   78.0       0.697802
Mlle        2.0    NaN       1.000000
Mme         1.0    NaN       1.000000
Mr        517.0  240.0       0.156673
Mrs       125.0   72.0       0.792000
Ms          1.0    1.0       1.000000
Rev         6.0    2.0       0.000000
Sir         1.0    NaN       1.000000


In [None]:
pd.crosstab(data["Sex"],[data["Pclass"],data["Survived"]])

In [None]:
data[["Embarked","Survived"]].groupby("Embarked").mean()

In [None]:
data.Embarked.value_counts()

In [None]:
data.groupby("Pclass").Fare.describe(include="all")

In [None]:
test.isnull().sum()
# data.describe(include="all")
# data.info()

In [None]:
agebands = [0,1,2,3,4,5]
data["AgeBands"]=pd.cut(data["Age"],bins=[0,5,18,35,50,65,85],labels=agebands)
# data["AgeBands"]=pd.cut(data["Age"],bins=10)

In [None]:
data[["Survived","AgeBands"]].groupby("AgeBands").mean()

In [23]:
data["Title"]=data.Name.str.extract(r" ([A-z]\w+)\. ")
test["Title"]=test.Name.str.extract(r" ([A-z]\w+)\. ")


In [27]:
# Ms -> Miss, 
Titles = ["Mr", "Miss", "Mrs", "Master", "Dr", "Rev", "Others"]
TitleMap = {Titles[i]:i for i in range(len(Titles))}

In [28]:
train_test_data = [data, test]
for dataset in train_test_data:
    dataset["Sex"] = dataset["Sex"].map({"female":0, "male":1}).astype("category")
    dataset["Embarked"] = dataset.Embarked.fillna("S")
    dataset["Emb"] = dataset["Embarked"].map({"C":1,"Q":2,"S":3}).astype("category")
    dataset["Age"] = dataset["Age"].fillna(30)
    dataset["FamilySize"] = dataset.SibSp + dataset.Parch +1
    famlabels = [0,1,2]
    dataset["FamSize"] = pd.cut(dataset["FamilySize"],bins=[0,1,4,11],labels=famlabels)
    agebands = [0,1,2,3,4,5]
    dataset["AgeBands"]=pd.cut(dataset["Age"],bins=[0,5,18,35,50,65,85],labels=agebands)
    dataset["Pclass"] = dataset["Pclass"].astype("category")
    dataset["Title"] = dataset["Title"].replace(["Ms","Mlle","Mme"],"Miss")
    dataset["Title"] = dataset["Title"].apply(lambda title: title if title in Titles else "Others")
    dataset["Title"] = dataset["Title"].map(TitleMap)
    

In [73]:
# features = ["AgeBands","Sex","Emb","Pclass","FamSize","Title"]
features = ["Age","Emb","Pclass","FamSize","Title"]
data[features].isnull().sum()

Age        0
Emb        0
Pclass     0
FamSize    0
Title      0
dtype: int64

In [74]:
for col in features:
    print(data.groupby(col).Survived.mean())

Age
0.42     1.0
0.67     1.0
0.75     1.0
0.83     1.0
0.92     1.0
        ... 
70.00    0.0
70.50    0.0
71.00    0.0
74.00    0.0
80.00    1.0
Name: Survived, Length: 88, dtype: float64
Emb
1    0.553571
2    0.389610
3    0.339009
Name: Survived, dtype: float64
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
FamSize
0    0.303538
1    0.578767
2    0.161290
Name: Survived, dtype: float64
Title
0    0.156673
1    0.704301
2    0.792000
3    0.575000
4    0.428571
5    0.000000
6    0.500000
Name: Survived, dtype: float64


In [61]:
for feat in features:
    print(data[["Survived",feat]].groupby(feat).mean())

          Survived
AgeBands          
0         0.704545
1         0.410526
2         0.353271
3         0.398693
4         0.375000
5         0.125000
     Survived
Emb          
1    0.553571
2    0.389610
3    0.339009
        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
         Survived
FamSize          
0        0.303538
1        0.578767
2        0.161290
       Survived
Title          
0      0.156673
1      0.704301
2      0.792000
3      0.575000
4      0.428571
5      0.000000
6      0.500000


In [75]:
train_x, val_x, train_y, val_y = train_test_split(data[features],data["Survived"])

In [76]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier().fit(train_x,train_y)
val_acc_knn = round(clf_knn.score(val_x,val_y)*100,2)
train_acc_knn = round(clf_knn.score(train_x,train_y)*100,2)
print(val_acc_knn,train_acc_knn)

78.48 83.38


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [77]:
from sklearn.linear_model import LogisticRegression
clf_logreg = LogisticRegression().fit(train_x,train_y)
val_acc_logreg = round(clf_logreg.score(val_x,val_y)*100,2)
train_acc_logreg = round(clf_logreg.score(train_x,train_y)*100,2)
print(val_acc_logreg,train_acc_logreg)

82.06 74.7


In [78]:
from sklearn.svm import SVC
clf_svc = SVC().fit(train_x,train_y)
val_acc_svc = round(clf_svc.score(val_x,val_y)*100,2)
train_acc_svc = round(clf_svc.score(train_x,train_y)*100,2)
print(val_acc_svc,train_acc_svc)

63.68 62.43


In [79]:
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier().fit(train_x,train_y)
val_acc_dt = round(clf_dt.score(val_x,val_y)*100,2)
train_acc_dt = round(clf_dt.score(train_x,train_y)*100,2)
print(val_acc_dt,train_acc_dt)

78.03 93.26


In [80]:
clf_rf = RandomForestClassifier().fit(train_x,train_y)
val_acc_rf = round(clf_rf.score(val_x,val_y)*100,2)
train_acc_rf = round(clf_rf.score(train_x,train_y)*100,2)
print(val_acc_rf,train_acc_rf)

80.27 93.26


In [81]:
import xgboost as xgb
clf_xgb = xgb.XGBClassifier(tree_method="hist",enable_categorical=True,early_stopping_rounds=5)
clf_xgb.fit(train_x,train_y, eval_set=[(val_x,val_y)])
val_acc_xgb = round(clf_xgb.score(val_x,val_y)*100,2)
train_acc_xgb = round(clf_xgb.score(train_x,train_y)*100,2)
print(val_acc_xgb,train_acc_xgb)


[0]	validation_0-logloss:0.57000
[1]	validation_0-logloss:0.50716
[2]	validation_0-logloss:0.46929
[3]	validation_0-logloss:0.44839
[4]	validation_0-logloss:0.43935
[5]	validation_0-logloss:0.43151


[6]	validation_0-logloss:0.42778
[7]	validation_0-logloss:0.43103
[8]	validation_0-logloss:0.43464
[9]	validation_0-logloss:0.43513
[10]	validation_0-logloss:0.43744
81.61 86.08


In [83]:
from sklearn.ensemble import GradientBoostingClassifier
clf_gbc =GradientBoostingClassifier().fit(data[features],data["Survived"])
# clf_gbc =GradientBoostingClassifier().fit(train_x,train_y)
val_acc_gbc = round(clf_gbc.score(val_x,val_y)*100,2)
train_acc_gbc = round(clf_gbc.score(train_x,train_y)*100,2)
print(val_acc_gbc,train_acc_gbc)

89.24 87.43


In [84]:
# clf_knn = KNeighborsClassifier().fit(data[features],data["Survived"])
# clf = SVC().fit(data[features],data["Survived"])
clf = clf_gbc
result = clf.predict(test[features])

In [85]:
output = pd.DataFrame( data={"PassengerId":test["PassengerId"], "Survived":result} )
output.to_csv("Output\\"+"GCB_1.csv", index=False, quoting=3)

In [57]:
clf = RandomForestClassifier().fit(data[features],data["Survived"])
result = clf.predict(test[features])

In [None]:
train_x.info()

In [None]:
# candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# candidate_estimators = [5,10,25,50,100,250]
# t_b = np.inf
# b_e = 0
# for est in candidate_estimators:
#     m_n = 0
#     m_score = np.inf
#     scores = []
#     for nodes in candidate_max_leaf_nodes:
#         forest = RandomForestClassifier(max_leaf_nodes=nodes, n_estimators=est)
#         forest.fit(x,y)
#         score = forest.score(val_x,val_y)
#         scores.append(score)
#         if score<m_score:
#             m_n = nodes
#             m_score = score
#     if m_score<t_b:
#         t_b = m_score
#         b_e = est
#     print(est," estimators, ",m_n," nodes with score: ", m_score)
#     print(scores)
# print("Best config: ",b_e)

In [None]:
result = clf.predict(test[features])