In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# printing some random rows
# df.sample(5)

In [4]:
# df.info()
# df.describe()
df.describe(include=object)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Fynney, Mr. Joseph J",male,347082,C23 C25 C27,S
freq,1,577,7,4,644


In [5]:
tm = df.groupby("Survived").mean()

In [6]:
tm

Unnamed: 0_level_0,PassengerId,Pclass,Age,SibSp,Parch,Fare
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,447.016393,2.531876,30.626179,0.553734,0.32969,22.117887
1,444.368421,1.950292,28.34369,0.473684,0.464912,48.395408


In [7]:
v1 = tm.iloc[0,:]
v2 = tm.iloc[1,:]
abs((v2-v1)/(v1+v2))

PassengerId    0.002971
Pclass         0.129755
Age            0.038706
SibSp          0.077914
Parch          0.170176
Fare           0.372661
dtype: float64

In [8]:
df = df.drop(["PassengerId", "Name", "Ticket"], axis='columns')

In [9]:
#df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [10]:
# Handling categorical values
# df['Sex'] = df['Sex'].replace(["male","female"],[0,1])
df['Cabin'] = df['Cabin'].isna()

In [11]:
# df.info()
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,True,S
1,1,1,female,38.0,1,0,71.2833,False,C
2,1,3,female,26.0,0,0,7.925,True,S
3,1,1,female,35.0,1,0,53.1,False,S
4,0,3,male,35.0,0,0,8.05,True,S


In [12]:
# One hot encoding for Embarked class
df = pd.get_dummies(df)

In [13]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,True,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,False,1,0,1,0,0
2,1,3,26.0,0,0,7.925,True,1,0,0,0,1
3,1,1,35.0,1,0,53.1,False,1,0,0,0,1
4,0,3,35.0,0,0,8.05,True,0,1,0,0,1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Age           714 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Cabin         891 non-null bool
Sex_female    891 non-null uint8
Sex_male      891 non-null uint8
Embarked_C    891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: bool(1), float64(2), int64(4), uint8(5)
memory usage: 47.1 KB


In [15]:
tm = df.groupby("Survived").mean()
v1 = tm.iloc[0,:]
v2 = tm.iloc[1,:]
abs((v2-v1)/(v1+v2))

Pclass        0.129755
Age           0.038706
SibSp         0.077914
Parch         0.170176
Fare          0.372661
Cabin         0.185190
Sex_female    0.643977
Sex_male      0.455736
Embarked_C    0.331221
Embarked_Q    0.012168
Embarked_S    0.101449
dtype: float64

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [17]:
df['Age'][df['Age'].isna()] = df['Age'].median()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Cabin         891 non-null bool
Sex_female    891 non-null uint8
Sex_male      891 non-null uint8
Embarked_C    891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: bool(1), float64(2), int64(4), uint8(5)
memory usage: 47.1 KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
y = df['Survived']
df.drop('Survived', axis='columns',inplace=True)
x = df

In [87]:
# model = LogisticRegression()
# model = RandomForestClassifier()
model = XGBClassifier( max_depth=4,min_child_weight=3,gamma=0.1,subsample=0.86,
                     colsample_bytree = 0.75, reg_alpha=0)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 11)

model.fit(x_train,y_train)

print(model.score(x_test, y_test))

y_pred = model.predict(x_test)
print("Survived     : ", sum(y_pred == 1))
print("Not Survived : ", sum(y_pred == 0))
print("Accuracy : ", (accuracy_score(y_test, y_pred))*100)
#print("\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

0.8770949720670391
Survived     :  57
Not Survived :  122
Accuracy :  87.70949720670392
[[109   9]
 [ 13  48]]


In [55]:
# # HyperParameter tunning
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 1, stop = 200, num = 20)]

# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']

# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 50, num = 11)]
# max_depth.append(None)

# # Minimum number of samples required to split a node
# min_samples_split = [2,3,5,8,10,12,15,17,20]

# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4,5,6,8,9,10]

# # Method of selecting samples for training each tree
# bootstrap = [True, False]


# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# print(random_grid)

In [56]:
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 4, verbose=2, random_state=42, n_jobs = -1)

# # Fit the random search model
# x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)

# rf_random.fit(x_train,y_train)

# print(rf_random.score(x_test, y_test))

# y_pred = rf_random.predict(x_test)
# print("Survived     : ", sum(y_pred == 1))
# print("Not Survived : ", sum(y_pred == 0))
# print("Accuracy : ", (accuracy_score(y_test, y_pred))*100)
#print("\n")
# cm = confusion_matrix(y_test, y_pred)
# print(cm)

In [58]:
# rf_random.best_params_

In [66]:
## Hyperparameter tunnig for xgboost using GridSearchCV
model = XGBClassifier()

random_grid = {'eta': [x for x in np.linspace(start = 0, stop = 0.4, num = 10)]}

rf_grid = GridSearchCV(estimator=model, param_grid=random_grid, cv=5)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 11)

rf_grid.fit(x_train,y_train)

print(rf_grid.score(x_test, y_test))

y_pred = rf_grid.predict(x_test)
print("Survived     : ", sum(y_pred == 1))
print("Not Survived : ", sum(y_pred == 0))
print("Accuracy : ", (accuracy_score(y_test, y_pred))*100)
#print("\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("\n")
rf_grid.best_params_

0.8715083798882681
Survived     :  54
Not Survived :  125
Accuracy :  87.15083798882681
[[110   8]
 [ 15  46]]




{'eta': 0.0}

In [72]:
model = XGBClassifier(eta=0)

random_grid = {'max_depth':range(1,10,1)}

rf_grid = GridSearchCV(estimator=model, param_grid=random_grid, cv=5)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 11)

rf_grid.fit(x_train,y_train)

print(rf_grid.score(x_test, y_test))

y_pred = rf_grid.predict(x_test)
print("Survived     : ", sum(y_pred == 1))
print("Not Survived : ", sum(y_pred == 0))
print("Accuracy : ", (accuracy_score(y_test, y_pred))*100)
#print("\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("\n")
rf_grid.best_params_

0.8659217877094972
Survived     :  55
Not Survived :  124
Accuracy :  86.59217877094973
[[109   9]
 [ 15  46]]




{'max_depth': 4}

In [73]:
model = XGBClassifier(eta=0, max_depth=4)

random_grid = {'min_child_weight':range(1,13,2)}

rf_grid = GridSearchCV(estimator=model, param_grid=random_grid, cv=5)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 11)

rf_grid.fit(x_train,y_train)

print(rf_grid.score(x_test, y_test))

y_pred = rf_grid.predict(x_test)
print("Survived     : ", sum(y_pred == 1))
print("Not Survived : ", sum(y_pred == 0))
print("Accuracy : ", (accuracy_score(y_test, y_pred))*100)
#print("\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("\n")
rf_grid.best_params_

0.8659217877094972
Survived     :  55
Not Survived :  124
Accuracy :  86.59217877094973
[[109   9]
 [ 15  46]]




{'min_child_weight': 5}

In [76]:
model = XGBClassifier(eta=0, max_depth=4,min_child_weight=5)

random_grid = {'gamma':[i/10.0 for i in range(0, 5)]}

rf_grid = GridSearchCV(estimator=model, param_grid=random_grid, cv=5)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 11)

rf_grid.fit(x_train,y_train)

print(rf_grid.score(x_test, y_test))

y_pred = rf_grid.predict(x_test)
print("Survived     : ", sum(y_pred == 1))
print("Not Survived : ", sum(y_pred == 0))
print("Accuracy : ", (accuracy_score(y_test, y_pred))*100)
#print("\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("\n")
rf_grid.best_params_

0.8659217877094972
Survived     :  55
Not Survived :  124
Accuracy :  86.59217877094973
[[109   9]
 [ 15  46]]




{'gamma': 0.1}

In [82]:
model = XGBClassifier(eta=0, max_depth=4,min_child_weight=5,gamma=0.1,subsample=0.8,
                     colsample_bytree = 0.75, reg_alpha=0)

# random_grid = {'reg_alpha':[0,0.001,0.005,0.01,0.05]}

# rf_grid = GridSearchCV(estimator=model, param_grid=random_grid, cv=5)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 11)

rf_grid.fit(x_train,y_train)

print(rf_grid.score(x_test, y_test))

y_pred = rf_grid.predict(x_test)
print("Survived     : ", sum(y_pred == 1))
print("Not Survived : ", sum(y_pred == 0))
print("Accuracy : ", (accuracy_score(y_test, y_pred))*100)
#print("\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("\n")
# rf_grid.best_params_

0.8770949720670391
Survived     :  57
Not Survived :  122
Accuracy :  87.70949720670392
[[109   9]
 [ 13  48]]




In [83]:
model.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75, eta=0, gamma=0.1,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)>