# Titanic Dataset

About Dataset: 
Format/columns of dataset

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked

Survival 0 = No , 1 = Yes

sibsp = Number of siblings/spouses

parch = Number of parents/children

embarked = Port of Embarkation : C = Cherbourg, Q = Queenstown, S = Southampton

pclass = socio-economic status : 1st = Upper , 2nd = Middle ,3rd = Lower





In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing Dataset

In [3]:
train = pd.read_csv('drive/My Drive/datasets/titanic_train.txt')
test = pd.read_csv('drive/My Drive/datasets/titanic_test.txt')
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


**Number of people survived/not survived**

In [4]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

**Checking for null values**

In [5]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
train['Ticket'].nunique()

681

In [9]:
train[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [10]:
train[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [11]:
train[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


In [12]:
train[["Embarked", "Survived"]].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


# Data Preprocessing

In [13]:
train_copy = train.copy()
test_copy = test.copy()

In [14]:
# Removing unnecassary columns
train = train.drop(['PassengerId', 'Ticket'], axis = 1)
print(train.columns)

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')


In [15]:
test = test.drop(['PassengerId', 'Ticket',], axis = 1)
print(test.columns)

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')


In [16]:
combine = [train, test]

In [17]:
train['Cabin'].fillna('U', inplace=True)
train['Cabin'] = train['Cabin'].apply(lambda x: x[0])

test['Cabin'].fillna('U', inplace=True)
test['Cabin'] = test['Cabin'].apply(lambda x: x[0])

train['Cabin'].unique()

array(['U', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [18]:
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,U,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,U,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,U,S


In [19]:
for dataset in combine:
  dataset['Cabin'] = dataset['Cabin'].fillna('U')
  dataset['Cabin'] = dataset['Cabin'].apply(lambda x: x[0])
  
pd.crosstab(train['Cabin'], train['Survived'])

Survived,0,1
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8,7
B,12,35
C,24,35
D,8,25
E,8,24
F,5,8
G,2,2
T,1,0
U,481,206


In [20]:
train[['Cabin', 'Survived']].groupby(['Cabin'], as_index = False).mean().sort_values(by = 'Survived', ascending = True)

Unnamed: 0,Cabin,Survived
7,T,0.0
8,U,0.299854
0,A,0.466667
6,G,0.5
2,C,0.59322
5,F,0.615385
1,B,0.744681
4,E,0.75
3,D,0.757576


In [21]:
cabin_mapping = {"T": 0, "U": 1, "A": 2, "G": 3, "C": 4, "F": 5, "B": 6, "E": 7, "D": 8}
for dataset in combine:
    dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)
    dataset['Cabin'] = dataset['Cabin'].fillna(0)

train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,1,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,4,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,1,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,4,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,1,S


In [22]:
for dataset in combine:
    dataset['Title'] = dataset['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())

pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0
Mlle,2,0


In [23]:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    dataset['Title'] = dataset['Title'].replace('Sir', 'Mr')
    dataset['Title'] = dataset['Title'].replace('Dr', 'Mr')
    
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.161905
3,Mrs,0.793651
4,Rare,0.214286
5,the Countess,1.0


In [24]:
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,1,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,4,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,1,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,4,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,1,S,Mr


In [25]:
title_mapping = {"Mrs": 4, "Miss": 3, "Mr": 1, "Master": 2, "Rare": 0}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,1,S,1.0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,4,C,4.0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,1,S,3.0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,4,S,4.0
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,1,S,1.0


In [26]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,1,S,1.0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,4,C,4.0
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,1,S,3.0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,4,S,4.0
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,1,S,1.0


**Resolving Missing Values**

In [27]:
# Imputing Missing Values
train['Age'].fillna(train['Age'].dropna().median(), inplace=True)
test['Age'].fillna(train['Age'].mean(), inplace = True)
test['Fare'].fillna(train['Fare'].dropna().median(), inplace = True)
train['Embarked'].fillna('C', inplace = True)
test['Embarked'].fillna('C', inplace = True)

In [28]:
train['AgeBand'] = pd.cut(train['Age'], 5)
train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.55
1,"(16.336, 32.252]",0.344168
2,"(32.252, 48.168]",0.404255
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


In [29]:
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 4
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 0
    
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,AgeBand
0,0,3,"Braund, Mr. Owen Harris",0,1.0,1,0,7.25,1,S,1.0,"(16.336, 32.252]"
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2.0,1,0,71.2833,4,C,4.0,"(32.252, 48.168]"
2,1,3,"Heikkinen, Miss. Laina",1,1.0,0,0,7.925,1,S,3.0,"(16.336, 32.252]"
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2.0,1,0,53.1,4,S,4.0,"(32.252, 48.168]"
4,0,3,"Allen, Mr. William Henry",0,2.0,0,0,8.05,1,S,1.0,"(32.252, 48.168]"


In [30]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [31]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,AgeBand,FamilySize
0,0,3,"Braund, Mr. Owen Harris",0,1.0,1,0,7.25,1,0,1.0,"(16.336, 32.252]",2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2.0,1,0,71.2833,4,1,4.0,"(32.252, 48.168]",2
2,1,3,"Heikkinen, Miss. Laina",1,1.0,0,0,7.925,1,0,3.0,"(16.336, 32.252]",1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2.0,1,0,53.1,4,0,4.0,"(32.252, 48.168]",2
4,0,3,"Allen, Mr. William Henry",0,2.0,0,0,8.05,1,0,1.0,"(32.252, 48.168]",1


In [32]:
train['FareBand'] = pd.qcut(train['Fare'], 4)
train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


In [33]:
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train = train.drop(['FareBand'], axis=1)
combine = [train, test]  
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,AgeBand,FamilySize
0,0,3,"Braund, Mr. Owen Harris",0,1.0,1,0,0,1,0,1.0,"(16.336, 32.252]",2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2.0,1,0,3,4,1,4.0,"(32.252, 48.168]",2
2,1,3,"Heikkinen, Miss. Laina",1,1.0,0,0,1,1,0,3.0,"(16.336, 32.252]",1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2.0,1,0,3,4,0,4.0,"(32.252, 48.168]",2
4,0,3,"Allen, Mr. William Henry",0,2.0,0,0,1,1,0,1.0,"(32.252, 48.168]",1


**Dropping non relevant columns**

In [34]:
train = train.drop(['AgeBand', 'Name', 'SibSp', 'Parch' ], axis = 1)
test = test.drop(['Name', 'SibSp', 'Parch'], axis = 1)

In [38]:
# splitting the dataset into x(independent variables) and y(dependent variables)
x_train = train.drop('Survived', axis = 1)
y_train = train.Survived

print(x_train.shape)
print(y_train.shape)

(891, 8)
(891,)


In [39]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize
0,0,3,0,1.0,0,1,0,1.0,2
1,1,1,1,2.0,3,4,1,4.0,2
2,1,3,1,1.0,1,1,0,3.0,1
3,1,1,1,2.0,3,4,0,4.0,2
4,0,3,0,2.0,1,1,0,1.0,1


In [40]:
x_test = test
print(x_test.shape)

(418, 8)


**Scaling data**

In [41]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [42]:
from sklearn.decomposition import PCA
pca = PCA(n_components = None)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
print(pca.explained_variance_ratio_)

[0.33922101 0.2096452  0.16874812 0.11269072 0.08894929 0.05149289
 0.02183786 0.00741492]


In [43]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 5)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
print(pca.explained_variance_ratio_)

[0.33922101 0.2096452  0.16874812 0.11269072 0.08894929]


In [44]:
x_train

array([[-1.90386147, -0.16874302,  0.32791026, -0.51782161,  0.22073179],
       [ 3.00663252, -0.13171522, -1.28605086,  0.46436509, -0.04010024],
       [-0.04875718, -1.57226679, -0.77558194, -1.2972408 , -0.11996154],
       ...,
       [ 0.90242887, -1.76222116,  0.44071742, -0.8645806 ,  1.11984568],
       [ 0.21098649,  1.96279915, -1.18041621,  0.85647786,  0.48103077],
       [-2.08836924, -0.54849005, -1.36075869,  2.13493504,  0.23086413]])

# Machine Learning Models

**Logistic Regression**

In [45]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print('Training Accuracy :', model.score(x_train, y_train))

Training Accuracy : 0.8092031425364759


In [46]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

cross_validation = StratifiedKFold(n_splits = 5)
cvs = cross_val_score(estimator = model, X = x_train, y = y_train, cv = cross_validation)
print('Mean Accuracy:', cvs.mean())
print('Mean Standard Deviation: ', cvs.std())

Mean Accuracy: 0.8024731655263324
Mean Standard Deviation:  0.011378470011165464


**AdaBoost Classifier**

In [47]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Training Accuracy :", model.score(x_train, y_train))

Training Accuracy : 0.8406285072951739


In [48]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

cvs = cross_val_score(estimator = model, X = x_train, y = y_train, cv = StratifiedKFold(n_splits = 5))
print('Mean Accuracy:', cvs.mean())
print('Mean Standard Deviation: ', cvs.std())

Mean Accuracy: 0.7912874270290629
Mean Standard Deviation:  0.023769465913831115


**Gradient Boosting**

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Training Accuracy :", model.score(x_train, y_train))

Training Accuracy : 0.8866442199775533


In [50]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

cvs = cross_val_score(estimator = model, X = x_train, y = y_train, cv = StratifiedKFold(n_splits = 5))
print('Mean Accuracy:', cvs.mean())
print('Mean Standard Deviation: ', cvs.std())

Mean Accuracy: 0.8013495700207145
Mean Standard Deviation:  0.015280695999664674


**Decision Tree Classifier**

In [51]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Training Accuracy :", model.score(x_train, y_train))

Training Accuracy : 0.9057239057239057


In [53]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

cvs = cross_val_score(estimator = model, X = x_train, y = y_train, cv = 10)
print('Mean Accuracy:', cvs.mean())
print('Mean Standard Deviation: ', cvs.std())

Mean Accuracy: 0.8036204744069912
Mean Standard Deviation:  0.04458187591737531


**Extra Tree Classifier**

In [54]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Training Accuracy: ", model.score(x_train, y_train))

Training Accuracy:  0.9057239057239057


In [55]:
from sklearn.model_selection import cross_val_score

cvs = cross_val_score(estimator = model, X = x_train, y = y_train, cv = 10)
print('Mean Accuracy:', cvs.mean())
print('Mean Standard Deviation: ', cvs.std())

Mean Accuracy: 0.8125842696629213
Mean Standard Deviation:  0.034009140516660874


**Random Forest Classifier**

In [56]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Training Accuracy :", model.score(x_train, y_train))

Training Accuracy : 0.9057239057239057


In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

cross_validation = StratifiedKFold(n_splits = 5)

param_grid = [{'max_depth': [4, 6, 8], 
                'max_features': ['sqrt', 'auto', 'log2'], 
                'n_estimators': [50, 10],
               'min_samples_split': [2, 3, 10],
               'min_samples_leaf': [1, 3, 10],
               'bootstrap': ['True', 'False']}]

grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv = cross_validation, verbose = 1)
grid_search.fit(x_train, y_train)

print("Best Accuracy: ", grid_search.best_score_)
print("Best Parameters: ", grid_search.best_params_)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best Accuracy:  0.8226790534178645
Best Parameters:  {'bootstrap': 'False', 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 10}


[Parallel(n_jobs=1)]: Done 1620 out of 1620 | elapsed:  1.4min finished


In [58]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(bootstrap = 'True', max_depth = 6, max_features = 'log2', min_samples_leaf = 3, min_samples_split = 10, n_estimators = 10)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Training Accuracy :", model.score(x_train, y_train))

Training Accuracy : 0.8619528619528619


In [59]:
from sklearn.model_selection import cross_val_score

cvs = cross_val_score(estimator = model, X = x_train, y = y_train, cv = 10)
print('Mean Accuracy:', cvs.mean())
print('Mean Standard Deviation: ', cvs.std())

Mean Accuracy: 0.8182022471910113
Mean Standard Deviation:  0.035338105770964356


**XG Boost**

In [60]:
from xgboost.sklearn import XGBClassifier

model = XGBClassifier(max_depth = 4)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Training Accuracy :", model.score(x_train, y_train))

Training Accuracy : 0.8866442199775533


In [61]:
from sklearn.model_selection import cross_val_score

cvs = cross_val_score(estimator = model, X = x_train, y = y_train, cv = 10)
print('Mean Accuracy:', cvs.mean())
print('Mean Standard Deviation: ', cvs.std())

Mean Accuracy: 0.8193133583021222
Mean Standard Deviation:  0.03455711294630714
