In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_title = []
for name in train['Name']:
    train_title.append((name[name.find(',')+1:name.find('.')+1]).strip())

test_title = []
for name in test['Name']:
    test_title.append((name[name.find(',')+1:name.find('.')+1]).strip())

In [6]:
train['title'] = train_title
test['title'] = test_title

In [7]:
train['title'].value_counts()

Mr.              517
Miss.            182
Mrs.             125
Master.           40
Dr.                7
Rev.               6
Mlle.              2
Major.             2
Col.               2
the Countess.      1
Capt.              1
Ms.                1
Sir.               1
Lady.              1
Mme.               1
Don.               1
Jonkheer.          1
Name: title, dtype: int64

In [8]:
test['title'].value_counts()

Mr.        240
Miss.       78
Mrs.        72
Master.     21
Col.         2
Rev.         2
Ms.          1
Dr.          1
Dona.        1
Name: title, dtype: int64

In [9]:
def Name_Title_Code(x):
    if (x == 'Mr.') or (x == 'Col.') or (x == 'Capt.') or (x == 'Major.'):
        return 1
    if (x == 'Mrs.') or (x=='Ms.') or (x=='Lady.') or (x == 'Mlle.') or (x =='Mme'):
        return 2
    if x == 'Miss.':
        return 3
    if x == 'Master.':
        return 4
    if x == 'Rev.':
        return 5
    return 6

In [10]:
train['title'] = train['title'].apply(Name_Title_Code)
test['title'] = test['title'].apply(Name_Title_Code)

In [11]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [12]:
test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2


In [13]:
PassengerId = test['PassengerId'].values

In [14]:
train.drop(['PassengerId', 'Name'], axis=1,inplace=True)
test.drop(['PassengerId', 'Name'],axis=1, inplace=True)

In [15]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S,1
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3
3,1,1,female,35.0,1,0,113803,53.1,C123,S,2
4,0,3,male,35.0,0,0,373450,8.05,,S,1


In [16]:
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
title         0
dtype: int64

In [17]:
test.isnull().sum()

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
title         0
dtype: int64

# Fare column

In [18]:
test[test["Fare"].isnull()]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
152,3,male,60.5,0,0,3701,,,S,1


In [19]:
train[(train['Sex']=='male')&(train['Pclass']==3)&(train['SibSp']==0)&(train['Parch']==0)&(train['title']==1)&(train['Embarked']=="S")]['Fare'].median()

7.8958

In [20]:
test[(test['Sex']=='male')&(test['Pclass']==3)&(test['SibSp']==0)&(test['Parch']==0)&(test['title']==1)&(test['Embarked']=="S")]['Fare'].median()

7.8958

In [21]:
test['Fare'].fillna(7.8958, inplace=True)

In [22]:
test['Fare'].isnull().sum()

0

# Embarked column

In [23]:
train[train["Embarked"].isnull()]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
61,1,1,female,38.0,0,0,113572,80.0,B28,,3
829,1,1,female,62.0,0,0,113572,80.0,B28,,2


In [24]:
train['Embarked'].fillna("S", inplace=True)

In [25]:
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      0
title         0
dtype: int64

In [26]:
test.isnull().sum()

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       327
Embarked      0
title         0
dtype: int64

In [27]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S,1
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3
3,1,1,female,35.0,1,0,113803,53.1,C123,S,2
4,0,3,male,35.0,0,0,373450,8.05,,S,1


# Age column

In [28]:
train.groupby(['title'])['Age'].median()

title
1    30.0
2    35.0
3    21.0
4     3.5
5    46.5
6    40.0
Name: Age, dtype: float64

In [29]:
train.groupby(['title'])['Age'].mean()

title
1    32.668734
2    35.723214
3    21.773973
4     4.574167
5    43.166667
6    39.636364
Name: Age, dtype: float64

In [30]:
test.groupby(['title'])['Age'].median()

title
1    29.0
2    36.5
3    22.0
4     7.0
5    35.5
6    46.0
Name: Age, dtype: float64

In [31]:
test.groupby(['title'])['Age'].mean()

title
1    32.194595
2    38.903226
3    21.774844
4     7.406471
5    35.500000
6    46.000000
Name: Age, dtype: float64

In [32]:
train.loc[((train['title']==1)&(train['Age'].isna())),'Age']=30.0
train.loc[((train['title']==2)&(train['Age'].isna())),'Age']=35.0
train.loc[((train['title']==3)&(train['Age'].isna())),'Age']=21.0
train.loc[((train['title']==4)&(train['Age'].isna())),'Age']=5
train.loc[((train['title']==5)&(train['Age'].isna())),'Age']=40.5
train.loc[((train['title']==6)&(train['Age'].isna())),'Age']=42.0

In [33]:
test.loc[((test['title']==1)&(test['Age'].isna())),'Age']=30.0
test.loc[((test['title']==2)&(test['Age'].isna())),'Age']=35.0
test.loc[((test['title']==3)&(test['Age'].isna())),'Age']=21.0
test.loc[((test['title']==4)&(test['Age'].isna())),'Age']=5
test.loc[((test['title']==5)&(test['Age'].isna())),'Age']=40.5
test.loc[((test['title']==6)&(test['Age'].isna())),'Age']=42.0

# Cabin Column

In [34]:
train['Cabin'] = train['Cabin'].str[0]

In [35]:
test['Cabin'] = test['Cabin'].str[0]

In [36]:
train['Cabin'].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64

In [37]:
test['Cabin'].value_counts()

C    35
B    18
D    13
E     9
F     8
A     7
G     1
Name: Cabin, dtype: int64

In [38]:
train.loc[train['Cabin'].isnull(), 'has_cabin'] = 0
train.loc[train['Cabin'].notnull(), 'has_cabin'] = 1

In [39]:
test.loc[test['Cabin'].isnull(), 'has_cabin'] = 0
test.loc[test['Cabin'].notnull(), 'has_cabin'] = 1

In [40]:
train['Cabin'].fillna('Missing', inplace=True)
test['Cabin'].fillna('Missing', inplace=True)

In [41]:
train.groupby('Cabin')['Survived'].count()

Cabin
A           15
B           47
C           59
D           33
E           32
F           13
G            4
Missing    687
T            1
Name: Survived, dtype: int64

In [42]:
def Cabin_First_Letter_Code(x):
    if x == 'Missing':
        return 1
    if x == 'B':
        return 2
    if x == 'C':
        return 3
    if x == 'D':
        return 4  
    if x == 'E':
        return 5
    if x == 'F':
        return 6
    if x == 'A':
        return 7
    return 8

In [43]:
train['Cabin'] = train['Cabin'].apply(Cabin_First_Letter_Code)
test['Cabin'] =  test['Cabin'].apply(Cabin_First_Letter_Code)

# Family Column

In [44]:
train['Family'] = train['SibSp'] + train['Parch']
test['Family'] = test['SibSp'] + test['Parch']

In [45]:
def Family_size(number):
    if number==1:
        return "Alone"
    elif number>1 and number<5:
        return "Medium"
    else:
        return 'Large'

In [46]:
train['Family_type'] = train['Family'].apply(Family_size)
test['Family_type'] = test['Family'].apply(Family_size)

In [47]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,has_cabin,Family,Family_type
0,0,3,male,22.0,1,0,A/5 21171,7.25,1,S,1,0.0,1,Alone
1,1,1,female,38.0,1,0,PC 17599,71.2833,3,C,2,1.0,1,Alone
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,1,S,3,0.0,0,Large
3,1,1,female,35.0,1,0,113803,53.1,3,S,2,1.0,1,Alone
4,0,3,male,35.0,0,0,373450,8.05,1,S,1,0.0,0,Large


In [48]:
train.drop(columns=['SibSp', 'Parch', 'Ticket', 'Family'], inplace=True)
test.drop(columns=['SibSp', 'Parch', 'Ticket', 'Family'], inplace=True)

In [49]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,title,has_cabin,Family_type
0,0,3,male,22.0,7.25,1,S,1,0.0,Alone
1,1,1,female,38.0,71.2833,3,C,2,1.0,Alone
2,1,3,female,26.0,7.925,1,S,3,0.0,Large
3,1,1,female,35.0,53.1,3,S,2,1.0,Alone
4,0,3,male,35.0,8.05,1,S,1,0.0,Large


In [50]:
test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,title,has_cabin,Family_type
0,3,male,34.5,7.8292,1,Q,1,0.0,Large
1,3,female,47.0,7.0,1,S,2,0.0,Alone
2,2,male,62.0,9.6875,1,Q,1,0.0,Large
3,3,male,27.0,8.6625,1,S,1,0.0,Large
4,3,female,22.0,12.2875,1,S,2,0.0,Medium


In [51]:
train = pd.get_dummies(train, columns = ['Pclass', 'Sex', 'Cabin', 'Embarked', 'title', 'Family_type'], drop_first=True)
test = pd.get_dummies(test, columns = ['Pclass', 'Sex', 'Cabin', 'Embarked', 'title', 'Family_type'], drop_first=True)

In [52]:
train.columns

Index(['Survived', 'Age', 'Fare', 'has_cabin', 'Pclass_2', 'Pclass_3',
       'Sex_male', 'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6',
       'Cabin_7', 'Cabin_8', 'Embarked_Q', 'Embarked_S', 'title_2', 'title_3',
       'title_4', 'title_5', 'title_6', 'Family_type_Large',
       'Family_type_Medium'],
      dtype='object')

In [53]:
test.columns

Index(['Age', 'Fare', 'has_cabin', 'Pclass_2', 'Pclass_3', 'Sex_male',
       'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Cabin_7',
       'Cabin_8', 'Embarked_Q', 'Embarked_S', 'title_2', 'title_3', 'title_4',
       'title_5', 'title_6', 'Family_type_Large', 'Family_type_Medium'],
      dtype='object')

In [54]:
train.shape, test.shape

((891, 23), (418, 22))

In [55]:
from sklearn.preprocessing import StandardScaler

In [56]:
scalar = StandardScaler()

In [57]:
scalar.fit(train[['Age', 'Fare']])

StandardScaler()

In [58]:
train[['Age', 'Fare']] = scalar.transform(train[['Age', 'Fare']])
test[['Age', 'Fare']] = scalar.transform(test[['Age', 'Fare']])

In [59]:
train.head()

Unnamed: 0,Survived,Age,Fare,has_cabin,Pclass_2,Pclass_3,Sex_male,Cabin_2,Cabin_3,Cabin_4,...,Cabin_8,Embarked_Q,Embarked_S,title_2,title_3,title_4,title_5,title_6,Family_type_Large,Family_type_Medium
0,0,-0.558323,-0.502445,0.0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,0.650188,0.786845,1.0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,1,-0.256196,-0.488854,0.0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
3,1,0.423592,0.42073,1.0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
4,0,0.423592,-0.486337,0.0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [60]:
test.head()

Unnamed: 0,Age,Fare,has_cabin,Pclass_2,Pclass_3,Sex_male,Cabin_2,Cabin_3,Cabin_4,Cabin_5,...,Cabin_8,Embarked_Q,Embarked_S,title_2,title_3,title_4,title_5,title_6,Family_type_Large,Family_type_Medium
0,0.385826,-0.490783,0.0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1.329975,-0.507479,0.0,0,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
2,2.462954,-0.453367,0.0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,-0.180664,-0.474005,0.0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,-0.558323,-0.401017,0.0,0,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1


In [61]:
X_train, X_test, y_train, y_test = train_test_split(train.iloc[:, 1:], train.iloc[:, 0], test_size=0.2)

# RandomForestClassifier

In [62]:
rf = RandomForestClassifier()

In [63]:
cross_val_score(rf, X_train, y_train,cv =10, scoring='accuracy').mean()

0.8033450704225352

# LogisticRegressionClassifier

In [64]:
lr = LogisticRegression()

In [65]:
cross_val_score(lr, X_train, y_train,cv =10, scoring='accuracy').mean()

0.8188575899843504

# DecisionTreeClassifier

In [66]:
dt = DecisionTreeClassifier()

In [67]:
cross_val_score(dt, X_train, y_train,cv =10, scoring='accuracy').mean()

0.7710289514866979

# SupportVectorClassifier

In [68]:
svc = SVC()

In [69]:
cross_val_score(svc, X_train, y_train,cv =10, scoring='accuracy').mean()

0.8371674491392801

# KneaighourClassifiers

In [70]:
knn = KNeighborsClassifier()

In [71]:
cross_val_score(knn, X_train, y_train,cv =10, scoring='accuracy').mean()

0.8203442879499218

# Voting Ensemble Techniqe

In [72]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier

In [73]:
estimators = [
    ('lr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('svc', SVC())
]

In [74]:
vc = VotingClassifier(estimators=estimators)

In [75]:
vc.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [76]:
y_pred = vc.predict(X_test)

In [77]:
accuracy_score(y_test, y_pred)

0.776536312849162

# Bagging Ensebmble Techniqe

In [78]:
from sklearn.ensemble import BaggingClassifier

In [79]:
bagg = BaggingClassifier(
     base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True
)

In [80]:
score = cross_val_score(bagg, X_train, y_train,cv =10, scoring='accuracy')

In [81]:
score.mean()

0.8329225352112676

In [123]:
score

array([0.83333333, 0.79166667, 0.84507042, 0.77464789, 0.87323944,
       0.87323944, 0.83098592, 0.84507042, 0.81690141, 0.84507042])

# GridSearchCv

In [82]:
param_grid = {
    'n_estimators':[50,100,500],
    'max_samples':[0.25,0.3,0.4,0.5],
    'max_features':[0.2,0.4,0.5]
}

In [83]:
search = GridSearchCV(BaggingClassifier(n_jobs=-1), param_grid=param_grid,cv=5)

In [84]:
search.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=BaggingClassifier(n_jobs=-1),
             param_grid={'max_features': [0.2, 0.4, 0.5],
                         'max_samples': [0.25, 0.3, 0.4, 0.5],
                         'n_estimators': [50, 100, 500]})

In [85]:
y_pred = search.predict(X_test)

In [86]:
accuracy_score(y_test, y_pred)

0.7988826815642458

In [87]:
search.best_estimator_

BaggingClassifier(max_features=0.4, max_samples=0.25, n_estimators=100,
                  n_jobs=-1)

# Boosting Techniques

# 1.AdaBoost

In [88]:
from sklearn.ensemble import AdaBoostClassifier

In [89]:
param_grid = {
    "n_estimators":[25, 40, 50, 75, 80, 100],
    "learning_rate":[0.1, 0.2, 0.5, 0.75, 1.0],
    "algorithm":["SAMME", "SAMME.R"],
}

In [90]:
grid = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=param_grid, cv=5)

In [91]:
grid.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'learning_rate': [0.1, 0.2, 0.5, 0.75, 1.0],
                         'n_estimators': [25, 40, 50, 75, 80, 100]})

In [92]:
pred = grid.predict(X_test)

In [93]:
accuracy_score(y_test,pred)

0.7877094972067039

In [94]:
grid.best_estimator_

AdaBoostClassifier(n_estimators=40)

In [95]:
grid.best_params_

{'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 40}

# 2. GradientBoosting

In [96]:
from sklearn.ensemble import GradientBoostingClassifier

In [97]:
param_grid = {
    "n_estimators":[25, 20, 40, 50,  70, 75, 80, 100],
    "learning_rate":[0.1, 0.2, 0.45, 0.5, 0.75, 1.0],
    "max_leaf_nodes":[8, 10, 20, 25, 30],
}

In [98]:
Gradient_grid = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid, cv=5)

In [99]:
Gradient_grid.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.1, 0.2, 0.45, 0.5, 0.75, 1.0],
                         'max_leaf_nodes': [8, 10, 20, 25, 30],
                         'n_estimators': [25, 20, 40, 50, 70, 75, 80, 100]})

In [100]:
prediction = Gradient_grid.predict(X_test)

In [101]:
accuracy_score(y_test, prediction)

0.8100558659217877

In [102]:
Gradient_grid.best_params_

{'learning_rate': 0.45, 'max_leaf_nodes': 8, 'n_estimators': 40}

# XGBoost Classifier

In [103]:
from xgboost import XGBClassifier

In [104]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [105]:
clf = XGBClassifier()

In [106]:
clf.fit(X_train , y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [107]:
y_prediction = clf.predict(X_test)

In [108]:
accuracy_score(y_test, y_prediction)

0.8044692737430168

In [109]:
XGBClassifier()

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=None, verbosity=None)

In [112]:
parm_grid = {
    "n_estimators":[200,300, 500, 750, 1000 ],
    "learning_rate":[0.01, 0.015, 0.20, 0.25, 0.5, 0.75, 1.0],
    'gamma':[0,0.1,0.2,0.5]
    }

In [113]:
xg_grid = GridSearchCV(estimator=XGBClassifier(), param_grid=parm_grid, cv=10)

In [114]:
xg_grid.fit(X_train, y_train)









































































































GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weigh

In [115]:
pred2 = xg_grid.predict(X_test)

In [116]:
accuracy_score(y_test, pred2)

0.8044692737430168

In [117]:
xg_grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0.5, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [118]:
xg_grid.best_params_

{'gamma': 0.5, 'learning_rate': 0.01, 'n_estimators': 500}

In [119]:
bagg.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500)

In [120]:
pred3 = bagg.predict(X_test)

In [121]:
accuracy_score(y_test, pred3)

0.8044692737430168

# RandomForest Classifier

In [124]:
from sklearn.ensemble import RandomForestClassifier

In [125]:
rf_clf = RandomForestClassifier(max_depth=8, max_features=0.5, max_samples=0.5,
                       n_estimators=50)

In [126]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, max_features=0.5, max_samples=0.5,
                       n_estimators=50)

In [127]:
y_pred = rf_clf.predict(X_test)

In [128]:
accuracy_score(y_test, y_pred)

0.8044692737430168

In [129]:
score = cross_val_score(rf_clf, X_train , y_train, cv=10)

In [130]:
score

array([0.84722222, 0.79166667, 0.83098592, 0.76056338, 0.83098592,
       0.85915493, 0.83098592, 0.85915493, 0.84507042, 0.83098592])

In [131]:
score.mean()

0.828677621283255

In [133]:
xf = test.values

In [134]:
final_prediction = xg_grid.predict(xf)

In [135]:
final = pd.DataFrame()

In [137]:
final['PassengerId'] = PassengerId
final['Survived'] = final_prediction

In [138]:
final

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [139]:
final.to_csv('Submission.csv', index=False)

In [140]:
rf_ = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',  
                             random_state=10,
                             n_jobs=-1)

In [142]:
score = cross_val_score(rf_, X_train, y_train, cv=10)

In [143]:
score.mean()

0.8259194053208138

In [146]:
rf_.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=16, n_estimators=700, n_jobs=-1,
                       random_state=10)

In [147]:
final_prediction = rf_.predict(xf)

In [148]:
final['PassengerId'] = PassengerId
final['Survived'] = final_prediction

In [149]:
final.to_csv('Submission1.csv', index=False)