### Titanic Survival Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/sumathi16/Datasets/master/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Creating a new column Family in the dataset  by adding SibSp column and Parch column
df['Family'] = df['SibSp'] + df['Parch']
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1


In [4]:
# Ticket--> As it is just a random number, we can skip this column from our analysis
df.drop('Ticket',axis=1,inplace=True)

In [5]:
## Imputing with missing values
df.isnull().mean().sort_values(ascending=False)

Cabin          0.771044
Age            0.198653
Embarked       0.002245
Family         0.000000
Fare           0.000000
Parch          0.000000
SibSp          0.000000
Gender         0.000000
Name           0.000000
Pclass         0.000000
Survived       0.000000
PassengerId    0.000000
dtype: float64

In [6]:
# Removing Cabin column because 77% are missing value
df.drop('Cabin',axis=1,inplace=True)

In [7]:
# Median---> sorting in the order (middle value) NaN in the age column should be substituted with median of age
df.Age.fillna(df.Age.median(), inplace=True)
df.Age.isnull().sum()

0

In [8]:
## Imputing the Embarked Column---> Mode Imputation
df.Embarked.fillna(df.Embarked.mode()[0], inplace=True)
df.Embarked.isnull().sum()

0

In [9]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Gender         0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Family         0
dtype: int64

In [10]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Gender          object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
Family           int64
dtype: object

In [11]:
## Remove passenger ID
df.drop('PassengerId',axis=1,inplace=True)

In [12]:
df.drop('Name',axis=1,inplace=True)

In [13]:
df.nunique()

Survived      2
Pclass        3
Gender        2
Age          88
SibSp         7
Parch         7
Fare        248
Embarked      3
Family        9
dtype: int64

In [14]:
## Seperating the Input and Output Data, dropping the Survived column from the data
X = df.drop('Survived', axis=1)
X.shape

(891, 8)

In [15]:
X= pd.get_dummies(X)
X.shape

(891, 11)

In [16]:
## Output Column
y = df.iloc[:,0]
y.shape


(891,)

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23) 
                                                        # random_state = to select the constant rows


In [18]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(668, 11)
(223, 11)
(668,)
(223,)


In [19]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression() # creating an object for Logistic Regression
log_reg.fit(X_train, y_train)
y_train_pred = log_reg.predict(X_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
## accuracy_score---> With help of this metric, we can evaluate the overall 
## performance of the model
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_train_pred)

0.8098802395209581

[DOCUMENT](https://docs.google.com/document/d/1FlNFwN57ySHI1hvq_21ZX0qYDOdQpmEhxPSnYMQUI1c/edit#heading=h.5fxfnpeqe7fs)

[ENSEMBLE LEARNING - BOOSTING](https://docs.google.com/document/d/1zLoByQkZKMiRsuX6Kn9lH-N6rv8qr5PsKfrgcSrRw7o/edit)

In [23]:
# importing the package 
from sklearn.ensemble import AdaBoostClassifier
# instance creation
adc=AdaBoostClassifier()
## training the model
adc.fit(X_train,y_train)
# prediction
y_pred=adc.predict(X_test)

In [24]:
accuracy_score(y_test,y_pred)

0.8071748878923767

In [25]:
accuracy_score(y_train,adc.predict(X_train))

0.8413173652694611

In [None]:
#from sklearn.metrics import classification_report
#classification_report()

In [26]:
adc

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

###### trying with different parameters
EX:
learning rate=0.5
n_estimators=25


In [27]:
# importing the package 
from sklearn.ensemble import AdaBoostClassifier
# instance creation
adc=AdaBoostClassifier(learning_rate = 1,n_estimators= 75)
## training the model
adc.fit(X_train,y_train)
# prediction
y_pred=adc.predict(X_test)
print("test accuracy:",accuracy_score(y_test,y_pred))
print("train accuracy",accuracy_score(y_train,adc.predict(X_train)))

test accuracy: 0.7982062780269058
train accuracy 0.8398203592814372


###### With base learner

In [28]:
# importing the package 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# instance creation
adc=AdaBoostClassifier(base_estimator=LogisticRegression())
## training the model
adc.fit(X_train,y_train)
# prediction
y_pred=adc.predict(X_test)
print("test accuracy:",accuracy_score(y_test,y_pred))
print("train accuracy",accuracy_score(y_train,adc.predict(X_train)))

test accuracy: 0.7623318385650224
train accuracy 0.8068862275449101


In [34]:
# import the algo
from sklearn.ensemble import GradientBoostingClassifier
gdc = GradientBoostingClassifier()
gdc.fit(X_train,y_train)
# prediction
y_pred = gdc.predict(X_test)
print("test accuarcy",accuracy_score(y_pred, y_test))
print("train accuarcy",accuracy_score(y_train,gdc.predict(X_train) ))

test accuarcy 0.8026905829596412
train accuarcy 0.905688622754491


In [35]:
#accuracy_score(y_train,adc.predict(X_train))

###### Scores with different parameters
```Sai Rohit
learning_rate=1
test accuracy:  0.8116591928251121
Train accuracy:  0.9835329341317365
Venkat
train :0.905688622754491
test :0.8026905829596412
SAI
n_estimators=40
test accuracy: 0.820627802690583
train accuracy: 0.8622754491017964```

In [36]:
#from sklearn.ensemble import GradientBoostingClassifier
#gdc=GradientBoostingClassifier()
#gdc.fit(X_train,y_train)
#y_pred=gdc.predict(X_test)
#print("test accuracy:",accuracy_score(y_test,y_pred))
#print("train accuracy",accuracy_score(y_train,adc.predict(X_train)))

In [40]:
! pip install xgboost

Collecting xgboost
  Downloading xgboost-1.1.1-py3-none-win_amd64.whl (54.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.1.1


# XGBOOST

In [31]:
import xgboost as xgb
from xgboost import XGBClassifier
xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_test)
print("test accuracy:",accuracy_score(preds,y_test))
print("train accuracy",accuracy_score(y_train,xgb.predict(X_train)))

test accuracy: 0.7937219730941704
train accuracy 0.9730538922155688


In [32]:
xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [33]:
### Check the documentation
help(xgb)

Help on XGBClassifier in module xgboost.sklearn object:

class XGBClassifier(XGBModel, sklearn.base.ClassifierMixin)
 |  XGBClassifier(objective='binary:logistic', **kwargs)
 |  
 |  Implementation of the scikit-learn API for XGBoost classification.
 |  
 |  
 |  Parameters
 |  ----------
 |  
 |      max_depth : int
 |          Maximum tree depth for base learners.
 |      learning_rate : float
 |          Boosting learning rate (xgb's "eta")
 |      verbosity : int
 |          The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
 |      objective : string or callable
 |          Specify the learning task and the corresponding learning objective or
 |          a custom objective function to be used (see note below).
 |      booster: string
 |          Specify which booster to use: gbtree, gblinear or dart.
 |      tree_method: string
 |          Specify which tree method to use.  Default to auto.  If this parameter
 |          is set to default, XGBoost will choose the mo

Try to visulize the metrics <br>
X axis --> the algo names<br>
y axis --> train accuarcy,test accuarcy<br>