In [None]:
#importing basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Kaggle Titanic training and test splits into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

In [None]:
train.head()

In [None]:
train.shape, test.shape

## Data Preprocessing on Train data

Removing columns that we don't need

In [None]:
# Remove the columns that are not used in the rest of the analysis (in place).
train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

Checking for null values

In [None]:
train.isna().sum()

We have null values in two columns. Let's take care of this problem.

Let's fill the null values of the `Age` column with the column mean.

In [None]:
# Impute missing ages with the column mean. The result is assigned back to the
# column rather than calling fillna(inplace=True) on the `train['Age']`
# selection: that chained in-place form is deprecated and leaves `train`
# unchanged when pandas Copy-on-Write is active.
train['Age'] = train['Age'].fillna(train['Age'].mean())

Now we have to take care of null values of Embarked column.

Let's first check which embarkation port we have most in our dataset.

In [None]:
train.Embarked.value_counts()

`Southampton` is the top port of embarkation. So, let's fill the null values with `S`

In [None]:
# Impute missing embarkation ports with 'S'. Assign the result back instead of
# using a chained fillna(inplace=True), which is deprecated and does not update
# `train` when pandas Copy-on-Write is active.
train['Embarked'] = train['Embarked'].fillna('S')

Let's check again for null values.

In [None]:
train.isna().sum()

**Nice!**

We don't have any null values now.

# Data Exploration on Train set

Let's first check how many people survived

In [None]:
train.Survived.value_counts()

In [None]:
# Bar chart of the number of passengers per `Survived` value.
train['Survived'].value_counts().plot.bar(color=['lightblue', 'lightgreen']);

Let's check how many male and female passengers there were

In [None]:
train.Sex.value_counts()

In [None]:
# Bar chart of the number of passengers per `Sex` value.
train['Sex'].value_counts().plot.bar(color=['skyblue', 'plum']);

let's check out survivors w.r.t sex

In [None]:
pd.crosstab(train.Sex, train.Survived)

In [None]:
# Grouped bars: one group per sex, one bar per `Survived` value.
pd.crosstab(index=train['Sex'], columns=train['Survived']).plot.bar(
    color=['slategray', 'salmon']
);

Survivors w.r.t pclass

In [None]:
pd.crosstab(train.Pclass, train.Survived)

In [None]:
# Grouped bars: one group per passenger class, one bar per `Survived` value.
pd.crosstab(index=train['Pclass'], columns=train['Survived']).plot.bar(
    color=['slategray', 'lightcoral']
);

Let's check the Port of Embarkation

In [None]:
train.Embarked.value_counts()

Let's plot the embarkation counts, and then look at the distributions of the `Age` and `Fare` columns

In [None]:
# Count of passengers per embarkation port.
sns.countplot(data=train, x='Embarked');

In [None]:
# Histogram of `Age` with a kernel-density overlay.
sns.displot(data=train, x='Age', kde=True, color='cadetblue');

In [None]:
# Kernel-density estimate of `Fare`.
sns.displot(data=train, kind='kde', x='Fare');

Let's now find a relation among age, survived and pclass columns

In [None]:
# Regression of `Survived` on `Age`, with one fitted line per passenger class.
sns.lmplot(data=train, x='Age', y='Survived', hue='Pclass');

In [None]:
# Pairwise correlations of the numeric columns only. `Sex` and `Embarked`
# still hold strings at this point (they are one-hot encoded later), and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# numeric dtypes are selected explicitly before correlating.
correlation_matrix = train.select_dtypes(include='number').corr()
plt.figure(figsize=(15, 10))
# Annotated heatmap; values are printed with two decimals.
sns.heatmap(correlation_matrix,
            annot=True,
            linewidths=0.5,
            fmt=".2f",
            cmap="YlGnBu");

# Feature Engineering in train data

In [None]:
# Combine the `SibSp` and `Parch` counts into a single `family` feature.
train['family'] = train['SibSp'].add(train['Parch'])

In [None]:
train.head(10)

Removing skewness in `Age` column

In [None]:
# Replace `Age` with log(Age + 1) to reduce skew.
# Note: re-running this cell applies the transform a second time.
train['Age'] = np.log(train['Age'].add(1))

In [None]:
# Density of the log-transformed `Age` column.
train['Age'].plot.density(figsize=(10, 6));

Removing skewness in `Fare` column

In [None]:
# Replace `Fare` with log(Fare + 1) to reduce skew.
# Note: re-running this cell applies the transform a second time.
train['Fare'] = np.log(train['Fare'].add(1))

In [None]:
 train['Fare'].plot(kind = 'density', figsize=(10, 6));

In [None]:
train.head(10)

Let's create x and y matrix of features

In [None]:
# Separate the target column from the feature matrix.
y = train['Survived']
x = train.drop(columns='Survived')

In [None]:
x.shape

In [None]:
x.head()

We have two string-valued `categorical` columns (`Sex` and `Embarked`), and `Pclass` is a categorical class label as well. Let's one-hot encode all three now.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode; every other column is passed through unchanged
# and placed after the encoded columns in the output.
categorical_features = ['Sex', 'Embarked', 'Pclass']
onehotencode = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[('Encoder', onehotencode, categorical_features)],
    remainder='passthrough',
)

# Fit the encoder on the training features and transform them in one step.
encoded = transformer.fit_transform(x)

In [None]:
# Wrap the transformer output in a DataFrame; columns get integer labels 0..n-1.
encoded_df = pd.DataFrame(data=encoded)

In [None]:
encoded_df.shape

In [None]:
encoded_df.head()

**Avoiding the dummy variable trap**

In [None]:
# Drop one indicator column per encoded feature.
# NOTE(review): indices 0, 2 and 5 are the first Sex / Embarked / Pclass
# indicators only if those features have 2, 3 and 3 categories - confirm.
encoded_x = encoded_df.drop(columns=[0, 2, 5])

In [None]:
encoded_x.head()

In [None]:
encoded_x.shape

In [None]:
y.shape

# Feature Engineering in test data

In [None]:
# Same `family` feature as on the training set: `SibSp` plus `Parch`.
test['family'] = test['SibSp'].add(test['Parch'])

In [None]:
test.head()

Removing skewness in `Age` column

In [None]:
# Replace `Age` with log(Age + 1), mirroring the training-set transform.
# Missing values stay missing here; they are imputed in a later cell.
test['Age'] = np.log(test['Age'].add(1))

Removing skewness in `Fare` column

In [None]:
# Replace `Fare` with log(Fare + 1), mirroring the training-set transform.
test['Fare'] = np.log(test['Fare'].add(1))

In [None]:
# Density of the log-transformed test `Age` column.
test['Age'].plot.density(figsize=(10, 6));

In [None]:
# Density of the log-transformed test `Fare` column.
test['Fare'].plot.density(figsize=(10, 6));

In [None]:
test.head(10)

# Preparing test set

In [None]:
# Drop the same four columns that were removed from the training set (in place).
test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

In [None]:
test.head(10)

Checking for null values

In [None]:
test.isna().sum()

In [None]:
# Impute missing `Age` and `Fare` values with their column means. Both columns
# are already log-transformed at this point, so the means are taken on the
# log scale. The results are assigned back instead of using a chained
# fillna(inplace=True), which is deprecated and does not update `test` when
# pandas Copy-on-Write is active.
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
test.isna().sum()

We successfully removed all the null values

As before we now have to take care of `categorical columns`

In [None]:
# Encode the test set with the ColumnTransformer that was already fitted on
# the training features, instead of fitting a fresh encoder on the test data.
# Reusing the fitted transformer guarantees that the test columns have the
# same categories, in the same positions, as the columns the models were
# trained on. `test` has the same feature columns as `x` at this point.
encoded_test = transformer.transform(test)

In [None]:
# Wrap the encoded test array in a DataFrame; columns get integer labels 0..n-1.
encoded_test = pd.DataFrame(data=encoded_test)

In [None]:
encoded_test.head()

Avoiding dummy variable trap

In [None]:
# Drop the same indicator columns (0, 2, 5) that were dropped from the
# training matrix so both matrices keep identical column positions.
encoded_test_x = encoded_test.drop(columns=[0, 2, 5])

In [None]:
encoded_test_x.head()

In [None]:
encoded_test_x.shape

# Modeling

Let's split our dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for evaluation; the fixed seed keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    encoded_x,
    y,
    random_state=31,
)

In [None]:
len(x_train), len(x_test), len(y_train), len(y_test)

In [None]:
x_train.shape

In [None]:
y_train.shape

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression; score() is evaluated on the hold-out split.
log_clf = LogisticRegression(random_state=4, max_iter=1000).fit(x_train, y_train)
log_score = log_clf.score(x_test, y_test)
log_score

### Logistic Regression Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Search space for the liblinear solver: regularisation strength, penalty
# type and iteration cap (the grid's `max_iter` overrides the constructor's).
log_grid = dict(
    C=np.logspace(-4, 4),
    solver=['liblinear'],
    max_iter=np.arange(100, 2000, 100),
    penalty=['l1', 'l2'],
)

# Exhaustive 5-fold cross-validated search over every grid combination.
log_gscv = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=7),
    param_grid=log_grid,
    cv=5,
    verbose=True,
)
log_gscv.fit(x_train, y_train)

# Score of the refitted best estimator on the hold-out split.
log_tuned_score = log_gscv.score(x_test, y_test)
log_tuned_score

In [None]:
log_gscv.best_params_

### Evaluating logistic regression model

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the baseline logistic regression.
y_preds = log_clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the baseline logistic regression on the hold-out split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
RocCurveDisplay.from_estimator(log_clf, x_test, y_test)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve');

# Support Vector Classifier (SVC)

In [None]:
from sklearn import svm

# Baseline support vector classifier with default hyperparameters.
svc_clf = svm.SVC(random_state=7).fit(x_train, y_train)
svc_score = svc_clf.score(x_test, y_test)
svc_score

### SVC Hyperparameter tuning

In [None]:
# Only C is tuned. `degree` applies to the 'poly' kernel and is ignored by
# 'rbf', so searching over it would refit identical models three times.
svc_grid = {'C': np.logspace(-5, 5, 20),
            'kernel': ['rbf']
           }

# Exhaustive 5-fold cross-validated search over the grid.
svc_gscv = GridSearchCV(svm.SVC(random_state = 31),
                        param_grid=svc_grid,
                        cv=5,
                        verbose=True)

svc_gscv.fit(x_train, y_train)
# Score the tuned SVC search itself (not the logistic-regression search) so
# that the value reported as the SVC's tuned score really belongs to the SVC.
svc_tuned_score = svc_gscv.score(x_test, y_test)
svc_tuned_score

In [None]:
svc_gscv.best_params_

### Evaluating with SVC 

In [None]:
# Per-class precision / recall / F1 of the baseline SVC.
y_preds = svc_clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the baseline SVC on the hold-out split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
RocCurveDisplay.from_estimator(svc_clf, x_test, y_test)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve');

# KNeighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with default hyperparameters.
knn_clf = KNeighborsClassifier().fit(x_train, y_train)
knn_score = knn_clf.score(x_test, y_test)
knn_score

### KNeighbors Classifier Hyperparameter Tuning

In [None]:
# Search space: neighbour count, tree leaf size, Minkowski power `p`
# and the neighbour-search algorithm.
knn_grid = dict(
    n_neighbors=np.arange(2, 15),
    leaf_size=list(range(10, 55, 5)),
    p=list(range(1, 6)),
    algorithm=['auto', 'ball_tree', 'kd_tree'],
)

# Exhaustive 5-fold cross-validated search over every grid combination.
knn_gscv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_grid,
    cv=5,
    verbose=True,
)
knn_gscv.fit(x_train, y_train)

# Score of the refitted best estimator on the hold-out split.
knn_tuned_score = knn_gscv.score(x_test, y_test)
knn_tuned_score

In [None]:
knn_gscv.best_params_

### Evaluating KNN model

In [None]:
# Per-class precision / recall / F1 of the baseline KNN classifier.
y_preds = knn_clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the baseline KNN classifier on the hold-out split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
RocCurveDisplay.from_estimator(knn_clf, x_test, y_test)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve');

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with 1000 trees and a fixed seed.
rand_clf = RandomForestClassifier(random_state=35, n_estimators=1000)
rand_clf.fit(x_train, y_train)
ranf_score = rand_clf.score(x_test, y_test)
ranf_score

### Random Forest hyperparameter tuning

In [None]:
# Candidate values sampled by the randomized search.
rfcv_grid = {"n_estimators": np.arange(500, 2000, 100),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

# Seed both the forest and the randomized search. Without a seed the 100
# sampled candidates, `best_params_` and the tuned score change on every run.
rfcv_clf = RandomizedSearchCV(RandomForestClassifier(random_state=35),
                           param_distributions = rfcv_grid,
                           cv=5,
                           n_iter = 100,
                           random_state=35,
                           verbose=True)

rfcv_clf.fit(x_train, y_train)
# Score of the refitted best estimator on the hold-out split.
ranf_tuned_score = rfcv_clf.score(x_test, y_test)
ranf_tuned_score

In [None]:
rfcv_clf.best_params_

### Evaluating Random Forest model

In [None]:
# Per-class precision / recall / F1 of the tuned random forest search.
y_preds = rfcv_clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the tuned random forest search on the hold-out split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
RocCurveDisplay.from_estimator(rfcv_clf, x_test, y_test)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve');

# GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient boosting classifier with default hyperparameters.
gbc = GradientBoostingClassifier().fit(x_train, y_train)
gbc_score = gbc.score(x_test, y_test)
gbc_score

### GradientBoostingClassifier hyperparameter tuning

In [None]:
# Search space for gradient boosting. The binomial-deviance loss is spelled
# 'log_loss': the old name 'deviance' was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it makes every candidate fit fail.
gbc_grid = {'loss': ['log_loss', 'exponential'],
            'learning_rate': [0.1,0.01],
            'n_estimators': [100, 200, 500, 1000],
            'min_samples_split': [2, 4, 6, 8, 10],
            'min_samples_leaf': [1, 2, 3, 5],
            'max_depth': [1, 2, 3]}


# Exhaustive 5-fold cross-validated search over every grid combination.
gbc_clf = GridSearchCV(GradientBoostingClassifier(),
                      param_grid = gbc_grid,
                           cv=5,
                           verbose=True)
gbc_clf.fit(x_train, y_train)
# Score of the refitted best estimator on the hold-out split.
gbc_tuned_score = gbc_clf.score(x_test, y_test)
gbc_tuned_score

In [None]:
gbc_clf.best_params_

### Evaluating gradient boosting model

In [None]:
# Per-class precision / recall / F1 of the baseline gradient boosting model.
y_preds = gbc.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the baseline gradient boosting model on the hold-out split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
RocCurveDisplay.from_estimator(gbc, x_test, y_test)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve');

# CatBoost

In [None]:
from catboost import CatBoostClassifier

# Baseline CatBoost classifier with a fixed seed; training output is silenced.
cbc = CatBoostClassifier(random_seed=31)
cbc.fit(x_train, y_train, verbose=False)
cbc_score = cbc.score(x_test, y_test)
cbc_score

### CatBoostClassifier hyperparameter tuning

In [None]:
# Search space: number of boosting iterations and learning rate.
cbc_grid = dict(
    iterations=[10, 100, 200, 500, 1000],
    learning_rate=[0.1, 0.01],
)

# Exhaustive 5-fold cross-validated search over every grid combination.
cbc_clf = GridSearchCV(
    estimator=CatBoostClassifier(random_state=31),
    param_grid=cbc_grid,
    cv=5,
    verbose=True,
)

# `verbose=False` is forwarded to each CatBoost fit to silence training output.
cbc_clf.fit(x_train, y_train, verbose=False)
cbc_tuned_score = cbc_clf.score(x_test, y_test)
cbc_tuned_score

In [None]:
cbc_clf.best_params_

### Evaluating CatBoost model

In [None]:
# Per-class precision / recall / F1 of the tuned CatBoost search.
y_preds = cbc_clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the tuned CatBoost search on the hold-out split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
RocCurveDisplay.from_estimator(cbc_clf, x_test, y_test)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve');

# LGBM

In [None]:
from lightgbm import LGBMClassifier

# Baseline LightGBM classifier with default hyperparameters.
lgbm = LGBMClassifier().fit(x_train, y_train)
lgbm_score = lgbm.score(x_test, y_test)
lgbm_score

### LGBM hyperparameter tuning

In [None]:
# Search space: tree size, number of trees, learning rate and minimum leaf size.
lgbm_grid = {'num_leaves': [10, 15, 30, 31, 40, 45],
             'n_estimators':[10, 50, 100, 200],
             'learning_rate': [0.1, 0.01],
             'min_child_samples': [5, 10, 15, 20, 25]}


# Verbosity is set on the estimator instead of being passed to fit():
# LightGBM 4.0 removed the `verbose` argument of LGBMClassifier.fit, so
# forwarding it through GridSearchCV.fit raises a TypeError there.
# verbose=-1 silences LightGBM's training log.
lgbm_clf = GridSearchCV(LGBMClassifier(random_state = 31, verbose = -1),
                           param_grid = lgbm_grid,
                           cv=5,
                           verbose=True)

lgbm_clf.fit(x_train, y_train)
# Score of the refitted best estimator on the hold-out split.
lgbm_tuned_score = lgbm_clf.score(x_test, y_test)
lgbm_tuned_score

In [None]:
lgbm_clf.best_params_

### Evaluating LGBM model

In [None]:
# Per-class precision / recall / F1 of the baseline LightGBM model.
y_preds = lgbm.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the baseline LightGBM model on the hold-out split.
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
RocCurveDisplay.from_estimator(lgbm, x_test, y_test)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve');

Let's collect each model's baseline and tuned accuracy in a list of dictionaries

In [None]:
# One row per model: its name, baseline hold-out score and tuned hold-out score.
score = [
    {'Model': model_name, 'Score': baseline, 'Tuned_score': tuned}
    for model_name, baseline, tuned in (
        ('Logistic Regression', log_score, log_tuned_score),
        ('SVC', svc_score, svc_tuned_score),
        ('KNN', knn_score, knn_tuned_score),
        ('Random Forest', ranf_score, ranf_tuned_score),
        ('Gradient Boosting', gbc_score, gbc_tuned_score),
        ('CatBoost', cbc_score, cbc_tuned_score),
        ('LGBM', lgbm_score, lgbm_tuned_score),
    )
]

Let's view all model score as a dataframe to get a good overview

In [None]:
pd.DataFrame(score, columns=['Model','Score','Tuned_score'])

**Looks like SVC classifier is doing best. So, let's predict with this**.

In [None]:
# Predict the competition test set with the baseline (untuned) SVC `svc_clf`.
final_preds = svc_clf.predict(X=encoded_test_x)

# Creating file for submission 

In [None]:
# `PassengerId` is taken from the sample submission file because the column
# was dropped from `test` earlier in the notebook.
sub_data = pd.read_csv('../input/titanic/gender_submission.csv')
final_data = dict(PassengerId=sub_data['PassengerId'], Survived=final_preds)
final_submission = pd.DataFrame(final_data)

# Write the two-column submission file without the index.
final_submission.to_csv('submission_file_titanic.csv', index=False)

**`If this notebook was useful to you, don't forget to upvote. Thanks!`**