In [None]:

# Import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the training CSV into `train` and preview its first rows.
train = pd.read_csv("../input/titanic/train.csv")
train.head()

In [None]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

In [None]:
# Summary statistics of the training frame as reported by DataFrame.describe().
train.describe()

In [None]:
# Distribution of passenger ages: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# documented replacement and belongs to the same API family as the `displot`
# call used further down.
plt.figure(figsize=(10,5))
sns.histplot(data=train, x='Age', kde=True, stat='density')

**Most of the passengers on the ship are between 20 and 40 years of age.**

In [None]:
# How many people survived?
with plt.xkcd():
    fig = plt.figure(figsize=(20,1))

    y = train.Survived.value_counts()
    # Draw the bars in the same order as `y`, so that the label written at
    # position `index` below always lands on the bar it belongs to.
    sns.countplot(y='Survived', data=train, order=y.index);
    print(y)

    # Write each count at the end of its bar.
    for index, count in enumerate(y):
        plt.text(count, index,
                 str(count))
    plt.title("Survived Count")

**Only 342 passengers survived.**

In [None]:
# Let's view the distribution of Sex
with plt.xkcd():
    plt.figure(figsize=(20, 5))

    y = train.Sex.value_counts()
    # Draw the bars in the same order as `y`, so that the label written at
    # position `index` below always lands on the bar it belongs to.
    sns.countplot(y="Sex", data=train, order=y.index)

    # Write each count at the end of its bar.
    for index, count in enumerate(y):
        plt.text(count, index,
                 str(count))

    plt.title("Sex")

In [None]:
# Filled kernel density estimate of Age (`fill=` replaces the deprecated `shade=`).
sns.kdeplot(data=train['Age'], fill=True)

In [None]:
# Age histogram coloured by the Survived column; alpha keeps overlapping bars visible.
sns.displot(data=train, x='Age', hue='Survived', alpha=0.6)
plt.show()

## Preprocessing Data

In [None]:
# Missing values per column; display only the columns that have at least one.
null_values = train.isna().sum()
null_values.loc[null_values > 0]


In [None]:
# Drop the PassengerId column from the training frame.
# Rebinding `train` with errors="ignore" (instead of an in-place drop) keeps
# this cell safe to re-run after the column is already gone.
train = train.drop(columns="PassengerId", errors="ignore")

In [None]:
# Preview the object-dtype (text) columns; these are the ones preprocessing_data will encode.
train.select_dtypes('object').head()

In [None]:
def preprocessing_data(df):
    """Return a copy of ``df`` with missing numbers filled and text columns encoded.

    * Numeric columns: missing values are replaced by the column mean.
    * Non-numeric columns: replaced by integer category codes shifted by one
      (``pd.Categorical(...).codes + 1``), so missing values become 0 and the
      observed categories are numbered from 1.

    The codes are derived from the values present in ``df`` alone, so the same
    label can receive different codes in two separately processed frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame is never changed behind its back.
    processed = df.copy()

    for label, content in df.items():
        if pd.api.types.is_numeric_dtype(content):
            # Only touch numeric columns that actually contain missing values.
            if content.isna().any():
                processed[label] = content.fillna(content.mean())
        else:
            # Missing entries have code -1, hence 0 after the shift.
            processed[label] = pd.Categorical(content).codes + 1

    return processed

In [None]:
# Impute and encode the training frame, then preview the result
# (`.head()` instead of the whole frame, like the other preview cells).
train = preprocessing_data(df=train)
train.head()

In [None]:
# Drop the Name and Ticket columns from the training frame.
# Rebinding `train` with errors="ignore" (instead of an in-place drop) keeps
# this cell safe to re-run after the columns are already gone.
train = train.drop(columns=['Name', 'Ticket'], errors="ignore")

In [None]:
# Preview the columns that remain after the drops above.
train.head()

## Checking the Outliers

In [None]:
# Box plot of Age. The series is passed as `x=` explicitly because a positional
# argument is interpreted differently across seaborn versions.
sns.boxplot(x=train['Age'])

## Modelling

In [None]:
# Import required libraries
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.over_sampling import SMOTE

# Import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report




In [None]:
# Separate the features from the target column.
X = train.drop('Survived', axis=1)
y = train['Survived']

# Hold out 20% of the rows for evaluation before any resampling takes place.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

print("----Imabalanced Classification-----")
counter = Counter(y_train)
print(counter)

# Oversample the minority class with SMOTE on the training split only.
# Resampling the full X, y would (a) leave X_train, which the models are fitted
# on, unbalanced and (b) put synthetic rows interpolated from hold-out rows
# into the data used for evaluation.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution of the training split
print("----Balanced Classification----")
counter = Counter(y_train)
print(counter)



In [None]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier

# random_state is passed explicitly so that both this fit and the clones fitted
# inside cross_val_score are reproducible (this cell previously had no seed).
model1 = RandomForestClassifier(n_jobs=-1, random_state=42)
model1.fit(X_train, y_train)

# Class predictions for the hold-out split.
y_preds = model1.predict(X_test)

# Repeated stratified 10-fold cross-validation (3 repeats) on X, y, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model1, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = np.mean(scores)

print("------Cross_validation_scores-----")
# The CV mean used to be computed and then discarded; report it.
print(f"Mean ROC AUC : {score:.4f}")
print()
print("------Hold_out_scores-----")
print(f"Accuracy Score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision Score : {precision_score(y_test, y_preds)}")
print(f"Recall score : {recall_score(y_test, y_preds)}")
print(f"F1 Score : {f1_score(y_test, y_preds)}")
print()
print(f"Classification Report : {classification_report(y_test, y_preds)}")
print()
print()
plt.figure(figsize=(10,5))
# fmt='d' prints the integer counts as-is (the default format abbreviates large counts).
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

In [None]:
#Decision tree
from sklearn.tree import DecisionTreeClassifier
# Seed NumPy's global RNG before fitting.
np.random.seed(42)
model2 = DecisionTreeClassifier()
model2.fit(X_train, y_train)

# Class predictions for the hold-out split.
y_preds = model2.predict(X_test)

# Repeated stratified 10-fold cross-validation (3 repeats) on X, y, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model2, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = np.mean(scores)

print("------Cross_validation_scores-----")
# The CV mean used to be computed and then discarded; report it.
print(f"Mean ROC AUC : {score:.4f}")
print()
print("------Hold_out_scores-----")
print(f"Accuracy Score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision Score : {precision_score(y_test, y_preds)}")
print(f"Recall score : {recall_score(y_test, y_preds)}")
print(f"F1 Score : {f1_score(y_test, y_preds)}")
print()
print(f"Classification Report : {classification_report(y_test, y_preds)}")
print()
print()
plt.figure(figsize=(10,5))
# fmt='d' prints the integer counts as-is (the default format abbreviates large counts).
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

In [None]:
## Gradient Boost
from sklearn.ensemble import GradientBoostingClassifier

# Seed NumPy's global RNG before fitting.
np.random.seed(42)
model3 = GradientBoostingClassifier()
model3.fit(X_train, y_train)

# Class predictions for the hold-out split.
y_preds = model3.predict(X_test)

# Repeated stratified 10-fold cross-validation (3 repeats) on X, y, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model3, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = np.mean(scores)

print("------Cross_validation_scores-----")
# The CV mean used to be computed and then discarded; report it.
print(f"Mean ROC AUC : {score:.4f}")
print()
print("------Hold_out_scores-----")
print(f"Accuracy Score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision Score : {precision_score(y_test, y_preds)}")
print(f"Recall score : {recall_score(y_test, y_preds)}")
print(f"F1 Score : {f1_score(y_test, y_preds)}")
print()
print(f"Classification Report : {classification_report(y_test, y_preds)}")
print()
print()
plt.figure(figsize=(10,5))
# fmt='d' prints the integer counts as-is (the default format abbreviates large counts).
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Seed NumPy's global RNG before fitting.
np.random.seed(42)

model4 = KNeighborsClassifier()
model4.fit(X_train, y_train)

# Class predictions for the hold-out split.
y_preds = model4.predict(X_test)

# Repeated stratified 10-fold cross-validation (3 repeats) on X, y, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model4, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = np.mean(scores)

print("------Cross_validation_scores-----")
# The CV mean used to be computed and then discarded; report it.
print(f"Mean ROC AUC : {score:.4f}")
print()
print("------Hold_out_scores-----")
print(f"Accuracy Score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision Score : {precision_score(y_test, y_preds)}")
print(f"Recall score : {recall_score(y_test, y_preds)}")
print(f"F1 Score : {f1_score(y_test, y_preds)}")
print()
print(f"Classification Report : {classification_report(y_test, y_preds)}")
print()
print()
plt.figure(figsize=(10,5))
# fmt='d' prints the integer counts as-is (the default format abbreviates large counts).
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

In [None]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression

# Seed NumPy's global RNG before fitting.
np.random.seed(42)
# NOTE(review): the global warnings filter set at the top of the notebook would
# hide a ConvergenceWarning here - confirm the solver converges with the
# default settings.
model5 = LogisticRegression()
model5.fit(X_train, y_train)

# Class predictions for the hold-out split.
y_preds = model5.predict(X_test)

# Repeated stratified 10-fold cross-validation (3 repeats) on X, y, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model5, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = np.mean(scores)

print("------Cross_validation_scores-----")
# The CV mean used to be computed and then discarded; report it.
print(f"Mean ROC AUC : {score:.4f}")
print()
print("------Hold_out_scores-----")
print(f"Accuracy Score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision Score : {precision_score(y_test, y_preds)}")
print(f"Recall score : {recall_score(y_test, y_preds)}")
print(f"F1 Score : {f1_score(y_test, y_preds)}")
print()
print(f"Classification Report : {classification_report(y_test, y_preds)}")
print()
print()
plt.figure(figsize=(10,5))
# fmt='d' prints the integer counts as-is (the default format abbreviates large counts).
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

In [None]:
## XGB
from xgboost import XGBClassifier

# Seed NumPy's global RNG before fitting.
np.random.seed(42)
model6 = XGBClassifier()
model6.fit(X_train, y_train)

# Class predictions for the hold-out split.
y_preds = model6.predict(X_test)

# Repeated stratified 10-fold cross-validation (3 repeats) on X, y, scored by ROC AUC.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model6, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = np.mean(scores)

print("------Cross_validation_scores-----")
# The CV mean used to be computed and then discarded; report it.
print(f"Mean ROC AUC : {score:.4f}")
print()
print("------Hold_out_scores-----")
print(f"Accuracy Score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision Score : {precision_score(y_test, y_preds)}")
print(f"Recall score : {recall_score(y_test, y_preds)}")
print(f"F1 Score : {f1_score(y_test, y_preds)}")
print()
print(f"Classification Report : {classification_report(y_test, y_preds)}")
print()
print()
plt.figure(figsize=(10,5))
# fmt='d' prints the integer counts as-is (the default format abbreviates large counts).
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

In [None]:
#Catboost
from catboost import CatBoostClassifier

# Seed NumPy's global RNG before fitting.
np.random.seed(42)
model7 = CatBoostClassifier(verbose=0)
model7.fit(X_train, y_train)

# Class predictions for the hold-out split.
y_preds = model7.predict(X_test)

# Repeated stratified 10-fold cross-validation (3 repeats) on X, y, scored by ROC AUC.
# Cross-validate model7 itself: this call previously passed model1 (the random
# forest), so the score did not belong to CatBoost.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model7, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
score = np.mean(scores)

print("------Cross_validation_scores-----")
# The CV mean used to be computed and then discarded; report it.
print(f"Mean ROC AUC : {score:.4f}")
print()
print("------Hold_out_scores-----")
print(f"Accuracy Score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision Score : {precision_score(y_test, y_preds)}")
print(f"Recall score : {recall_score(y_test, y_preds)}")
print(f"F1 Score : {f1_score(y_test, y_preds)}")
print()
print(f"Classification Report : {classification_report(y_test, y_preds)}")
print()
print()
plt.figure(figsize=(10,5))
# fmt='d' prints the integer counts as-is (the default format abbreviates large counts).
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")

In [None]:
# Class predictions from the CatBoost model (model7) for the hold-out split.
y_preds = model7.predict(X_test)
y_preds


In [None]:
# Predicted probabilities from model7 for the hold-out split; keep column 1 only.
y_pred_proba = model7.predict_proba(X_test)[:, 1]
y_pred_proba

In [None]:
#ROC Curve
# Calculate the roc curve from the hold-out labels and the predicted probabilities
from sklearn.metrics import roc_curve
fpr, tpr, thresholds  = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, marker='*', label='roc_curve')
# Label the axes so the figure can be read on its own.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

## Import Test Set

In [None]:
# Read the test CSV into `test` and preview its first rows.
test = pd.read_csv("../input/titanic/test.csv")
test.head()


In [None]:
# Remove the Name and Ticket columns, as was done for the training frame.
test = test.drop(columns=['Name', 'Ticket'])

In [None]:
# Impute and encode the test frame with the same helper used for `train`.
# NOTE(review): the helper derives fill values and category codes from the frame
# it is given, so they come from `test` itself here, not from `train`.
test = preprocessing_data(df=test)
test.head()

In [None]:
# Modelling
# Fit a fresh CatBoost model on the training split and predict the test set.
np.random.seed(42)
model = CatBoostClassifier(verbose=0)
model.fit(X_train, y_train)

# `test` still carries PassengerId (kept for the submission file), which the
# model was not trained on. Select exactly the training feature columns, in
# training order, so the inputs line up with what the model was fitted on.
# NOTE(review): `test` was encoded by a separate preprocessing_data call, so the
# category codes of text columns (e.g. Cabin) may not match the training codes.
y_preds = model.predict(test[X_train.columns])
y_preds

In [None]:
# Assemble the submission table: the PassengerId column of `test` next to the predictions.
data = test[['PassengerId']].copy()
data['Survived'] = y_preds
data.head()

In [None]:
# Write the submission file without the index column. The name now carries the
# .csv extension (it was previously written as an extension-less file).
data.to_csv('submission.csv', index=False)