In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline 

## Models
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb 
from keras.models import Sequential
from keras.layers import Dense
from sklearn.neural_network import MLPClassifier

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve


In [None]:
# Read the training CSV and discard the Name and Ticket columns in one expression
train_data = (
    pd.read_csv("/kaggle/input/titanic/train.csv")
    .drop(columns=['Name', 'Ticket'])
)
train_data.tail()

In [None]:
# Read the test CSV and discard the Name and Ticket columns in one expression
test_data = (
    pd.read_csv("/kaggle/input/titanic/test.csv")
    .drop(columns=['Name', 'Ticket'])
)
test_data.tail()

In [None]:
# Count rows per Survived value, with one bar colour per Pclass
sns.set_theme(style="darkgrid")
ax = sns.countplot(
    data=train_data,
    x="Survived",
    hue='Pclass',
)
ax

In [None]:
import plotly.express as px

# Box plot of Age per Pclass in the training frame, coloured by Sex.
# quartilemethod may also be "inclusive", or "linear" (the default).
fig = (
    px.box(train_data, x="Pclass", y="Age", color="Sex")
    .update_traces(quartilemethod="exclusive")
)
fig.show()

In [None]:
def impute_age(cols):
    """Return the passenger's age, substituting a fixed estimate when it is missing.

    Parameters
    ----------
    cols : sequence
        Row values in the order (Age, Pclass, Sex), e.g. the row Series that
        ``df[['Age', 'Pclass', 'Sex']].apply(impute_age, axis=1)`` passes in.
        Only the first three values are read.

    Returns
    -------
    The original age when it is not null. Otherwise the hard-coded estimate
    for the (Pclass, Sex) pair. If the age is null and the pair is not in the
    table, the null age is returned unchanged (instead of an implicit None).
    """
    # Unpack by iteration rather than cols[0]/cols[1]/cols[2]: integer keys on
    # a label-indexed Series are a deprecated positional fallback in pandas 2.x.
    age, pclass, sex = list(cols)[:3]

    if not pd.isnull(age):
        return age

    # Hard-coded replacement ages keyed by (Pclass, Sex).
    fallback_ages = {
        (1, 'male'): 40,
        (1, 'female'): 35,
        (2, 'male'): 30,
        (2, 'female'): 28,
        (3, 'male'): 25,
        (3, 'female'): 22,
    }
    return fallback_ages.get((pclass, sex), age)

# Run the row-wise age imputation over the three columns it reads
train_age_inputs = train_data[['Age', 'Pclass', 'Sex']]
train_data['Age'] = train_age_inputs.apply(impute_age, axis=1)

In [None]:
# Index labels of the training rows whose Embarked value is null
train_data.index[train_data['Embarked'].isnull()].tolist()

In [None]:
# Drop the Cabin column, then every row that still contains a null.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError on 'Cabin'.
train_data = train_data.drop(columns='Cabin', errors='ignore').dropna()

# Remaining null count per column
train_data.isnull().sum()

In [None]:
# Box plot of Age per Pclass in the test frame, coloured by Sex.
# quartilemethod may also be "inclusive", or "linear" (the default).
fig = (
    px.box(test_data, x="Pclass", y="Age", color="Sex")
    .update_traces(quartilemethod="exclusive")
)
fig.show()

In [None]:
def impute_age1(cols):
    """Return the passenger's age, substituting a fixed estimate when it is missing.

    Same contract as the training-set imputer but with its own table of
    replacement ages.

    Parameters
    ----------
    cols : sequence
        Row values in the order (Age, Pclass, Sex), e.g. the row Series that
        ``df[['Age', 'Pclass', 'Sex']].apply(impute_age1, axis=1)`` passes in.
        Only the first three values are read.

    Returns
    -------
    The original age when it is not null. Otherwise the hard-coded estimate
    for the (Pclass, Sex) pair. If the age is null and the pair is not in the
    table, the null age is returned unchanged (instead of an implicit None).
    """
    # Unpack by iteration rather than cols[0]/cols[1]/cols[2]: integer keys on
    # a label-indexed Series are a deprecated positional fallback in pandas 2.x.
    age, pclass, sex = list(cols)[:3]

    if not pd.isnull(age):
        return age

    # Hard-coded replacement ages keyed by (Pclass, Sex).
    fallback_ages = {
        (1, 'male'): 42,
        (1, 'female'): 41,
        (2, 'male'): 28,
        (2, 'female'): 24,
        (3, 'male'): 24,
        (3, 'female'): 22,
    }
    return fallback_ages.get((pclass, sex), age)

# Run the row-wise age imputation over the three columns it reads
test_age_inputs = test_data[['Age', 'Pclass', 'Sex']]
test_data['Age'] = test_age_inputs.apply(impute_age1, axis=1)

In [None]:
# Drop the Cabin column. errors='ignore' plus rebinding keeps the cell
# re-runnable (a second execution no longer raises KeyError on 'Cabin').
test_data = test_data.drop(columns='Cabin', errors='ignore')

# Fill missing fares with the most frequent fare. Assign the result back to the
# column: test_data['Fare'].fillna(..., inplace=True) is chained assignment,
# which warns in pandas 2.2 and leaves test_data unchanged under Copy-on-Write.
fare_mode = test_data['Fare'].mode().iloc[0]
test_data['Fare'] = test_data['Fare'].fillna(fare_mode)

# Remaining null count per column
test_data.isnull().sum()

In [None]:
# Number of rows in the test frame
len(test_data.index)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Target column and one-hot encoded predictors built from the training frame
y = train_data["Survived"]

features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = pd.get_dummies(train_data[features])

# Hold out 20% of the training rows for a final accuracy check
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=101)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=10)]
# Number of features to consider at every split.
# 'auto' was removed in scikit-learn 1.3 and, for classifiers, was only an
# alias of 'sqrt', so the old ['auto', 'sqrt'] list held one distinct setting.
# 'sqrt' and 'log2' are accepted by old and new releases alike.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [2, 8]
# Minimum number of samples required to split a node
min_samples_split = [2, 8]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Search space handed to the hyper-parameter search
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}

# Seed the forest so repeated runs grow the same trees
rf_Model = RandomForestClassifier(random_state=101)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search over param_grid with 50-fold cross-validation.
# random_state fixes which parameter combinations are sampled, so best_params_
# no longer changes from run to run because of the sampling step.
rf_RandomGrid = RandomizedSearchCV(estimator=rf_Model,
                                   param_distributions=param_grid,
                                   cv=50,
                                   verbose=2,
                                   n_jobs=4,
                                   random_state=101)
rf_RandomGrid.fit(X_train, y_train)
rf_RandomGrid.best_params_

In [None]:
# Score the fitted search object on the rows it was fitted on ...
train_accuracy = rf_RandomGrid.score(X_train, y_train)
print(f'Train Accuracy - : {train_accuracy:.3f}')

# ... and on the held-out rows
test_accuracy = rf_RandomGrid.score(X_test, y_test)
print(f'Test Accuracy - : {test_accuracy:.3f}')

In [None]:
# One-hot encode the Kaggle test frame and align it to the training columns, so
# the model sees the same features in the same order even if a category is
# absent from one of the two files.
# NOTE(review): this rebinds X_test, replacing the hold-out split created by
# train_test_split; the name is kept so any later cell reading X_test still works.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Final forest with fixed hyper-parameters, fitted on all training rows.
# max_features='sqrt' replaces 'auto', which was removed in scikit-learn 1.3 and
# meant 'sqrt' for classifiers; random_state makes the submission reproducible.
model = RandomForestClassifier(n_estimators=118, criterion='gini', max_depth=8,
                               min_samples_split=8, min_samples_leaf=1,
                               max_features='sqrt',
                               max_leaf_nodes=None, bootstrap=True,
                               random_state=101)
model.fit(X, y)
predictions = model.predict(X_test)

# Write the PassengerId / Survived pairs to the submission file
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission2.csv', index=False)