In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier  
from sklearn.preprocessing import StandardScaler,LabelEncoder  
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV,cross_validate 
from sklearn.metrics import (confusion_matrix, precision_score,recall_score,
                              classification_report,make_scorer,fbeta_score,roc_auc_score) 
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
import xgboost
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier


In [6]:
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below - consider narrowing the filter.
warnings.filterwarnings("ignore")

In [7]:
# Read the training and test CSV files from the Kaggle input directory.
train = pd.read_csv(r'/kaggle/input/titanic/train.csv')
test = pd.read_csv(r'/kaggle/input/titanic/test.csv')
# Show the first rows of the training frame.
train.head()

In [8]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [9]:
# Bar chart of how many rows fall into each Survived class.
sns.countplot(data=train,x='Survived')

The event rate (share of survivors) is 38% across 891 rows, and there are some null values in Age, Cabin, and Embarked.

In [10]:
# Proportion of rows in each Survived class.
train['Survived'].value_counts(normalize = True)

In [11]:
# Summary statistics for the training frame.
train.describe()

In [12]:
# List the column names of the training frame.
train.columns

In [13]:
# Number of missing values in each column.
train.isnull().sum()

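Cabin is missing for almost 77% of the rows, so it does not make sense to fill in its missing values; the column is dropped instead.
Missing Embarked values are replaced with the mode of the category, and missing Age values are replaced with a typical age for the passenger's title (see the Age section below).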

In [14]:
# Remove the Cabin and Ticket columns from both frames.
unused_columns = ['Cabin', 'Ticket']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# 1. EDA and data visualization

In [15]:
# Bar plots of Survived by Sex, first split by Embarked and then by Pclass.
sns.catplot(data=train, x="Sex", y="Survived", hue="Embarked", kind="bar")
sns.catplot(data=train, x="Sex", y="Survived", hue="Pclass", kind="bar")

In [16]:
# Histogram of the Age column.
train['Age'].hist()

In [17]:
# 2x2 grid of box plots comparing Age, Fare, SibSp and Parch across the
# Survived classes; the first two panels are additionally split by Sex.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

panel_specs = [
    (ax[0][0], "Age", 'Sex'),
    (ax[0][1], "Fare", 'Sex'),
    (ax[1][0], "SibSp", None),
    (ax[1][1], "Parch", None),
]
for panel, feature, split_by in panel_specs:
    sns.boxplot(data=train, x="Survived", y=feature, hue=split_by, ax=panel)

In [18]:
# Number of rows for each value of Sex.
train['Sex'].value_counts()

In [19]:
# Horizontal box plots of Fare for each Survived class, one row of panels per Pclass.
g = sns.catplot(x="Fare", y="Survived", row="Pclass",
                kind="box", orient="h", height=1.5, aspect=4,
                data=train)
# Log-scale the Fare axis.
g.set(xscale="log")

In [20]:
# Horizontal box plots of Fare for each Survived class, one row of panels per Embarked value.
g = sns.catplot(x="Fare", y="Survived", row="Embarked",
                kind="box", orient="h", height=1.5, aspect=4,
                data=train)
# Log-scale the Fare axis.
g.set(xscale="log")

# 2. Correlation

In [21]:
# Pearson correlation between the numeric columns only. Name, Sex and Embarked
# are still text at this point, and DataFrame.corr raises on non-numeric
# columns in pandas >= 2.0 instead of silently skipping them.
correlation = train.select_dtypes(include='number').corr(method='pearson')
# Column names ordered by descending correlation with the target.
columns = correlation.nlargest(20, 'Survived').index
columns

In [22]:
plt.figure(figsize=(12,12))
# DataFrame.corr skips missing values pair by pair, whereas np.corrcoef turns
# every coefficient involving a column with NaNs (Age has not been imputed yet)
# into NaN, which blanks out that feature's row and column in the heatmap.
correlation_map = train[columns].corr(method='pearson')
sns.set(font_scale=1.0)
heatmap = sns.heatmap(correlation_map, cbar=True, annot=True, square=True, fmt='.2f', yticklabels=columns.values, xticklabels=columns.values)

plt.show()

# 3. Feature engineering 

In [23]:
# List the column names at this point.
train.columns

In [24]:
# Peek at the first Name values.
train['Name'].head()

In [25]:
# Overwrite Name with the first run of letters that is followed by a period
# (the honorific inside the full name).
# The pattern is a raw string because "\." is an invalid escape sequence in a
# normal string literal; expand=False makes str.extract return a Series rather
# than a one-column DataFrame.
title_pattern = r'([A-Za-z]+)\.'
train['Name'] = train['Name'].str.extract(title_pattern, expand=False)
test['Name'] = test['Name'].str.extract(title_pattern, expand=False)

In [26]:
# Frequency of each extracted title (Name now holds the title only).
train['Name'].value_counts()

In [27]:
def convert_title(title):
    """Collapse a raw honorific into one of five title groups.

    Parameters
    ----------
    title : object
        The honorific extracted from a passenger name (e.g. "Mr", "Mlle").
        Any value that is not one of the recognised strings, including
        missing values, is treated as "Other".

    Returns
    -------
    str
        One of "Miss", "Mrs", "Mr", "Master" or "Other".
    """
    # "Mlle" (Mademoiselle) is the French equivalent of "Miss"; the previous
    # spelling "Mile" never matched, so those passengers fell into "Other".
    if title in ("Ms", "Mlle", "Miss"):
        return "Miss"
    # "Mme" (Madame) is the French equivalent of "Mrs".
    if title in ("Mme", "Mrs"):
        return "Mrs"
    if title == "Mr":
        return "Mr"
    if title == "Master":
        return "Master"
    return "Other"
        
# Derive the grouped Title column for both frames from the extracted honorifics.
for frame in (train, test):
    frame["Title"] = frame["Name"].map(convert_title)

# Size of each title group in the training data.
train["Title"].value_counts()

In [28]:
# Name has been replaced by Title, so remove it from both frames.
train = train.drop(columns=['Name'])
test = test.drop(columns=['Name'])

**Age**

In [29]:
# Number of training rows whose Age is missing.
train['Age'].isnull().sum()

In [30]:
# Mean Age within each Title group.
train.groupby(['Title'])['Age'].mean()

In [31]:
# Fill missing training ages with a fixed age chosen per title group.
age_by_title = {'Master': 5, 'Miss': 22, 'Mr': 32, 'Mrs': 36, 'Other': 44}
for title_group, fill_age in age_by_title.items():
    missing_for_group = train['Age'].isnull() & (train["Title"] == title_group)
    train.loc[missing_for_group, 'Age'] = fill_age

In [32]:
# Fill missing test ages with the same fixed per-title ages used for training.
age_by_title = {'Master': 5, 'Miss': 22, 'Mr': 32, 'Mrs': 36, 'Other': 44}
for title_group, fill_age in age_by_title.items():
    missing_for_group = test['Age'].isnull() & (test["Title"] == title_group)
    test.loc[missing_for_group, 'Age'] = fill_age

**Fare**

In [33]:
# Show the test rows whose Fare is missing.
test[test["Fare"].isna()]

In [34]:
# Mean training fare per passenger class. Fare is selected before aggregating
# because averaging the whole frame would also try to average the text columns
# (Sex, Embarked, Title), which raises on pandas >= 2.0.
train.groupby("Pclass")["Fare"].mean()

In [35]:
# Fill missing test fares with the mean training fare of Pclass 3.
# NOTE(review): class 3 is hard-coded - presumably the class of the row(s)
# shown by the missing-fare check; confirm.
# Fare is selected before aggregating so text columns are never averaged, and
# the result is assigned back because an inplace fillna on a column selection
# is not guaranteed to update the parent frame.
third_class_mean_fare = train.groupby("Pclass")["Fare"].mean().loc[3]
test['Fare'] = test['Fare'].fillna(third_class_mean_fare)

**Embarked**

In [36]:
# Replace missing Embarked values in both frames with the most frequent
# embarkation value seen in the training data.
most_common_port = train['Embarked'].mode()[0]
train.loc[train['Embarked'].isnull(), 'Embarked'] = most_common_port
test.loc[test['Embarked'].isnull(), 'Embarked'] = most_common_port

**Pclass**

Pclass has a negative correlation with the target, so the labels are reversed (1 ↔ 3) to make the correlation positive.

In [37]:
# Reverse the class labels (1 <-> 3, 2 unchanged).
# The result is assigned back: an inplace replace on a column selection acts on
# a temporary object and leaves the frame untouched under pandas copy-on-write.
# NOTE: running this cell a second time flips the labels back.
train["Pclass"] = train["Pclass"].replace([1,2,3], [3,2,1])
test["Pclass"] = test["Pclass"].replace([1,2,3], [3,2,1])

In [38]:
# List the column names at this point.
train.columns

In [39]:
# Encode Sex as a binary indicator: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
train['Sex'] = train['Sex'].replace(sex_codes)
test['Sex'] = test['Sex'].replace(sex_codes)

In [40]:
def encode_and_bind(original_dataframe, features_to_encode):
    """Append one-hot indicator columns for the given feature(s).

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame holding the column(s) to encode. It is not modified.
    features_to_encode : str or iterable of str
        A single column name, or several column names, to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column (the encoded source columns are
        kept) followed by the indicator columns produced by ``pd.get_dummies``.
    """
    # Accept a single name as well as a collection of names; a bare string
    # must be wrapped so the selection below always yields a DataFrame.
    if isinstance(features_to_encode, str):
        selected = [features_to_encode]
    else:
        selected = list(features_to_encode)
    dummies = pd.get_dummies(original_dataframe[selected])
    return pd.concat([original_dataframe, dummies], axis=1)

# Add one-hot Title indicator columns to both frames.
train, test = (encode_and_bind(frame, 'Title') for frame in (train, test))

In [41]:
# Add one-hot Embarked indicator columns to both frames.
test, train = (encode_and_bind(frame, 'Embarked') for frame in (test, train))

In [62]:
# Convert Pclass to text in both frames, then add its one-hot indicator columns.
train = train.astype({'Pclass': str})
test = test.astype({'Pclass': str})
test, train = (encode_and_bind(frame, 'Pclass') for frame in (test, train))

In [63]:
# List the column names after one-hot encoding.
train.columns

In [43]:
# The original categorical columns are now represented by indicator columns,
# so remove them from both frames.
encoded_sources = ['Embarked', 'Title', 'Pclass']
train = train.drop(columns=encoded_sources)
test = test.drop(columns=encoded_sources)

In [66]:
# List the remaining column names.
train.columns

In [None]:
# Family = number of siblings/spouses (SibSp) plus parents/children (Parch).
for frame in (train, test):
    frame['Family'] = frame['SibSp'] + frame['Parch']

In [47]:
# Summary statistics of Fare.
train['Fare'].describe()

In [48]:
# Bucket Fare into four ordered bands: (-1, 7], (7, 14], (14, 31] and (31, inf),
# labelled 0 to 3.
bins = [-1, 7, 14, 31, np.inf]
labels = [0, 1, 2, 3]
for frame in (train, test):
    frame['Fare_bins'] = pd.cut(frame['Fare'], bins=bins, labels=labels)

In [73]:
# Convert the fare-band labels to plain integers in both frames.
for frame in (train, test):
    frame['Fare_bins'] = frame['Fare_bins'].astype(int)

In [49]:
# Remove the SibSp and Parch columns from both frames.
family_sources = ['SibSp', 'Parch']
train = train.drop(columns=family_sources)
test = test.drop(columns=family_sources)

In [75]:
# Feature matrix: everything except the row identifier and the target.
X_train = train.drop(['PassengerId','Survived'],axis=1)
# Target as a 1-D Series. A one-column DataFrame makes scikit-learn estimators
# emit a column-vector warning and reshape the target on every fit.
y_train = train['Survived']
# Work on a copy so `test` keeps its PassengerId column.
# NOTE: X_test still contains PassengerId here; it is removed in a later cell
# before prediction.
X_test = test.copy()

In [76]:
# Standardise the continuous features.
std = StandardScaler()
cols = ['Age','Fare']
# Learn the mean and standard deviation from the training data only...
X_train[cols] = std.fit_transform(X_train[cols])
# ...and reuse those statistics for the test data. Re-fitting on the test set
# would scale train and test differently, so the model would be given inputs
# on a different scale from the one it was trained on.
X_test[cols] = std.transform(X_test[cols])

In [77]:
# Baseline comparison: 5-fold cross-validated ROC AUC for several untuned models.
# Models with internal randomness are given a fixed seed so the comparison is
# repeatable between runs.
classifiers = {
    "KNN": KNeighborsClassifier(),
    "LR": LogisticRegression(max_iter=1000),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "XGB": XGBClassifier(random_state=42),
    "LGBM": LGBMClassifier(random_state=42)
}

results = dict()
for name, clf in classifiers.items():
    cv_results = cross_validate(clf, X_train, y_train, cv=5, scoring='roc_auc')
    # Keep the mean of the per-fold test scores for each model.
    results[name] = cv_results['test_score'].mean()

results

**Random Forest**

In [94]:
# Random forest with hand-picked regularisation settings.
# random_state pins the bootstrap and feature sampling so the fitted model and
# the metrics reported for it are repeatable.
classifier = RandomForestClassifier(n_estimators = 100,
                                   criterion='gini',
                                   min_samples_split=10,
                                   min_samples_leaf=7,
                                   random_state=42)
classifier.fit(X_train,y_train)

In [95]:
# NOTE: every metric below is computed on the rows the model was fitted on, so
# it is an in-sample figure, not an estimate of generalisation performance.
y_train_hat = classifier.predict(X_train)
# Probability of the positive class. ROC AUC measures ranking quality and needs
# continuous scores; feeding it thresholded 0/1 labels collapses the curve to a
# single operating point.
y_train_score = classifier.predict_proba(X_train)[:, 1]

print(classifier)
# The model reported here is the random forest fitted above (the header used to
# say 'Logistic Regression').
print('Random Forest')
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_train, y_train_score))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_train, y_train_hat))

In [105]:
# Hyper-parameter candidates for the random-forest search.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 800, num = 5)]
# 'auto' has been removed from scikit-learn (for classifiers it was an alias of
# 'sqrt'), so listing it makes those candidates fail to fit on current versions.
max_features = ['sqrt', 'log2']
max_depth = [4,5,6,7]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4,7]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
# Seed the forest as well as the search so that both the sampled candidates and
# their cross-validation scores are repeatable.
rf = RandomForestClassifier(random_state=42)
# 10 randomly sampled candidates, each scored with 3-fold cross-validation.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

In [106]:
# NOTE: every metric below is computed on the rows the search was fitted on, so
# it is an in-sample figure, not an estimate of generalisation performance.
y_train_hat = rf_random.predict(X_train)
# Probability of the positive class. ROC AUC measures ranking quality and needs
# continuous scores rather than thresholded 0/1 labels.
y_train_score = rf_random.predict_proba(X_train)[:, 1]

print(rf_random)
print('Random Forest')
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_train, y_train_score))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_train, y_train_hat))

In [111]:
#feature importance
feat_imp = pd.DataFrame(rf_random.best_estimator_.feature_importances_,index=X_train.columns, columns=['Importance'])
feat_imp.sort_values(by='Importance',ascending=False,inplace=True)
feat_imp.plot(kind='bar',figsize=(8,6))

In [112]:
#gain curve
import scikitplot as skplt
predict_prob = rf_random.predict_proba(X_train)
skplt.metrics.plot_cumulative_gain(y_train,predict_prob)
plt.show()

**XGBoost**

In [128]:
# Hyper-parameter candidates for the XGBoost search.
# The sampling fractions are written out explicitly: np.arange with a float
# step suffers from rounding (it yields 0.6000000000000001 here) and its length
# can vary by one depending on that rounding.
params = {'max_depth': [3, 4, 5],
              'learning_rate': [0.01, 0.05,0.1, 0.2],
              'subsample': [0.4, 0.6, 0.8],
              'colsample_bytree': [0.4, 0.6, 0.8],
              'colsample_bylevel': [0.4, 0.6, 0.8],
              'n_estimators': [100, 250, 500, 750]
              }

# Seed both the booster and the search: without random_state every run samples
# different candidates, so the selected model (and the submission built from
# it) changes from run to run.
xgbclf = XGBClassifier(tree_method='hist', random_state=42)
# 10 randomly sampled candidates scored by accuracy with the default CV split.
clf = RandomizedSearchCV(estimator=xgbclf,
                         param_distributions=params,
                         scoring='accuracy',
                         n_iter=10,
                         random_state=42,
                         n_jobs=-1)

clf.fit(X_train, y_train)

In [129]:
# Hyper-parameter combination selected by the search.
clf.best_params_

In [130]:
# NOTE: every metric below is computed on the rows the search was fitted on, so
# it is an in-sample figure, not an estimate of generalisation performance.
y_train_hat = clf.predict(X_train)
# Probability of the positive class. ROC AUC measures ranking quality and needs
# continuous scores rather than thresholded 0/1 labels.
y_train_score = clf.predict_proba(X_train)[:, 1]

print(clf)
print('Xgboost')
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_train, y_train_score))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_train, y_train_hat))

In [58]:
# PassengerId was left out of the training matrix, so remove it from the test
# matrix before predicting. errors='ignore' keeps the cell safe to re-run: a
# second execution would otherwise raise KeyError because the column is gone.
X_test = X_test.drop(columns=['PassengerId'], errors='ignore')
# Show the remaining feature columns (as the last expression so the notebook
# actually displays them).
X_test.columns

In [131]:
# Class predictions for the test rows from the fitted XGBoost search object.
y_pred = clf.predict(X_test)

In [132]:
# Start from the sample submission so the output has the expected layout.
submission = pd.read_csv(r'/kaggle/input/titanic/gender_submission.csv')
# The predictions are assigned by position, so check that the template rows are
# in the same order as the test rows before overwriting the label column.
assert submission['PassengerId'].tolist() == test['PassengerId'].tolist(), \
    "sample submission rows are not aligned with the test set"
submission['Survived'] = y_pred

In [134]:
# Write the submission file to the Kaggle working directory without the index column.
submission.to_csv(r'/kaggle/working/titanic_prediction_xgb_gridsearch1.csv',index=False)