In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import scipy.stats as stat
from sklearn.preprocessing import StandardScaler
# Show every column when a wide frame is displayed. Call the public
# pd.set_option directly instead of going through the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import preprocessing

In [None]:
# Load the training CSV from the Kaggle input directory into `df`.
# NOTE(review): the path is absolute and only resolves inside a Kaggle kernel.
df=pd.read_csv("/kaggle/input/titanic/train.csv")

In [None]:

# Preview the first rows of the raw frame.
df.head()

In [None]:
# Column dtypes; used below to separate object (categorical) columns from the rest.
df.dtypes

# Categorical Features

In [None]:
# Names of all columns whose dtype is `object`.
categorical_features = [
    column for column, dtype in df.dtypes.items() if dtype == 'object'
]
categorical_features

# Numerical Features

In [None]:
# Names of all columns whose dtype is anything other than `object`.
numerical_features = [
    column for column, dtype in df.dtypes.items() if dtype != 'object'
]
numerical_features

# Creating some new variables


In [None]:
# Extracting titles from names.
# Assumes names follow "Surname, Title. Given names" (TODO confirm against the data).
# The title is taken as the text between the first comma and the following period,
# period included (e.g. "Mr."). Taking the second whitespace-separated token instead
# returns part of the surname whenever the surname has more than one word.
title_series = (
    df['Name']
    .astype(str)
    .str.extract(r',\s*([^.,]+\.)', expand=False)
    .str.strip()
    .fillna('Unknown')  # names that do not match the expected pattern
)
list1 = title_series.tolist()
    

In [None]:
# Attach the extracted titles as a new column and show one row to check the result.
df['Title']=list1
df.head(1)

In [None]:
# Family size = siblings/spouses + parents/children + 1.
siblings_spouses = df['SibSp']
parents_children = df['Parch']
df['Family_size'] = siblings_spouses.add(parents_children).add(1)
df.head(1)

In [None]:
# Age groups: below 18 -> 'teen-age', below 60 -> 'adult', otherwise 'old'.
# Missing ages get their own 'unknown' label. Comparisons against NaN are always
# False, so a plain if/else chain would silently label every missing age as 'old'.
age = df['Age']
df['Age_group'] = np.select(
    [age.isna(), age < 18, age < 60],   # first matching condition wins
    ['unknown', 'teen-age', 'adult'],
    default='old',
)
df.head(1)

# Missing values

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Fill missing Embarked values with the most frequent value.
mode = df.Embarked.value_counts().index[0]
# Assign the result back: `df[col].fillna(..., inplace=True)` operates on a column
# selection and can leave `df` unchanged under pandas copy-on-write.
df['Embarked'] = df['Embarked'].fillna(mode)

In [None]:
# For the Age column we use a random-sample imputer.
def impute_age(df, variable, random_state=0):
    """Fill missing values of a column with random draws from its observed values.

    The imputed series is stored in a new column named ``variable + 'random'``;
    the original column is left as it is. ``df`` is modified in place and
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to impute.
    variable : str
        Name of the column with missing values.
    random_state : int, default 0
        Seed passed to ``Series.sample`` so the draws are reproducible.

    Raises
    ------
    ValueError
        If the column has missing values but no observed value to draw from.
    """
    target = variable + 'random'
    missing_mask = df[variable].isnull()
    n_missing = int(missing_mask.sum())

    df[target] = df[variable]
    if n_missing == 0:
        return

    observed = df[variable].dropna()
    if observed.empty:
        raise ValueError(
            "cannot impute {!r}: the column has no observed values".format(variable)
        )

    # Draw without replacement when possible; fall back to sampling with
    # replacement only when there are more gaps than observed values.
    random_sample = observed.sample(
        n_missing,
        replace=n_missing > len(observed),
        random_state=random_state,
    )
    # Re-index the draws onto the missing rows so the assignment aligns by label.
    random_sample.index = df.index[missing_mask]
    df.loc[missing_mask, target] = random_sample

In [None]:
# Adds an 'Agerandom' column to df: Age with its missing entries filled by random draws.
impute_age(df,'Age')

In [None]:
# Overlay the density of the imputed ages and the original ages on one axis
# to compare the two distributions.
fig, ax = plt.subplots()
for column, line_style in (('Agerandom', {}), ('Age', {'color': 'green'})):
    df[column].plot(kind='kde', ax=ax, **line_style)
handles, names = ax.get_legend_handles_labels()
ax.legend(handles, names, loc='best')


#### This figure shows that imputing the missing values with random samples has not changed the distribution of the data.


In [None]:
# Reduce every cabin entry to the first character of its first token.
# The column is cast to str first, so a missing cabin is presumably the text
# 'nan' and contributes the letter 'n' (verify with value_counts).
df['Cabin'] = df["Cabin"].astype(str)
list4 = [str(cabin.split()[0][0]) for cabin in df['Cabin']]
    

In [None]:
# Store the extracted cabin letters as a new column.
df['Cabin_new']=list4

In [None]:
# Frequency of each cabin letter.
df.Cabin_new.value_counts()

In [None]:
# Since 'n' indicates a missing cabin, replace it with the label 'missing'.
# Assign the result back: an inplace replace on a column selection can leave
# `df` unchanged under pandas copy-on-write.
df['Cabin_new'] = df['Cabin_new'].replace('n', 'missing')

# Visualization

In [None]:
# Survived: bar chart of the value counts, followed by the counts themselves.
survived_counts = df.Survived.value_counts()
survived_counts.plot(kind='bar')
print(survived_counts)

In [None]:
# Sex: bar chart of the value counts, followed by the counts themselves.
sex_counts = df.Sex.value_counts()
sex_counts.plot(kind='bar')
print(sex_counts)

In [None]:
# Pclass: bar chart of the value counts, followed by the counts themselves.
pclass_counts = df.Pclass.value_counts()
pclass_counts.plot(kind='bar')
print(pclass_counts)

In [None]:
# Cabin_new: bar chart of the value counts, followed by the counts themselves.
cabin_counts = df.Cabin_new.value_counts()
cabin_counts.plot(kind='bar')
print(cabin_counts)

In [None]:
# Embarked: bar chart of the value counts, followed by the counts themselves.
embarked_counts = df.Embarked.value_counts()
embarked_counts.plot(kind='bar')
print(embarked_counts)

In [None]:
# Title: bar chart of all value counts; only the ten most frequent are printed.
title_frequency = df.Title.value_counts()
title_frequency.plot(kind='bar')
print(title_frequency.head(10))

In [None]:
# Age_group: bar chart of the value counts, followed by the counts themselves.
age_group_counts = df.Age_group.value_counts()
age_group_counts.plot(kind='bar')
print(age_group_counts)

In [None]:
# Distribution of the imputed ages. sns.distplot is deprecated (since seaborn 0.11);
# a density-scaled histplot with a KDE overlay is the documented replacement.
sns.histplot(df.Agerandom, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Distribution of Fare. sns.distplot is deprecated (since seaborn 0.11);
# a density-scaled histplot with a KDE overlay is the documented replacement.
sns.histplot(df.Fare, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Parch: bar chart of the value counts, followed by the counts themselves.
parch_counts = df.Parch.value_counts()
parch_counts.plot(kind='bar')
print(parch_counts)

In [None]:
# SibSp: bar chart of the value counts, followed by the counts themselves.
sibsp_counts = df.SibSp.value_counts()
sibsp_counts.plot(kind='bar')
print(sibsp_counts)

In [None]:
# Age_group counts again.
# NOTE(review): this repeats the Age_group plot shown earlier in this section.
age_group_counts_again = df.Age_group.value_counts()
age_group_counts_again.plot(kind='bar')
print(age_group_counts_again)

# Bivariate Analysis

In [None]:
# Passenger counts per Sex, split by Survived.
sns.catplot(data=df, x='Sex', hue='Survived', kind='count')

In [None]:
# Passenger counts per Cabin_new letter, split by Survived.
sns.catplot(data=df, x='Cabin_new', hue='Survived', kind='count')

In [None]:
# Passenger counts per Age_group, split by Survived.
sns.catplot(data=df, x='Age_group', hue='Survived', kind='count')

In [None]:
# Passenger counts per Embarked value, split by Survived.
sns.catplot(data=df, x='Embarked', hue='Survived', kind='count')

# Encoding

In [None]:
# Pclass: replace each class value with an integer code from a LabelEncoder.
label_encoder = preprocessing.LabelEncoder()
pclass_codes = label_encoder.fit_transform(df['Pclass'])
df['Pclass'] = pclass_codes

In [None]:
# Sex: one-hot encode; drop_first=True drops the first level so the remaining
# indicator column(s) are not redundant. Kept in a separate frame and joined later.
sex=pd.get_dummies(df['Sex'],drop_first=True)

In [None]:
# Embarked: one-hot encode with the first level dropped (drop_first=True).
# Kept in a separate frame and joined later.
Embarked=pd.get_dummies(df['Embarked'],drop_first=True)

In [None]:
# Titles
# There are many distinct titles, so only the four most frequent get their own
# indicator column; every other title is 0 in all four.
title_counts = df.Title.value_counts().sort_values(ascending=False)
most_frequent_titles = title_counts.head(4).index.tolist()
most_frequent_titles

In [None]:
# One 0/1 indicator column per frequent title, named after the title itself.
for title in most_frequent_titles:
    is_title = df['Title'] == title
    df[title] = is_title.astype(int)

In [None]:
# Age group: one-hot encode with the first level dropped (drop_first=True).
# Kept in a separate frame and joined later.
age_group=pd.get_dummies(df['Age_group'],drop_first=True)

In [None]:
# Cabin: one-hot encode the cabin letters with the first level dropped (drop_first=True).
# Kept in a separate frame and joined later.
cabin=pd.get_dummies(df['Cabin_new'],drop_first=True)


In [None]:
# Inspect one row before the raw columns are dropped.
df.head(1)

In [None]:
# Remove identifier/free-text columns and the raw columns that now have an
# encoded or imputed counterpart. Done in place, so re-running this cell on
# the same frame fails.
columns_to_drop = [
    'PassengerId', 'Name', 'Sex', 'Ticket', 'Embarked',
    'Cabin', 'Title', 'Age_group', 'Cabin_new', 'Age',
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Append the dummy-variable frames column-wise (axis=1) to the remaining columns.
df=pd.concat([df,age_group,cabin,Embarked,sex],axis=1)

In [None]:
# Preview the fully encoded frame.
df.head()

In [None]:
# (rows, columns) of the encoded frame.
df.shape

# Hyperparameter optimization

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Search space for the randomized hyperparameter search.

# Number of trees in the random forest: 100, 200, ..., 1500
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1500, num = 15)]
# Number of features to consider at every split.
# 'auto' is not used: current scikit-learn rejects it for random forests, and for
# classifiers it was only an alias of 'sqrt'.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Base estimator for the search; random_state is fixed so runs are repeatable.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 42)

In [None]:
# Randomized search: 100 sampled parameter settings, each scored with 5-fold
# cross-validation, using all available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Split the frame into features and target by column name instead of position,
# so the split does not depend on column order.
# NOTE(review): assumes the first column (what iloc[:, 0] selected) is 'Survived' — confirm.
y = df['Survived']
X = df.drop(columns=['Survived'])
# Show both shapes; only a cell's last expression is displayed, so a bare
# `X.shape` on an earlier line is never shown.
X.shape, y.shape

In [None]:
# Hold out 33% of the rows for evaluation; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Report the shape of every split.
shape_report = (
    ('the shape of x_train is {}', X_train),
    ('the shape of x_test is {}', X_test),
    ('the shape of y_train is {}', y_train),
    ('the shape of y_test is {}', y_test),
)
for template, part in shape_report:
    print(template.format(part.shape))

In [None]:
# Run the randomized search on the training split only.
rf_random.fit(X_train,y_train)

In [None]:
# Parameter setting with the best mean cross-validation score.
rf_random.best_params_

In [None]:
# Mean cross-validation score of the best parameter setting.
rf_random.best_score_

In [None]:
# Predict on the held-out split with the fitted search object.
pred=rf_random.predict(X_test)

In [None]:
# Confusion matrix on the held-out split (rows: true labels, columns: predictions).
confusion_matrix(y_test,pred)

In [None]:
# Fraction of held-out rows predicted correctly.
accuracy_score(y_test,pred)