# 1) Libraries & Toolkits 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import warnings

# Notebook-wide display settings: silence every warning and apply the seaborn
# theme with a 12x8 default figure and slightly enlarged fonts.
warnings.simplefilter('ignore')
sns.set(font_scale=1.2, rc={'figure.figsize': (12, 8)})

# 2) Importing Dataset and exploration

In [None]:
# Read the training split, report its (rows, columns) shape and preview it.
train_csv_path = '../input/titanic/train.csv'
train = pd.read_csv(train_csv_path)
print(train.shape)
train.head()

In [None]:
# Read the test split, report its (rows, columns) shape and preview it.
test_csv_path = '../input/titanic/test.csv'
test = pd.read_csv(test_csv_path)
print(test.shape)
test.head()

In [None]:
# Column dtypes and non-null counts of the training frame (shows which columns have gaps).
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame (shows which columns have gaps).
test.info()

In [None]:
# - Number of unique values per column of the training frame

unique_counts = train.nunique()
pd.DataFrame({'': unique_counts.index, 'Number of unique values': unique_counts.values})

In [None]:
# Summary statistics of the numeric training columns, one row per column.
train.describe().transpose()

# 3) Exploratory data analysis

In [None]:
# Percentage of male passengers per Survived value, rendered as strings.
male_survived = train.loc[train['Sex'] == 'male', 'Survived']
male_survived.value_counts(normalize=True).mul(100).round(0).astype(str) + ' %'

In [None]:
# Percentage of female passengers per Survived value, rendered as strings.
female_survived = train.loc[train['Sex'] == 'female', 'Survived']
female_survived.value_counts(normalize=True).mul(100).round(0).astype(str) + ' %'

In [None]:
# Passenger counts per Survived value, with one bar colour per Sex.
sns.countplot(
    x='Survived',
    hue='Sex',
    data=train,
    palette='Set2',
)

In [None]:
# Passenger counts per Survived value, with one bar colour per Pclass.
sns.countplot(
    x='Survived',
    hue='Pclass',
    data=train,
    palette='Set2',
)

In [None]:
# One count panel per Pclass: Survived on the x axis, bars coloured by Sex.
sns.catplot(
    kind='count',
    x='Survived',
    hue='Sex',
    col='Pclass',
    data=train,
    palette='Set2',
)

In [None]:
# Passenger counts per Embarked value, with one bar colour per Pclass.
sns.countplot(
    x='Embarked',
    hue='Pclass',
    data=train,
    palette='Set2',
)

In [None]:
# Kernel density estimate of passenger Age in the training frame.
# `fill=True` replaces `shade=True`: seaborn deprecated `shade` as an alias of
# `fill`, so the old spelling only keeps working while that alias exists.
sns.kdeplot(data=train, x='Age', fill=True, color='g')

# 4) Pre-processing & Feature Engineering

In [None]:
## Fill NaN values
# Every fill value is learned from the training split only and then reused for
# the test split, so a missing entry is replaced the same way in both frames
# and no test-set statistics leak into preprocessing.
age_fill = round(train['Age'].mean())                      # rounded mean age
embarked_fill = train['Embarked'].value_counts().idxmax()  # most frequent port
# Fare is continuous, so its most frequent exact value is an arbitrary fill;
# the median is used as the representative fare instead.
fare_fill = train['Fare'].median()

train['Age'] = train['Age'].fillna(age_fill)
test['Age'] = test['Age'].fillna(age_fill)
train['Embarked'] = train['Embarked'].fillna(embarked_fill)
test['Fare'] = test['Fare'].fillna(fare_fill)


In [None]:
def title(name):
    """Extract the honorific from a name written as 'Surname, Title. Given names'.

    Parameters
    ----------
    name : str
        Full name containing a comma before the title and a period after it.

    Returns
    -------
    str
        The text between the first comma and the following period, with
        surrounding whitespace removed (e.g. 'Mr', not ' Mr').

    Raises
    ------
    IndexError
        If ``name`` contains no comma.
    """
    # The piece after the comma starts with a space; strip it so the title is clean.
    return name.split(',')[1].split('.')[0].strip()

def age_categories(age):
    """Map a numeric age to a coarse age-group label.

    Upper bounds are inclusive: <=3 'Infant', <=14 'Child', <=28 'Youth',
    <=60 'Adult'; anything else (including a value that fails every
    comparison, such as NaN) is labelled 'Old Adult'.
    """
    upper_bounds = (
        (3, 'Infant'),
        (14, 'Child'),
        (28, 'Youth'),
        (60, 'Adult'),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return 'Old Adult'


In [None]:
# Add a Title column to both frames by running title() over every Name.
for frame in (train, test):
    frame['Title'] = frame['Name'].map(title)

In [None]:
# Alone is 'Yes' when SibSp + Parch equals zero and 'No' otherwise.
alone_labels = {True: 'Yes', False: 'No'}
train['Alone'] = (train['SibSp'] + train['Parch']).eq(0).map(alone_labels)
test['Alone'] = (test['SibSp'] + test['Parch']).eq(0).map(alone_labels)


In [None]:
# Bucket Age into labelled groups with age_categories() on both frames.
for frame in (train, test):
    frame['Age_Group'] = frame['Age'].map(age_categories)

# Remove the columns that are not used as features. PassengerId is dropped from
# `train` only; `test` keeps it.
train = train.drop(['Name', 'Cabin', 'PassengerId'], axis=1)
test = test.drop(['Name', 'Cabin'], axis=1)
train.head()

In [None]:
# Preview the first rows of the test frame in its current state.
test.head()

In [None]:
# Replace every non-numeric column with integer codes.
# A fresh encoder is fitted per column on the combined train + test values, so
# a given category gets the same integer in both frames. Fitting each frame on
# its own would assign different codes whenever the two frames do not contain
# exactly the same set of categories.
data = [train, test]
nums = []  # one {label: code} dictionary per encoded column, in column order

# select_dtypes replaces the `dataset.dtypes == np.object` mask: the `np.object`
# alias was removed from NumPy, so that comparison raises AttributeError there.
categorical_cols = train.select_dtypes(exclude='number').columns
for col in categorical_cols:
    lbl = LabelEncoder()
    lbl.fit(pd.concat([train[col], test[col]], ignore_index=True))
    for dataset in data:
        dataset[col] = lbl.transform(dataset[col])
    directory = {label: index for index, label in enumerate(lbl.classes_)}
    nums.append(directory)
nums[0]

In [None]:
# Show five randomly chosen training rows (no seed is passed, so the rows vary between runs).
train.sample(5)

In [None]:
# Show five randomly chosen test rows (no seed is passed, so the rows vary between runs).
test.sample(5)

# 5) Modeling

In [None]:
# Split the training frame into features and target, and build the matching
# feature matrix for the test frame.
x_train = train.drop(columns=["Survived"])
# 1-D Series rather than a one-column DataFrame: scikit-learn classifiers expect
# a target of shape (n_samples,) and warn about a column vector otherwise.
y_train = train["Survived"]
# Select exactly the training feature columns, in the same order, so the model
# sees the same layout at predict time; this also leaves PassengerId out.
x_test = test[x_train.columns].copy()

x_train.shape, y_train.shape, x_test.shape

In [None]:
# Fit a depth-limited random forest (fixed seed for repeatable results) and
# predict survival for the test passengers.
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

# np.ravel hands fit() a 1-D target whether y_train is a Series or a
# single-column DataFrame.
model = rfc.fit(x_train, np.ravel(y_train))
pred = model.predict(x_test)

# This score is measured on the same rows the model was fitted on, so it is
# training accuracy; the message previously called it test accuracy.
print("Accuracy on Training Data : {:,.2f}".format(rfc.score(x_train, y_train) * 100), '%')

In [None]:
# Two-column result table: each test PassengerId next to its predicted Survived value.
submission = test[["PassengerId"]].assign(Survived=pred)
submission

In [None]:
# submission.to_csv('submission.csv', index=False)