# Titanic Solution using fastai

# 1. DATA UNDERSTANDING

## Load Libraries and Datasets

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import fastai
from fastai import *
from fastai.tabular.all import *
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file shipped under the Kaggle input directory,
# printing one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# Display the installed fastai version so results can be tied to a library release.
fastai.__version__

In [None]:
# Read the train and test CSVs of the "titanic-extended" dataset into DataFrames.
train_csv = '../input/titanic-extended/train.csv'
test_csv = '../input/titanic-extended/test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Dimensions of the training frame as (rows, columns).
df_train.shape

In [None]:
# Dimensions of the test frame as (rows, columns).
df_test.shape

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Column dtypes and non-null counts of the training data.
df_train.info()

# 2. EDA leveraging fastai

## Let's understand the Correlation between variables

In [None]:
import seaborn as sns
# Pairwise correlation is only defined for numeric columns. Select them
# explicitly: calling .corr() on a frame that still holds text columns
# (names, tickets, ...) raises on pandas >= 2.0 and was a deprecated silent
# drop before that.
correlation = df_train.select_dtypes(include='number').corr()
# NOTE(review): `plt` is not imported in this notebook directly; presumably it
# is provided by the `from fastai.tabular.all import *` star import - confirm.
plt.figure(figsize=(10, 8))
# Annotate each heatmap cell with its correlation coefficient.
sns.heatmap(correlation, annot=True)

In [None]:
# Count missing values per column in the training data.
df_train.isnull().sum()

In [None]:
# Preview the first rows of the training data.
df_train.head()

## Survivals - by GENDER

In [None]:
# Cross-tabulate the survival flag (rows) against passenger sex (columns)
# and plot the resulting counts as a bar chart.
survival_by_gender = pd.crosstab(index=df_train['Survived'], columns=df_train['Sex'])
survival_by_gender.plot(kind='bar')

# This means more FEMALE passengers survived than MALE passengers

## Survivals - By AGE

In [None]:
# Cross-tabulate age (rows) against the survival flag (columns) and plot
# the per-age counts of each outcome as line series.
survival_by_age = pd.crosstab(index=df_train['Age'], columns=df_train['Survived'])
survival_by_age.plot(kind='line')
# Alternative view (disabled): survival_by_age.plot.hist()

In [None]:
# Preview the first rows of the training data before encoding.
df_train.head()

In [None]:
# Preview the first rows of the test data before encoding.
df_test.head()

## Let's encode the GENDER variable

In [None]:
# Encode gender numerically (male -> 1, female -> 0) on the `Sex` column only.
# A frame-wide replace would also rewrite any other column that happens to
# contain the strings 'male'/'female', and relies on implicit dtype
# downcasting. Mapping the single column yields a numeric column directly.
# Values other than 'male'/'female' (none are expected) become NaN.
sex_codes = {'male': 1, 'female': 0}
df_train['Sex'] = df_train['Sex'].map(sex_codes)
df_test['Sex'] = df_test['Sex'].map(sex_codes)

## Let's drop the CABIN, WikiId and Body variables

In [None]:
# Remove the Cabin column from both the train and test frames.
df_train = df_train.drop(columns=['Cabin'])
df_test = df_test.drop(columns=['Cabin'])

In [None]:
# Remove the WikiId column from both the train and test frames.
df_train = df_train.drop(columns=['WikiId'])
df_test = df_test.drop(columns=['WikiId'])

In [None]:
# Remove the Body column from both the train and test frames.
df_train = df_train.drop(columns=['Body'])
df_test = df_test.drop(columns=['Body'])

In [None]:
# Re-count missing values per column after dropping Cabin, WikiId and Body.
df_train.isnull().sum()

In [None]:
# Preview the first rows of the cleaned training data.
df_train.head()

## Leverage fastai classes to get categorical, continuous variables

In [None]:
%time

# cont_names = Continuous variables in the dataset
# cat_names = Categorical variables in the dataset
procs = [Categorify, FillMissing, Normalize] 
splits = RandomSplitter(valid_pct = 0.21)(range_of(df_train))
cont_names, cat_names = cont_cat_split(df_train, 1, 'Survived')

In [None]:
# Columns that cont_cat_split classified as categorical.
cat_names

In [None]:
# Columns that cont_cat_split classified as continuous.
cont_names

In [None]:
# Wrap the training frame in a TabularPandas object: applies the preprocessing
# steps, registers the feature lists and the 'Survived' target, and attaches
# the train/validation split. Then show the first five processed rows.
to = TabularPandas(
    df_train,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names='Survived',
    splits=splits,
)
to.show(max_n=5)

In [None]:
# Build fastai DataLoaders from the processed TabularPandas object and
# display one batch from the validation loader as a sanity check.
dls = to.dataloaders()
dls.valid.show_batch()

# 3. MODELING

In [None]:
# Pull the processed features and target out of the TabularPandas object.
# Note: despite the `_test` suffix, x_test / y_test hold the *validation*
# split (to.valid), not the Kaggle test set.
x_train = to.train.xs
y_train = to.train.y
x_test = to.valid.xs
y_test = to.valid.y

# 3.1 Random Forest

In [None]:
%time

# We are just using Random Forest from sklearn
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf_classifier.fit(x_train, y_train)

# 3.2 Logistic Regression

In [None]:
# Fit sklearn's logistic regression (lbfgs solver, up to 5000 iterations)
# on the training split, then display the fitted estimator.
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(max_iter=5000, solver='lbfgs').fit(x_train, y_train)
lr_classifier

# 3.3 LDA (Linear Discriminant Analysis)

In [None]:
# Fit sklearn's Linear Discriminant Analysis classifier with default settings
# on the training split, then display the fitted estimator.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_classifier = LinearDiscriminantAnalysis().fit(x_train, y_train)
lda_classifier

# 4. MODEL EVALUATION

# 4.1 Random Forest Score

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the random forest on the held-out validation split.
y_pred = rf_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

# 4.2 Logistic Regression Score

In [None]:
# Accuracy of the logistic regression model on the held-out validation split.
y_pred = lr_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

# 4.3 LDA Score

In [None]:
# Accuracy of the LDA model on the held-out validation split.
y_pred = lda_classifier.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

# 5. GENERATE FINAL OUTCOME

In [None]:
# Apply the preprocessing that was fitted on the TRAINING data to the test set.
# Building a fresh TabularPandas on df_test would re-fit Categorify /
# FillMissing / Normalize on the test rows, giving category codes, fill values
# and normalisation statistics that differ from what the model was trained on
# (and extra columns such as 'Fare_na' that then have to be dropped by hand).

# The training-fitted FillMissing only has fill values for continuous columns
# that had gaps in the training data. Columns with gaps only in the test set
# are therefore imputed up front with the training median.
df_test_filled = df_test.copy()
for col in to.cont_names:
    if df_test_filled[col].isnull().any() and not df_train[col].isnull().any():
        df_test_filled[col] = df_test_filled[col].fillna(df_train[col].median())

# `to.new` reuses the fitted procs and feature lists; `process` applies them,
# so to_test.xs has the same columns, in the same order, as the training features.
to_test = to.new(df_test_filled)
to_test.process()

outcome = rf_classifier.predict(to_test.xs)
# Kaggle expects integer 0/1 labels keyed by PassengerId.
output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': outcome.astype(int)})
output.to_csv('./submission_titanic.csv', index=False)