In [None]:
%%capture 
!pip install pandas
import os
import warnings
from pathlib import Path

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import missingno as msno
import torch
from fastai import *
from fastai.tabular import *
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Locations of the competition CSV files.
path = Path('/kaggle/input/titanic')
trpath, cvpath = path/'train.csv', path/'test.csv'

# Raw frames exactly as read from disk.
df_train_raw, df_test_raw = (pd.read_csv(p) for p in (trpath, cvpath))

# Independent working copies for exploration (copy() is deep by default),
# so the raw frames stay pristine.
df_train = df_train_raw.copy()
df_test = df_test_raw.copy()

# Both raw frames in one list so they can be cleaned with a single call.
data_cleaner = [df_train_raw, df_test_raw]

In [None]:
# First ten rows of the training working copy.
df_train.head(10)

In [None]:
# Show the Python type of each column's value in the row labelled 1.
varnames = df_train.columns.tolist()
for name in varnames:
    print(f"{name}: ", type(df_train.loc[1, name]))

In [None]:
# Number of missing values per column.
df_train.isna().sum()

In [None]:
# Visualise where values are missing in the training frame (missingno matrix plot).
msno.matrix(df_train)

In [None]:
# Per-column completeness of the test frame (missingno bar chart).
msno.bar(df_test)

In [None]:
# Absolute Pearson correlation of every numeric column with Age.
# The numeric columns are selected explicitly: recent pandas versions no longer
# drop string columns (Name, Sex, Ticket, ...) silently inside corr() and raise
# instead, while older versions give the same result either way.
df_train.select_dtypes(include=['number', 'bool']).corr(method='pearson')['Age'].abs()

In [None]:
def _title_from_name(names):
    """Return the text between the first comma and the following period of each name, stripped."""
    after_comma = names.str.split(',',expand = True)[1]
    return after_comma.str.split('.',expand = True)[0].str.strip()


df_train['Title'] = _title_from_name(df_train['Name'])

# Re-check the value types now that the Title column exists.
varnames = df_train.columns.tolist()
for name in varnames:
    print(f"{name}: ", type(df_train.loc[1, name]))

print(list(df_train['Title'].unique()))
df_test['Title'] = _title_from_name(df_test['Name'])
df_test['Title'].unique()

In [None]:
def new_titles(df):
    """Build a mapping from the raw titles found in ``df['Title']`` to four coarse groups.

    Groups are ``'Mrs'`` (female titles), ``'Mr'`` (male titles), ``'Notable'``
    (nobility, military, clergy, doctors) and ``'Master'``. The group names are
    themselves members of their own group, so applying the mapping twice is
    harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``'Title'`` column.

    Returns
    -------
    dict
        ``{raw_title: group}`` for every distinct recognised title in the
        column. Titles that belong to no group (including NaN) are left out,
        so ``Series.replace(mapping)`` keeps them unchanged instead of the
        function failing with an IndexError.
    """
    assert 'Title' in df.columns
    groups = {
        'Mrs': ('Mrs', 'Miss', 'Ms', 'Mlle', 'Mme', 'Dona'),
        'Mr': ('Mr', 'Don'),
        'Notable': ('Jonkheer', 'the Countess', 'Lady', 'Sir', 'Major',
                    'Col', 'Capt', 'Dr', 'Rev', 'Notable'),
        # Must be a tuple: testing membership against the bare string 'Master'
        # is a substring test, which matched '' and fragments such as 'Mas'.
        'Master': ('Master',),
    }
    lookup = {raw: group for group, raws in groups.items() for raw in raws}
    return {title: lookup[title] for title in df['Title'].unique() if title in lookup}


# Build the raw-title -> group mapping from the titles present in the training
# frame, then overwrite the Title column with the grouped values.
new_titles_dict = new_titles(df_train)
df_train['Title'] = df_train['Title'].replace(new_titles_dict)

In [None]:
# Reduce each cabin to its leading deck letter; passengers without a cabin are
# first labelled 'Missing' and therefore end up as 'M'.
# fillna() replaces the former chained-indexing assignment (which triggers
# SettingWithCopyWarning and does nothing under copy-on-write), and
# str.extract() returns the captured letter directly.
df_train['Cabin'] = (
    df_train['Cabin']
    .fillna('Missing')
    .str.extract(r'(^[A-Z])', expand=False)
)

In [None]:
def _sample_within_class(df, var):
    """Fill missing ``df[var]`` in place with random observed values from the same Pclass."""
    missing = df[var].isnull().values
    if not missing.any():
        return
    # Pools are built from observed rows only, before anything is filled in.
    pools = {pclass: grp.values for pclass, grp in df[~missing].groupby('Pclass')[var]}
    classes = df['Pclass'].values
    column = df[var].copy()
    # Work by position so the result does not depend on the index labels.
    for pos in np.flatnonzero(missing):
        column.iloc[pos] = np.random.choice(pools[classes[pos]])
    df[var] = column


def df_fill(datasets, mode):
    """Return imputed copies of ``datasets``; the inputs are not modified.

    For every frame:

    * ``Age`` and ``Fare`` gaps are filled per ``Pclass``, either with the
      class median (``mode='median'``) or with a random observed value from
      the same class (``mode='sampling'``).
    * ``Embarked`` gaps are always filled by random sampling within ``Pclass``.
    * ``Cabin`` is reduced to its leading capital letter, with missing cabins
      labelled ``'Missing'`` first (so they become ``'M'``).

    Parameters
    ----------
    datasets : iterable of pandas.DataFrame
        Frames with ``Pclass``, ``Age``, ``Fare``, ``Embarked`` and ``Cabin`` columns.
    mode : {'median', 'sampling'}
        Strategy used for ``Age`` and ``Fare``.

    Returns
    -------
    list of pandas.DataFrame
        One filled deep copy per input frame, in the same order.

    Notes
    -----
    Reseeds NumPy's global random generator with 2 on every call so the
    sampled values are reproducible.
    """
    assert mode =='median' or mode =='sampling'
    np.random.seed(2)
    filled = []
    for d in datasets:
        df = d.copy(deep = True)
        for var in ('Age', 'Fare'):
            if not df[var].isnull().any():
                continue
            if mode == 'median':
                class_median = df.groupby('Pclass')[var].transform('median')
                df[var] = df[var].fillna(class_median)
            else:
                _sample_within_class(df, var)
        # Embarked
        _sample_within_class(df, 'Embarked')
        # Cabin
        df['Cabin'] = df['Cabin'].fillna('Missing').str.extract(r'(^[A-Z])', expand=False)
        filled.append(df)
    return filled

# Median-imputed copies of the raw train/test frames (df_fill works on deep copies).
# NOTE(review): data_clean is not referenced again in the visible cells — confirm it is still needed.
data_clean = df_fill(data_cleaner,'median')

In [None]:
def prepare_data(datasets):
    """Return feature-engineered deep copies of ``datasets``.

    Each copy gains a ``'Family onboard'`` column (``Parch + SibSp``) and a
    grouped ``'Title'`` column derived from ``'Name'``, and loses the
    ``PassengerId``, ``Name`` and ``Ticket`` columns. The inputs are untouched.
    """
    unused_columns = ['PassengerId','Name','Ticket']
    prepared = []
    for source in datasets:
        frame = source.copy(deep = True)
        frame['Family onboard'] = frame['Parch'] + frame['SibSp']
        # The title is the text between the comma and the first period of the name.
        after_comma = frame['Name'].str.split(',',expand = True)[1]
        frame['Title'] = after_comma.str.split('.',expand = True)[0].str.strip()
        frame['Title'] = frame['Title'].replace(new_titles(frame))
        prepared.append(frame.drop(columns = unused_columns))
    return prepared
        

In [None]:
# Impute by within-class sampling, then engineer features for both frames.
train, test = prepare_data(df_fill(data_cleaner, mode='sampling'))

# Confirm that no missing values are left in either frame.
for _heading, _frame in (("Training data", train), ("Test data", test)):
    print(_heading)
    print(_frame.isnull().sum())

In [None]:
def corr_matrix(x,y, quant = None):
    """Count observations per (quantile bin of ``x``, integer value of ``y``).

    Parameters
    ----------
    x : pandas.Series
        Numeric variable to be binned by its own quantiles.
    y : pandas.Series
        Integer-coded variable; its values are used as indices ``0..max(y)``.
    quant : sequence of float, optional
        Quantile levels that serve as bin edges. Defaults to
        ``[0, 0.25, 0.5, 0.75, 1]`` when omitted or empty.

    Returns
    -------
    counts : numpy.ndarray
        Array of shape ``(max(y) + 1, number of bins)``.
    x_quants : pandas.Series
        The quantile values of ``x`` used as edges.

    Notes
    -----
    Bin ``k`` holds values in ``(edge[k], edge[k+1]]``; values equal to the
    lowest edge are counted in the first bin (they used to wrap around into
    the last one). Rows where ``x`` is NaN or above the last edge, or where
    ``y`` is NaN, are skipped. Rows are processed by position, so the index
    labels of ``x`` and ``y`` do not matter.
    """
    if quant is None or np.size(quant) == 0 or (np.isscalar(quant) and not quant):
        quant = [0, 0.25, 0.5, 0.75, 1]
    x_quants = x.quantile(quant)
    n_bins = x_quants.shape[0] - 1
    out = np.zeros((n_bins, int(y.max() + 1)))

    xv = np.asarray(x, dtype=float)
    yv = np.asarray(y, dtype=float)
    edges = np.asarray(x_quants, dtype=float)

    # below[i, j] is True when x[i] <= edge[j]; the first True marks the upper edge.
    below = xv[:, None] <= edges[None, :]
    usable = below.any(axis=1) & ~np.isnan(yv)
    # Clamp at 0 so the minimum of x lands in the first bin instead of index -1.
    bins = np.maximum(below.argmax(axis=1) - 1, 0)
    np.add.at(out, (bins[usable], yv[usable].astype(int)), 1)
    return out.T,x_quants

def plot_corr_matrix(x,quants,fig, ax, **kwargs):
    """Draw a count matrix as a pseudocolor plot on ``ax`` and attach a colorbar.

    Parameters
    ----------
    x : 2-D array
        Matrix to draw; must have one column per bin, i.e. ``len(quants) - 1``.
    quants : 1-D array-like with ``.shape``
        Bin edges; used as the x tick labels.
    fig, ax : matplotlib Figure and Axes
        Target figure (receives the colorbar) and axes (receives the plot).
    **kwargs
        Optional ``cmap`` (defaults to ``'Blues'`` when absent or falsy),
        ``xlabel`` and ``ylabel``. Each label is applied only when supplied;
        the title ``"<xlabel> vs <ylabel>"`` is set only when both are.

    Returns
    -------
    tuple
        ``(fig, ax)``, the objects that were passed in.
    """
    assert x.shape[1] == quants.shape[0]-1
    # .get() so that omitting cmap falls back to the default instead of raising KeyError.
    cmap = kwargs.get('cmap') or 'Blues'
    if 'xlabel' in kwargs:
        ax.set_xlabel(kwargs['xlabel'])
    if 'ylabel' in kwargs:
        ax.set_ylabel(kwargs['ylabel'])
    ticks = np.arange(quants.shape[0])
    ax.set_xticks(ticks)
    ax.set_xticklabels(list(quants))
    # Both keys are tested explicitly; `'xlabel' and 'ylabel' in kwargs` only checked 'ylabel'.
    if 'xlabel' in kwargs and 'ylabel' in kwargs:
        ax.title.set_text(f"{kwargs['xlabel']} vs {kwargs['ylabel']}")
    p = ax.pcolor(x,cmap = cmap)
    fig.colorbar(p,ax = ax)
    return fig,ax
    
    
def gen_corr_matrix(*args,quant = None,cmap = 'YlOrBr', figsize = None):
    """Plot count matrices of the first series against each of the others, side by side.

    Parameters
    ----------
    *args : pandas.Series
        At least two series. ``args[0]`` is binned by quantiles and compared
        with every following series; each series' ``.name`` labels the axes.
    quant : sequence of float, optional
        Quantile levels forwarded to ``corr_matrix``.
    cmap : str, optional
        Colormap forwarded to ``plot_corr_matrix``.
    figsize : tuple of float, optional
        Figure size in inches forwarded to ``plt.subplots``; ``None`` keeps
        matplotlib's default. This replaces the former ``fig.figsize = (800, 800)``
        assignment, which only set an unused attribute and never resized anything.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    totalvars = len(args)
    assert totalvars>1
    n_panels = totalvars - 1

    fig,axs = plt.subplots(1, n_panels, squeeze=False, figsize=figsize)
    fig.subplots_adjust(wspace=0.5)
    fig.suptitle("Correlation Matrix" if n_panels == 1 else "Correlation Matrices")

    base = args[0]
    for ax, other in zip(axs[0], args[1:]):
        counts, edges = corr_matrix(base, other, quant)
        plot_corr_matrix(counts, edges,
                         fig,
                         ax,
                         cmap = cmap,
                         xlabel = base.name,
                         ylabel = other.name)
    plt.show()

In [None]:
# NOTE(review): no visible cell defines `data`, so on a fresh kernel this cell
# failed with NameError. The DataBunch below is a reconstruction from the
# `train` / `test` frames returned by prepare_data(); the categorical/continuous
# split, the 20% validation holdout and its seed are reviewer choices — adjust
# them if the original setup differed.
dep_var = 'Survived'
cat_names = ['Pclass', 'Sex', 'Cabin', 'Embarked', 'Title']
cont_names = ['Age', 'SibSp', 'Parch', 'Fare', 'Family onboard']
procs = [FillMissing, Categorify, Normalize]

# The test frame has no label column; it is attached unlabeled via add_test().
test_list = TabularList.from_df(test, cat_names=cat_names, cont_names=cont_names)
data = (TabularList.from_df(train, path='.', cat_names=cat_names,
                            cont_names=cont_names, procs=procs)
        .split_by_rand_pct(valid_pct=0.2, seed=42)
        .label_from_df(cols=dep_var)
        .add_test(test_list)
        .databunch())

# Fully connected tabular model with five hidden layers and embedding dropout.
learn = tabular_learner(data, 
                        layers=[1000,500, 200,50, 15],
                        metrics=accuracy,
                        emb_drop=0.1
                       )


In [None]:
# The bare `torch.device('cuda')` expression that used to open this cell only
# constructed a device object and discarded it, so it never moved the model or
# the data anywhere; it has been removed to avoid suggesting otherwise.
# First training run: 5 epochs of one-cycle training with a peak LR of 2.5e-2.
learn.fit_one_cycle(5, 2.5e-2)

In [None]:
# Save the learner after the first training run under the name 'stage1'.
learn.export('stage1')

In [None]:
# Run the learning-rate finder on the current learner.
# NOTE(review): the result is not plotted in the visible cells
# (e.g. learn.recorder.plot()) — confirm how the rates below were chosen.
learn.lr_find()

In [None]:
# Second training run: unfreeze, then 10 one-cycle epochs with max_lr=slice(2e-4).
learn.unfreeze()
learn.fit_one_cycle(10, max_lr=slice(2e-4))

In [None]:
# Third training run: 2 more one-cycle epochs with max_lr=slice(5e-2).
# NOTE(review): this rate is far higher than in the previous cell — confirm it is intentional.
learn.unfreeze()
learn.fit_one_cycle(2, max_lr=slice(5e-2))

In [None]:
# Class scores for the test set; any further return values are not needed.
predictions, *_ = learn.get_preds(DatasetType.Test)
# Predicted class = index of the highest score in each row.
labels = predictions.argmax(dim=1).numpy()
# Pair every prediction with the passenger id from the unmodified test frame.
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': labels,
})

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission-fastai.csv', index=False)