In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import string
from scipy import stats

%matplotlib inline

import warnings
def ignore_warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``.

    Accepts any positional/keyword arguments, does nothing, and returns None.
    """
    return None


# Monkey-patch the warnings module: every later call to warnings.warn(...)
# (e.g. from sklearn and seaborn) is discarded by the no-op above.
warnings.warn = ignore_warn
# Also register a filter that ignores DeprecationWarning.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [None]:
# Read the training split from the notebook-relative input directory and
# preview its first rows.
train = pd.read_csv(filepath_or_buffer="../input/titanic/train.csv")
train.head(n=5)

In [None]:
# Read the test split from the notebook-relative input directory and
# preview its first rows.
test = pd.read_csv(filepath_or_buffer="../input/titanic/test.csv")
test.head(n=5)

In [None]:
# Show the distinct values of the three columns of interest.
# A notebook cell only renders its LAST expression, so three separate
# `.unique()` statements would display only the final one. Collecting the
# results in a dict makes all three visible in a single output.
{column: train[column].unique() for column in ('Survived', 'Parch', 'Pclass')}

In [None]:
# Columns of `train` that contain at least one missing value, followed by
# the number of missing entries in each of them.
null_columns = train.columns[train.isna().any(axis=0)]
train.loc[:, null_columns].isna().sum()

In [None]:
# Columns of `test` that contain at least one missing value, followed by
# the number of missing entries in each of them.
null_columns = test.columns[test.isna().any(axis=0)]
test.loc[:, null_columns].isna().sum()

In [None]:
# Keep the training passenger ids aside before the column is removed from
# the feature frame.
train_pid = train.loc[:, 'PassengerId']

In [None]:
# Keep the test passenger ids aside; they are needed later to build the
# submission file.
test_pid = test.loc[:, 'PassengerId']

In [None]:
# Remove the identifier and the Cabin column from the training frame
# (mutates `train` in place).
train.drop(columns=['PassengerId', 'Cabin'], inplace=True)

In [None]:
# Remove the identifier and the Cabin column from the test frame
# (mutates `test` in place).
test.drop(columns=['PassengerId', 'Cabin'], inplace=True)

In [None]:
# Impute missing ages with the median age of passengers of the same sex.
#
# Two fixes compared with a `train['Age'][mask] = ...fillna(...std())` approach:
#   * chained indexing (`df[col][mask] = value`) may write to a temporary copy
#     and is a no-op under pandas copy-on-write; a single column assignment
#     is always applied to the frame itself;
#   * the standard deviation measures spread, not a typical value, so it is
#     not a meaningful replacement for a missing age; the per-sex median is.
# `groupby(...).transform('median')` returns a Series aligned with `train`'s
# index, so `fillna` picks the right group value for every missing row.
train['Age'] = train['Age'].fillna(train.groupby('Sex')['Age'].transform('median'))

In [None]:
# Impute missing ages in the test frame with the median age of test
# passengers of the same sex.
#
# Two fixes compared with a `test['Age'][mask] = ...fillna(...std())` approach:
#   * chained indexing (`df[col][mask] = value`) may write to a temporary copy
#     and is a no-op under pandas copy-on-write; a single column assignment
#     is always applied to the frame itself;
#   * the standard deviation measures spread, not a typical value, so it is
#     not a meaningful replacement for a missing age; the per-sex median is.
# `groupby(...).transform('median')` returns a Series aligned with `test`'s
# index, so `fillna` picks the right group value for every missing row.
test['Age'] = test['Age'].fillna(test.groupby('Sex')['Age'].transform('median'))

In [None]:
# Drop every training row that still has a missing value in any column
# (in place). NOTE: `train_pid` was captured before this step, so it is not
# shortened along with `train`.
train.dropna(axis=0, how='any', inplace=True)

In [None]:
# Fill missing fares in the test frame with the median fare.
# The standard deviation describes spread rather than a typical fare, so it
# is not a sensible imputation value; the median is a location statistic.
# Test rows cannot be dropped because every passenger needs a prediction.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

In [None]:
# Re-check `train` for missing values after imputation / row removal:
# columns that still contain nulls and how many each has.
null_columns = train.columns[train.isna().any(axis=0)]
train.loc[:, null_columns].isna().sum()

In [None]:
# Re-check `test` for missing values after imputation:
# columns that still contain nulls and how many each has.
null_columns = test.columns[test.isna().any(axis=0)]
test.loc[:, null_columns].isna().sum()

In [None]:
# Skewness of the training Age distribution (before any transform).
train.loc[:, 'Age'].skew()

In [None]:
# Skewness of the test Age distribution (before any transform).
test.loc[:, 'Age'].skew()

In [None]:
# Age distribution and Age-vs-Survived trend, side by side.
# The 1x2 grid is created once and each plot is drawn on its own Axes.
# Calling `plt.subplots(figsize=...)` with no grid and then `plt.subplot(...)`
# leaves an extra full-figure Axes underneath the panels.
fig, (ax_hist, ax_reg) = plt.subplots(1, 2, figsize=(15, 4))

train['Age'].hist(ax=ax_hist)
ax_hist.set(title='Age distribution', xlabel='Age', ylabel='Passengers')

sns.regplot(x=train['Age'], y=train['Survived'], fit_reg=True, ax=ax_reg)
ax_reg.set_title('Survived vs. Age');

In [None]:
# Square-root transform of Age, then re-measure the skewness.
# NOTE: overwrites the column, so re-running this cell applies the
# transform a second time.
train['Age'] = train['Age'].pipe(np.sqrt)
train.loc[:, 'Age'].skew()

In [None]:
# Square-root transform of Age on the test frame, then re-measure the
# skewness.
# NOTE: overwrites the column, so re-running this cell applies the
# transform a second time.
test['Age'] = test['Age'].pipe(np.sqrt)
test.loc[:, 'Age'].skew()

In [None]:
# Same two views of Age as earlier in the notebook, drawn from the current
# (square-root transformed) column.
# The 1x2 grid is created once and each plot is drawn on its own Axes.
# Calling `plt.subplots(figsize=...)` with no grid and then `plt.subplot(...)`
# leaves an extra full-figure Axes underneath the panels.
fig, (ax_hist, ax_reg) = plt.subplots(1, 2, figsize=(15, 4))

train['Age'].hist(ax=ax_hist)
ax_hist.set(title='Age distribution (transformed)', xlabel='Age', ylabel='Passengers')

sns.regplot(x=train['Age'], y=train['Survived'], fit_reg=True, ax=ax_reg)
ax_reg.set_title('Survived vs. Age (transformed)');

In [None]:
# Box plots of Age: overall, by Sex, and by port of embarkation.
# The 1x3 grid is created once and each plot is drawn on its own Axes.
# Calling `plt.subplots(figsize=...)` with no grid and then `plt.subplot(...)`
# leaves an extra full-figure Axes underneath the panels.
fig, (ax_all, ax_sex, ax_port) = plt.subplots(1, 3, figsize=(15, 4))

sns.boxplot(x=train['Age'], ax=ax_all)
ax_all.set_title('Age')

sns.boxplot(x="Age", y="Sex", data=train, ax=ax_sex)
ax_sex.set_title('Age by Sex')

sns.boxplot(x="Age", y="Embarked", data=train, ax=ax_port)
ax_port.set_title('Age by Embarked')

fig.tight_layout();


In [None]:
# Combine SibSp and Parch into a single Family_Size feature, then drop the
# two source columns (in place).
train['Family_Size'] = train['SibSp'].add(train['Parch'])
train.drop(columns=['SibSp', 'Parch'], inplace=True)

In [None]:
# Combine SibSp and Parch into a single Family_Size feature on the test
# frame, then drop the two source columns (in place).
test['Family_Size'] = test['SibSp'].add(test['Parch'])
test.drop(columns=['SibSp', 'Parch'], inplace=True)

In [None]:
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``big_string``.

    Candidates are tried in the order given, so when several of them match
    the earliest one in ``substrings`` wins. Returns ``np.nan`` when none of
    them is contained in ``big_string``.
    """
    matches = (candidate for candidate in substrings if candidate in big_string)
    return next(matches, np.nan)

In [None]:
# Honorifics searched for in each Name. Order matters: the first entry found
# in the name wins, which is why 'Mrs' is listed ahead of 'Mr'.
title_list = [
    'Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
    'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
    'Don', 'Jonkheer',
]

# Title = first matching honorific (NaN when none matches), for both frames.
train['Title'] = train['Name'].apply(substrings_in_string, args=(title_list,))
test['Title'] = test['Name'].apply(substrings_in_string, args=(title_list,))
 
# Collapse every title into one of: Mr, Mrs, Miss, Master.
def replace_titles(x):
    """Map a passenger's raw title onto a small set of common titles.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row providing at least the keys ``'Title'`` and ``'Sex'``.

    Returns
    -------
    The normalized title: ``'Mr'``, ``'Mrs'`` or ``'Miss'`` for the rare titles
    handled below; any other value of ``x['Title']`` (including a missing one)
    is returned unchanged.
    """
    title = x['Title']
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'):
        return 'Mr'
    if title in ('Countess', 'Mme'):
        return 'Mrs'
    if title in ('Mlle', 'Ms'):
        return 'Miss'
    if title == 'Dr':
        # 'Dr' is gender-neutral, so the Sex column decides. The comparison is
        # case-insensitive: the rest of the notebook stores the value as
        # lowercase 'male', so a literal comparison against 'Male' would never
        # match and every doctor would be labelled 'Mrs'.
        is_male = str(x['Sex']).strip().lower() == 'male'
        return 'Mr' if is_male else 'Mrs'
    return title
    
# Normalize the Title row by row, then discard the raw Name column
# (in place), first for the training frame ...
train['Title'] = train.apply(replace_titles, axis='columns')
train.drop(columns='Name', inplace=True)

# ... and then for the test frame.
test['Title'] = test.apply(replace_titles, axis='columns')
test.drop(columns='Name', inplace=True)

In [None]:
# Preview the first rows of the training frame ordered by Ticket.
train.sort_values('Ticket', ascending=True).head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column so each learned mapping stays available.
label_sex = LabelEncoder()
label_embark = LabelEncoder()
label_title = LabelEncoder()

# Learn the category -> integer mapping from the training data only.
train['Sex'] = label_sex.fit_transform(train['Sex'])
train['Embarked'] = label_embark.fit_transform(train['Embarked'])
train['Title'] = label_title.fit_transform(train['Title'])

# Re-use the mapping learned on train for the test data.
# Calling `fit_transform` again would re-fit the encoder on test, so the same
# category could receive a different integer whenever the two frames do not
# contain exactly the same set of values. `transform` guarantees consistent
# codes and raises ValueError on a category never seen in training instead of
# silently mis-encoding it.
test['Sex'] = label_sex.transform(test['Sex'])
test['Embarked'] = label_embark.transform(test['Embarked'])
test['Title'] = label_title.transform(test['Title'])


In [None]:
# Pairwise correlation of the numeric features, shown as an annotated heatmap.
# The frame still contains the Ticket column at this point (it is deleted
# later); restricting to numeric dtypes keeps `corr()` from failing on
# non-numeric columns in pandas versions that no longer drop them silently.
corr = train.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr, cmap="RdYlBu", square=True, linewidths=0,
            cbar_kws={"shrink": .5}, annot=True, ax=ax)
ax.set_title('Feature correlation (train)');

In [None]:
# Replace Fare by the absolute value of its z-score (computed on train).
# NOTE(review): taking the absolute value makes fares equally far below and
# above the mean indistinguishable - confirm this is intended.
train['Fare'] = np.absolute(stats.zscore(a=train['Fare']))

In [None]:
# Replace Fare by the absolute value of its z-score, computed from the test
# frame's own mean and standard deviation.
# NOTE(review): taking the absolute value makes fares equally far below and
# above the mean indistinguishable - confirm this is intended.
test['Fare'] = np.absolute(stats.zscore(a=test['Fare']))

In [None]:
# Remove the Ticket column from the training frame (in place).
train.drop(columns='Ticket', inplace=True)

In [None]:
# Remove the Ticket column from the test frame (in place).
test.drop(columns='Ticket', inplace=True)

In [None]:
# Separate the target from the features: `pop` returns the Survived column
# and removes it from `train` in a single step.
y_class = train.pop('Survived')

In [None]:
# Preview the final feature frame that is passed to the model.
train.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split of the training data with a fixed seed.
# NOTE(review): none of these four names is used by the later cells visible
# in this notebook (the model is fitted on the full `train`) - confirm whether
# a hold-out evaluation was intended.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    y_class,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier fitted on the full training frame, then
# used to predict survival for every row of the test frame.
model = GaussianNB()
model.fit(X=train, y=y_class)
y_model = model.predict(X=test)

In [None]:
from sklearn.metrics import accuracy_score
#accuracy_score(y_test, y_model)

In [None]:
# First ten predictions for the test frame.
y_model[:10]

In [None]:
#y_test.iloc[0:10]

In [None]:
# Load only the Survived column of the gender_submission file and preview it.
gender_sub = pd.read_csv(
    filepath_or_buffer="../input/titanic/gender_submission.csv",
    usecols=['Survived'],
)
gender_sub.head(n=5)

In [None]:
#test_data = model.predict(test)

In [None]:
# Fraction of test rows where the model's prediction equals the Survived
# value in the gender_submission file.
# NOTE(review): presumably that file is a sample submission rather than
# ground-truth labels, in which case this measures agreement with it, not
# true accuracy - confirm.
accuracy_score(y_true=gender_sub, y_pred=y_model)

In [None]:
# Build the two-column submission table (PassengerId, Survived) from the
# saved test ids and the model predictions, then write it without the index.
submission = test_pid.to_frame(name="PassengerId").assign(Survived=y_model)
submission.to_csv('gender_sub.csv', index=False)