# Hi!)

**Hope you'll find this notebook useful ^_^**

In [None]:
# Importing libraries and configuring plot styling

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Seaborn/matplotlib look-and-feel, applied after the matplotlib style sheet chosen above:
# white grid background, black edges on patches (bars, boxes), and slightly larger fonts.
sns.set_style('whitegrid')
plt.rc('patch', edgecolor='black')
sns.set_context("notebook", font_scale=1.3, rc={"lines.linewidth": 1.5})

# Exploratory Data Analysis

In [None]:
# Load the Titanic training set (path is relative to the notebook's working directory).
data = pd.read_csv('../input/titanic/train.csv')
# Preview the first rows.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

**We'll drop "Cabin" and "Ticket" because these columns are meaningless for analysis**

In [None]:
# Remove the two columns that are not used in the analysis (modifies `data` in place).
data.drop(columns=['Ticket', 'Cabin'], inplace=True)

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values in each column

data.isna().sum()

**3 of the original 12 columns contain null values; "Cabin" was already dropped above, so "Age" and "Embarked" remain. We'll fix them!**

In [None]:
# Survival counts overall, split by sex, and split by passenger class, side by side.
f, ax = plt.subplots(1, 3, figsize=(18, 6))
# Variables are passed by keyword: from seaborn 0.12 the only valid positional
# argument is `data`, so a positional Series is no longer interpreted as `x`.
sns.countplot(x='Survived', data=data, ax=ax[0])
ax[0].set_title('Survived')
sns.countplot(x='Sex', hue='Survived', data=data, ax=ax[1])
ax[1].set_title('Sex <--> Survived')
sns.countplot(x='Pclass', hue='Survived', data=data, ax=ax[2])
ax[2].set_title('Pclass <--> Survived')

In [None]:
# Passenger counts per (Sex, Pclass) against Survived, with totals, shaded by magnitude.
# Survived is multiplied by 100 before tabulating, so the column labels are the scaled values.
t1 = (
    pd.crosstab(
        index=[data.Sex, data.Pclass],
        columns=data.Survived.round(3) * 100,
        margins=True,
    )
    .style
    .background_gradient(vmin=0)
)

In [None]:
# Passenger counts per (Embarked, Pclass) against Survived, with totals, shaded by magnitude.
# Survived is multiplied by 100 before tabulating, so the column labels are the scaled values.
t2 = (
    pd.crosstab(
        index=[data.Embarked, data.Pclass],
        columns=data.Survived.round(3) * 100,
        margins=True,
    )
    .style
    .background_gradient(vmin=0)
)

In [None]:
# Count, sum and mean of Fare for each passenger class, shaded by magnitude.
t3 = (
    pd.pivot_table(
        data,
        values='Fare',
        index='Pclass',
        aggfunc=['count', 'sum', 'mean'],
    )
    .style
    .background_gradient(vmin=0)
)

In [None]:
from IPython.core.display import HTML

def multi_table(table_list):
    """Render several tables side by side in a single HTML row.

    Each item of ``table_list`` must provide ``_repr_html_()``; its markup is
    wrapped in a ``<td>`` and all cells are joined into one ``<tr>`` of an
    outer ``<table>``. Returns the result wrapped in an ``HTML`` object.
    """
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')
    row_markup = ''.join(cells)
    return HTML(f"<table><tr> {row_markup} </tr></table>")

# Show the three styled tables next to each other.
multi_table([t1, t2, t3])

In [None]:
# Distribution of fares as a horizontal box plot.
# `x=` is explicit because from seaborn 0.12 the only positional argument is `data`.
sns.boxplot(x=data.Fare)

In [None]:
# Rows whose fare exceeds 500.
data[data.Fare > 500]

**In fact, the people who paid these enormous fares were:**
* https://www.encyclopedia-titanica.org/titanic-survivor/annie-moore-ward.html
* https://www.encyclopedia-titanica.org/titanic-survivor/thomas-cardeza.html
* https://www.encyclopedia-titanica.org/titanic-survivor/gustave-lesueur.html

In [None]:
# Joint distribution of Age and Fare.
# Keyword arguments are required: seaborn >= 0.12 accepts only `data` positionally,
# so the three-positional-argument form raises TypeError there.
sns.jointplot(x='Age', y='Fare', data=data)

In [None]:
# Pairwise relationships between all columns except the first one (PassengerId).
sns.pairplot(data=data.iloc[:, 1:])

In [None]:
# Age distribution per sex, split by survival.
plt.figure(figsize=(8,6))
# x/y are passed by keyword: a positional 'Sex' would be bound to `data` on seaborn >= 0.12.
sns.violinplot(x='Sex', y='Age', hue='Survived', data=data, split=True)

In [None]:
# Age distribution per passenger class, split by survival.
plt.figure(figsize=(8,6))
# x/y are passed by keyword: a positional "Pclass" would be bound to `data` on seaborn >= 0.12.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=data, split=True)

In [None]:
# Survival counts broken down by number of siblings/spouses and by number of parents/children.
f, ax = plt.subplots(1, 2, figsize=(18, 6))
# Variables are passed by keyword: from seaborn 0.12 the only valid positional argument is `data`.
sns.countplot(x='Survived', hue='SibSp', data=data, ax=ax[0])
ax[0].set_title('Survived <--> SibSp')
sns.countplot(x='Survived', hue='Parch', data=data, ax=ax[1])
ax[1].set_title('Survived <--> Parch')

**Let's check if "Parch" and "SibSp" are multicollinear**

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of each of the two family-size columns.
# NOTE(review): no intercept column is added before computing VIF — confirm this is intended.
X = data[['SibSp', 'Parch']]
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between SibSp and Parch (returns the coefficient and its p-value).
pearsonr(data.SibSp, data.Parch)

**No, "Parch" and "SibSp" are not multicollinear. However, we can still combine these features into one: "Number_of_relatives"**

In [None]:
# Combine siblings/spouses and parents/children into a single family-size feature.
data['Number_of_relatives'] = data.SibSp + data.Parch
# Drop the two source columns by label. Passing a DataFrame as the labels only
# worked because iterating a frame yields its column names.
data.drop(columns=['SibSp', 'Parch'], inplace=True)

In [None]:
# How many rows have each family size.
# `x=` is explicit because from seaborn 0.12 the only positional argument is `data`.
sns.countplot(x='Number_of_relatives', data=data)

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Number of rows for each 'Embarked' value.
data.value_counts('Embarked')

In [None]:
# Mean survival per port of embarkation as a point plot.
# `factorplot` was deprecated in favour of `catplot` (seaborn 0.9) and is not available
# in current releases. `kind='point'` is explicit because factorplot defaulted to a
# point plot while catplot defaults to a strip plot.
sns.catplot(x='Embarked', y='Survived', data=data, kind='point')

**As you can see, passengers who embarked at Cherbourg had a much higher chance of surviving!**

In [None]:
# Report the maximum, minimum and mean of the 'Age' column.
for label, value in (
    ('Oldest Passenger was of:', data['Age'].max()),
    ('Youngest Passenger was of:', data['Age'].min()),
    ('Average Age on the ship:', data['Age'].mean()),
):
    print(label, value, 'Years')

In [None]:
# Lower-triangle correlation heatmap of the numeric columns.
plt.figure(figsize=(7,7))
# Restrict to numeric columns explicitly: DataFrame.corr() raises on string columns
# in pandas >= 2.0 (older versions dropped them silently). Computed once and reused
# for both the values and the mask.
corr_matrix = data.select_dtypes(include='number').corr()
sns.heatmap(
    corr_matrix,
    square=True,
    mask=np.triu(corr_matrix),  # hide the upper triangle (and the diagonal)
    cmap="coolwarm",
    linewidths=0.2,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 9},
)

#  Feature Engineering

In [None]:
# Mean fare of the rows with Pclass == 1, shown as a one-row frame indexed by Pclass.
data.loc[data['Pclass'] == 1].groupby('Pclass')[['Fare']].mean()

In [None]:
# Fill missing ports of embarkation with 'S'.
# Assign the result back to the column: an in-place fillna on `data.Embarked` is a
# chained operation that warns on pandas 2.x and has no effect under copy-on-write.
data['Embarked'] = data['Embarked'].fillna('S')

In [None]:
# Re-check the number of missing values per column.
data.isna().sum()

In [None]:
# Draw one random integer age per missing 'Age' entry from [mean - std, mean + std).
np.random.seed(0)  # fixed seed so the imputed ages are the same on every run
age_mean = data['Age'].mean()
age_std = data['Age'].std()
# .sum() counts the True flags, i.e. the missing ages. The previous .count() counted
# every entry of the boolean mask, so it returned the total number of rows.
null_values_count = data['Age'].isnull().sum()
# Bounds are cast to int explicitly; randint works on integer ranges.
age_null_random_list = np.random.randint(
    int(age_mean - age_std),
    int(age_mean + age_std),
    size=null_values_count,
)
age_null_random_list

In [None]:
# Fill the missing ages with the random draws, then bin Age into 5 equal-width intervals.
missing_age = data['Age'].isna()
if missing_age.any():
    # .loc writes into `data` directly; the chained form data['Age'][mask] = ... may
    # write to a temporary copy and is a no-op under copy-on-write.
    # Use exactly one draw per missing entry, whatever the length of the list.
    data.loc[missing_age, 'Age'] = np.asarray(age_null_random_list)[:missing_age.sum()]
data['CategoricalAge'] = pd.cut(data['Age'], 5, precision=0)
# Mean survival per age bin, highest first.
data[["CategoricalAge", "Survived"]].groupby('CategoricalAge', as_index = False).mean().sort_values(by='Survived', ascending=False)

In [None]:
# Fill missing fares with the median, then bin Fare into quartiles.
data['Fare'] = data['Fare'].fillna(value=data['Fare'].median())
data['CategoricalFare'] = pd.qcut(data['Fare'], q=4, precision=0)
# Mean survival per fare bin, highest first.
(
    data[["CategoricalFare", "Survived"]]
    .groupby("CategoricalFare")
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
def rightValue(interval):
  """Return the right (upper) bound of ``interval``, read from its ``right`` attribute."""
  upper_bound = interval.right
  return upper_bound

# Replace each interval in the two binned columns with its right edge.
for column in ('CategoricalAge', 'CategoricalFare'):
    data[column] = data[column].apply(rightValue)
data.head()

In [None]:
# Mean survival per age bin, one panel per passenger class.
# `factorplot` was deprecated in favour of `catplot` (seaborn 0.9) and is not available
# in current releases. `kind='point'` keeps the point plot that factorplot drew by default.
sns.catplot(x='CategoricalAge', y='Survived', data=data, col='Pclass', kind='point')

In [None]:
# Remove the name and the raw Age/Fare columns (modifies `data` in place).
data.drop(columns=['Name', 'Age', 'Fare'], inplace=True)

In [None]:
# Re-check the number of missing values per column.
data.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Encode the values of each binned column as integer labels (the encoder is refit per column).
for column in ('CategoricalFare', 'CategoricalAge'):
    data[column] = encoder.fit_transform(data[column])

In [None]:
# Preview the frame after label-encoding the binned columns.
data.head()

In [None]:
# Replace the 'Sex' labels with their integer category codes.
data['Sex'] = data['Sex'].astype('category').cat.codes

In [None]:
# One-hot encode 'Embarked' into indicator columns (the original column is replaced).
data = pd.get_dummies(data, columns = ["Embarked"])

# **ML models**

**We will try to use different models to find the best one**

In [None]:
# Feature matrix and target.
# PassengerId is excluded together with the target: it looks like a row identifier
# rather than a passenger attribute, so keeping it would let the models (especially
# distance-based ones such as KNN and SVC) fit on an arbitrary number.
X = data.drop(columns=['Survived', 'PassengerId'])
y = data['Survived']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Candidate models. The tree-based ones get a fixed seed so that their accuracy
# (and therefore the ranking below) does not change from run to run.
classifiers = [
    KNeighborsClassifier(5),
    SVC(probability=True, kernel='rbf'),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    GaussianNB(),
    LogisticRegression(solver = 'liblinear'),
]

log_cols = ["Classifier", "Accuracy"]

# Fit every model on the training split and record its accuracy on the held-out split.
acc_dict = {}
for clf in classifiers:
    name = clf.__class__.__name__
    clf.fit(X_train, y_train)
    # Predictions on the held-out split (variable name kept from the original cell).
    train_predictions = clf.predict(X_test)
    acc_dict[name] = accuracy_score(y_test, train_predictions)

# Build the results table in one step: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is quadratic anyway.
log = pd.DataFrame(list(acc_dict.items()), columns=log_cols)

# Horizontal bar chart of accuracies, lowest first. Labels are set on the returned
# Axes so they cannot be overwritten by the plotting call.
sns.set_color_codes("muted")
ax = sns.barplot(x='Accuracy', y='Classifier', data=log.sort_values(by='Accuracy'), color="b")
ax.set_xlabel('Accuracy')
ax.set_title('Classifier Accuracy')

**Thus, RandomForestClassifier and GaussianNB have the best accuracy score.
I will use RandomForestClassifier for submissions.**