In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Kernel Content

1. [Load Data](#1)
1. [Data Analysis](#2)
1. [Basic Data Analysis](#3)
1. [Random Forest](#4)
1. [Test](#5)

<a id=1></a>
## Load Data

In [None]:
# Load the three competition CSVs in one pass: training set, test set and
# the sample submission file.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
        "/kaggle/input/titanic/gender_submission.csv",
    )
)

In [None]:
# Preview the first five rows of the training data.
df_train.head(n=5)

## The columns of the data set

1. PassengerId
1. Survived: 0->Died 1->Alive
1. Pclass: 1-> 1st Class 2-> 2nd Class 3-> 3rd Class
1. Name
1. Sex
1. Age 
1. SibSp: (number of) Siblings/Spouses
1. Parch: (number of) Parents/Children
1. Ticket
1. Fare
1. Cabin
1. Embarked: C->Cherbourg, Q->Queenstown, S->Southampton

## Variable Types
1. Categorical Variables
    Survived, Sex, Pclass, Embarked, Cabin, Name, SibSp, Ticket and Parch
2. Numerical Variables
    Fare, Age, PassengerId

In [None]:
# Show the column labels of the training frame.
df_train.columns

In [None]:
# Summary statistics of the numeric columns, transposed so that each
# column of the data becomes one row of the summary.
df_train.describe().transpose()

In [None]:
# Print dtypes and non-null counts per column (used to spot missing values).
df_train.info()

* int64      [PassengerId, Survived, Pclass, SibSp, Parch]
* float64                                      [Age, Fare]
* object              [Name, Sex, Ticket, Cabin, Embarked]

<a id=2></a>
## Data Analysis 




## Univariate Variable Analysis

### Categorical Variables

1. Survived
1. Sex 
1. Pclass 
1. Embarked
1. Cabin
1. Name 
1. SibSp
1. Parch
1. Ticket


In [None]:
# Distribution of the target: number of passengers per Survived value.
plt.figure(figsize=(15,5))
# Pass the column by keyword. In seaborn >= 0.12 the first positional
# argument is `data`, so a bare Series is no longer mapped to the x axis
# (0.11 accepted it only with a FutureWarning).
sns.countplot(x="Survived", data=df_train);

In [None]:
# Number of passengers per sex.
plt.figure(figsize=(15,5))
# Pass the column by keyword. In seaborn >= 0.12 the first positional
# argument is `data`, so a bare Series is no longer mapped to the x axis.
sns.countplot(x="Sex", data=df_train);

In [None]:
# Number of passengers per ticket class.
plt.figure(figsize=(15,5))
# Pass the column by keyword. In seaborn >= 0.12 the first positional
# argument is `data`, so a bare Series is no longer mapped to the x axis.
sns.countplot(x="Pclass", data=df_train);

In [None]:
# Number of passengers per port of embarkation.
plt.figure(figsize=(15,5))
# Pass the column by keyword. In seaborn >= 0.12 the first positional
# argument is `data`, so a bare Series is no longer mapped to the x axis.
sns.countplot(x="Embarked", data=df_train);

In [None]:
# Number of passengers per sibling/spouse count.
plt.figure(figsize=(15,5))
# Pass the column by keyword. In seaborn >= 0.12 the first positional
# argument is `data`, so a bare Series is no longer mapped to the x axis.
sns.countplot(x="SibSp", data=df_train);

In [None]:
# Number of passengers per parent/child count.
plt.figure(figsize=(15,5))
# Pass the column by keyword. In seaborn >= 0.12 the first positional
# argument is `data`, so a bare Series is no longer mapped to the x axis.
sns.countplot(x="Parch", data=df_train);

The Name, Ticket and Cabin columns consist mostly of distinct values (they are high-cardinality), so a count plot would not be informative and none has been made.

### Numerical Variables

1. Fare
1. Age 
1. PassengerId (Not visualized)

In [None]:
# Age distribution in the training data: density histogram with a KDE overlay.
plt.figure(figsize=(15,5))
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# kde=True and stat="density" is the replacement seaborn recommends for a
# normalized histogram plus density curve.
sns.histplot(x=df_train["Age"], kde=True, stat="density", color='r')
plt.show()

In [None]:
# Fare distribution in the training data: density histogram with a KDE overlay.
plt.figure(figsize=(15,5))
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# kde=True and stat="density" is the replacement seaborn recommends for a
# normalized histogram plus density curve.
sns.histplot(x=df_train["Fare"], kde=True, stat="density", color='g')
plt.show()

<a id=3></a>
## Basic Data Analysis

In [None]:
# Passenger counts per sex, split by survival outcome.
plt.figure(figsize=(15,5))
# Reference columns by name with `data=`. In seaborn >= 0.12 the first
# positional argument is `data`, so a bare Series is no longer mapped to
# the x axis.
sns.countplot(x="Sex", hue="Survived", data=df_train, palette='pastel');

In [None]:
# Passenger counts per ticket class, split by survival outcome.
plt.figure(figsize=(15,5))
# Reference columns by name with `data=`. In seaborn >= 0.12 the first
# positional argument is `data`, so a bare Series is no longer mapped to
# the x axis.
sns.countplot(x="Pclass", hue="Survived", data=df_train, palette='pastel');

### Missing Values

In [None]:
# Stack train and test so missing-value handling and encoding are applied to
# both consistently. Test rows have no Survived value, so it becomes NaN there.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the two frames' own indices overlap and the result carries duplicate labels,
# which label-based selection and some plotting calls do not handle well.
# Later positional (.iloc) splitting is unaffected.
df = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Count missing values per column in the combined frame.
df.isna().sum()

In [None]:
# Show the rows whose port of embarkation is missing.
df.loc[df['Embarked'].isna()]

In [None]:
# Box plot of Fare for each embarkation port.
sns.catplot(data=df, kind="box", x="Embarked", y="Fare");

In [None]:
# Replace missing Embarked values with "C" and keep every other value as is.
# NOTE(review): "C" is presumably chosen from the Fare-by-Embarked box plot
# in the preceding cell - confirm the rationale.
embarked_known = df['Embarked'].notna()
df['Embarked'] = df['Embarked'].where(embarked_known, "C")

In [None]:
# Show the rows whose Fare is missing.
df.loc[df['Fare'].isna()]

In [None]:
# Impute missing fares with the mean fare of the combined (train + test) frame.
mean_fare = df['Fare'].mean()
df['Fare'] = df['Fare'].fillna(value=mean_fare)

In [None]:
# Age distribution over the combined frame, inspected before imputing Age.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# kde=True and stat="density" is the replacement seaborn recommends.
# The index is reset because the combined frame may carry duplicate index
# labels from concatenating train and test; the Series name ("Age") is
# kept, so the x-axis label is unchanged.
sns.histplot(x=df["Age"].dropna().reset_index(drop=True), kde=True, stat="density");

In [None]:
# Mean age (missing values are skipped by pandas).
df['Age'].mean()

In [None]:
# Median age, for comparison with the mean.
df['Age'].median()

In [None]:
# Impute missing ages with the mean age of the combined (train + test) frame.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(value=mean_age)

In [None]:
# Preview the combined frame after imputation.
df.head(n=5)

In [None]:
# Remove the identifier / free-text columns that are not used as features.
unused_columns = ['Name', "PassengerId", "Ticket", "Cabin"]
df = df.drop(columns=unused_columns)
df.head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the two remaining string columns to integer codes. The same
# encoder object is refitted for each column, so after the loop it only
# remembers the classes of the last column encoded.
le = LabelEncoder()
for column in ('Sex', 'Embarked'):
    df[column] = le.fit_transform(df[column])
df.head()

In [None]:
# Undo the earlier concatenation by position: the first len(df_train) rows
# are the training rows, everything after them is the test set.
n_train_rows = df_train.shape[0]
train = df.iloc[:n_train_rows]
test = df.iloc[n_train_rows:]

In [None]:
# Separate the target column from the feature matrix.
target_column = 'Survived'
y = train[target_column]
X = train.drop(columns=[target_column])

<a id=4></a>
## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator for the hyper-parameter search. A fixed random_state makes
# bootstrap sampling and feature sub-sampling repeatable, so a fresh
# "Restart & Run All" reproduces the same search results.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Hyper-parameter grid for the random forest search.
# An integer max_features is the number of columns considered per split, so
# it cannot meaningfully exceed the number of columns in X. Candidates above
# that are at best redundant with max_features == n_features and, in
# scikit-learn releases that validate the bound, make every such fit fail.
# Keep only the feasible candidates, falling back to "all features" if none
# of them fits.
n_features = X.shape[1]
max_features_grid = [m for m in [5,6,7,8,9,10] if m <= n_features] or [n_features]
rf_params = {'n_estimators': [400,500,600,700],
            'max_features': max_features_grid,
            'min_samples_split':[5,6,7,8,9,10]}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over rf_params with 21-fold cross-validation, using all
# available cores. fit() returns the fitted search object itself.
grid_search = GridSearchCV(estimator=rf, param_grid=rf_params, cv=21, n_jobs=-1, verbose=1)
rf_cv_model = grid_search.fit(X, y)


In [None]:
# Display the fitted grid-search object.
rf_cv_model

In [None]:
# Parameter combination with the best mean cross-validated score.
best_params = rf_cv_model.best_params_
print(best_params)

In [None]:
# Refit a forest on the full training data with the selected parameters.
# random_state is fixed so the reported metrics, feature importances and
# final predictions are identical on every run of the notebook.
rf = RandomForestClassifier(
    max_features=best_params['max_features'],
    min_samples_split=best_params['min_samples_split'],
    n_estimators=best_params['n_estimators'],
    random_state=42,
).fit(X, y)

In [None]:
# In-sample predictions: rf was fitted on this same X, so any metric derived
# from y_pred_rf describes fit to the training data, not generalization.
y_pred_rf = rf.predict(X)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the fitted forest on the data it was trained on.
accuracy_score(y_true=y, y_pred=y_pred_rf)

In [None]:
# Raw importance scores of the fitted forest, one value per column of X.
rf.feature_importances_

In [None]:
# Label each importance score with its feature name and order them from
# most to least important.
importance_by_feature = pd.Series(rf.feature_importances_, index=X.columns)
feature_imp = importance_by_feature.sort_values(ascending=False)

# Horizontal bar chart of the importances. The axis labels and title are in
# Turkish ("variable importance scores", "variables", "variable importance
# levels").
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)
ax.set_xlabel('Değisken Önem Skorları')
ax.set_ylabel('Değişkenler')
ax.set_title('Değişken Önem Düzeyleri')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

# Mean score of the tuned forest over 7 cross-validation folds.
cv_scores = cross_val_score(estimator=rf, X=X, y=y, cv=7)
cv_scores.mean()

In [None]:
from imblearn.metrics import classification_report_imbalanced, sensitivity_specificity_support

# Micro-averaged sensitivity / specificity over the class labels observed in
# the training target, followed by the per-class imbalanced report. Both are
# computed from the in-sample predictions y_pred_rf.
class_labels = pd.unique(df_train.Survived)
sens_spec = sensitivity_specificity_support(y, y_pred_rf, average='micro', labels=class_labels)
print('sensitivity and specificity:', sens_spec)
imbalanced_report = classification_report_imbalanced(y, y_pred_rf)
print(imbalanced_report)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the in-sample predictions, drawn as an annotated
# heatmap and also written to disk as 'con_mat'.
cm = confusion_matrix(y_true=y, y_pred=y_pred_rf)
heatmap_ax = sns.heatmap(cm, annot=True, fmt="d", cbar=False)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.figure.savefig('con_mat')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve and AUC of the fitted forest, evaluated on the same X it was
# trained on (so this is an in-sample curve).
# Both the curve and the AUC are computed from the positive-class
# probability. Scoring the AUC from hard predict() labels collapses the
# ranking to a single threshold, so the number in the legend would not be
# the area under the curve that is drawn.
y_score = rf.predict_proba(X)[:, 1]
rf_roc_auc = roc_auc_score(y, y_score)
fpr, tpr, thresholds = roc_curve(y, y_score)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % rf_roc_auc)
# Diagonal reference line: a classifier that ranks at random.
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')
plt.show()

<a id=5></a>
## Test

In [None]:
# Preview the sample submission to see the expected output format.
submission.head(n=5)

In [None]:
# Preview the preprocessed test rows.
test.head(n=5)

In [None]:
# Remove the target column from the test rows so that only the feature
# columns remain for prediction.
test = test.drop(columns=['Survived'])

In [None]:
# Predict survival for the test passengers and start the submission frame
# from their PassengerId values (taken from the raw test file).
predictions = rf.predict(test)
submission = df_test[['PassengerId']].copy()

In [None]:
# Convert each predicted label to a plain int and attach the labels to the
# submission frame.
predictions = list(map(int, predictions))
submission['Survived'] = predictions

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)