# Importing libraries

In [None]:
# computation
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# configuration
# Use seaborn's 'whitegrid' style for the figures drawn in this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Import data

In [None]:
# locations of the train and test CSV files
train_data_path = '../input/titanic/train.csv'
test_data_path = '../input/titanic/test.csv'

# read both files; index_col=None keeps the default integer index
train, test = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in (train_data_path, test_data_path)
)

# Getting to know your data

Checking the dtypes of the variables and their null values

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each report.
print('====== Test data - info ======')
test.info()
print('\n')
print('====== Train data - info ======')
train.info()

# Filling null values

In [None]:
# Map of missing values in the training data: one column per feature,
# coloured by whether the cell is null.
fig, axe1 = plt.subplots(figsize=(10, 5), dpi=100)

axe1 = sns.heatmap(
    data=train.isnull(),
    ax=axe1,
    cbar=False,
    yticklabels=False,
    cmap='coolwarm',
)

In [None]:
# Age distribution within each passenger class (one box per Pclass value)
sns.boxplot(x='Pclass', y='Age', data=train)

### Imputing null values

In [None]:
# distinct passenger classes present in the training data
classes = train.Pclass.unique()

# mean Age of the training passengers of each class, keyed by the class value
classes_mean_age = {
    pclass: train.loc[train['Pclass'] == pclass, 'Age'].mean()
    for pclass in classes
}

def fill_na_age(cols):
    """Return a passenger's age, imputing a missing one from the class mean.

    Parameters
    ----------
    cols : sequence of two values
        ``(Age, Pclass)`` in that order, e.g. one row of
        ``frame[['Age', 'Pclass']]`` as handed over by
        ``DataFrame.apply(..., axis=1)``. Lists and tuples work as well.

    Returns
    -------
    The given ``Age`` when it is not null. When it is null, the value stored
    for ``Pclass`` in the module-level ``classes_mean_age`` mapping; if that
    mapping has no entry for the class, the missing ``Age`` is returned
    unchanged.
    """
    # Unpack by iteration rather than ``cols[0]`` / ``cols[1]``: pandas has
    # deprecated treating integer keys as positions on a label-indexed Series,
    # and a row of ``frame[['Age', 'Pclass']]`` is labelled with strings.
    Age, Pclass = cols

    if pd.isnull(Age):
        # Direct dictionary lookup; the missing value itself is the fallback
        # so the function never returns an implicit None.
        return classes_mean_age.get(Pclass, Age)
    return Age

# Impute missing ages row by row in both data sets via fill_na_age.
train['Age'] = train.loc[:, ['Age', 'Pclass']].apply(fill_na_age, axis='columns')
test['Age'] = test.loc[:, ['Age', 'Pclass']].apply(fill_na_age, axis='columns')

### Dropping unnecessary features

In [None]:
# Columns that are not used as model features.
unused_columns = ['Name', 'Ticket', 'Cabin']

# Reassign instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell does not fail with a KeyError.
train = train.drop(columns=['PassengerId'] + unused_columns, errors='ignore')
# Training rows that still contain a null value are discarded.
train = train.dropna(axis=0)

# PassengerId stays in test: it labels the rows of the final prediction table.
test = test.drop(columns=unused_columns, errors='ignore')
# Test rows must NOT be dropped: every passenger needs a prediction, and a
# gap in the index would misalign the later concat with the predictions.
# Remaining gaps are filled from training statistics instead
# (median for numeric columns, most frequent value for Embarked).
test = test.fillna(train.median(numeric_only=True))
test['Embarked'] = test['Embarked'].fillna(train['Embarked'].mode()[0])

## Exploratory data analysis

In [None]:
# setting style
plt.style.use('ggplot')

# creating figure
fig = plt.figure(figsize = (20,10))

# creating axes on a 3x2 grid: two panels in each of the first two rows and
# one panel spanning the whole last row, so no grid column stays empty
grid_shape = (3,2)
ax1 = plt.subplot2grid(grid_shape,(0,0))
ax2 = plt.subplot2grid(grid_shape,(0,1))
ax3 = plt.subplot2grid(grid_shape,(1,0))
ax4 = plt.subplot2grid(grid_shape,(1,1))
ax5 = plt.subplot2grid(grid_shape,(2,0),colspan = 2)

# Survival: number of records per Survived value
train.groupby('Survived').size().plot(kind='bar',
                                      cmap = 'winter',
                                      width = 0.8,
                                      ax = ax1)
# Survival by sex: one group of bars per sex, one bar per Survived value
train.groupby(['Sex','Survived']).size().unstack().plot(kind='bar',
                                                        width = 0.8,
                                                        cmap = 'winter',
                                                        ax = ax2)
# Survival by Pclass (stacked).
# The plot kind must be the lower-case 'bar': current pandas rejects 'Bar'
# as an invalid plot kind.
train.groupby(['Pclass','Survived'])\
                .size()\
                .unstack()\
                .plot(kind='bar',stacked = True,
                width = 0.8,
                cmap = 'winter',
                ax = ax3)

# Age distribution of the passengers who died, split by sex.
# `kind` is passed by keyword: Series.plot() does not accept positional
# arguments in current pandas.
train[(train['Sex'] == 'male') & (train['Survived'] == 0)]['Age'].plot(kind = 'hist',
                                                                       color = 'Blue',
                                                                       alpha = 0.8,
                                                                       ax=ax4,
                                                                       label = 'Male / Died')
train[(train['Sex'] == 'female') & (train['Survived'] == 0)]['Age'].plot(kind = 'hist',
                                                                         color = 'lime',
                                                                         alpha = 0.8,
                                                                         ax=ax4,
                                                                         label = 'Female / Died')
ax4.set_xlabel('Age')
ax4.legend()

# Survival by sex and pclass (stacked, one bar per sex/class combination)
train.groupby(['Sex','Pclass','Survived'])\
                .size()\
                .unstack()\
                .plot(kind='bar',
                stacked = True,width = 0.8,
                cmap = 'winter',
                ax = ax5)

# adding titles
ax1.set_title('Survivalness')
ax2.set_title('Survivalness by sex')
ax3.set_title('Survivalness by pclass')
ax4.set_title('Age distribution by sex - Died')
ax5.set_title('Survivalness by pclass and sex')

# adding axes labels
axes_list = [ax1,ax2,ax3,ax4,ax5]

for ax in axes_list:
    ax.set_ylabel('# Of records')

plt.tight_layout()

## Building a model

### Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# train
encoder = LabelEncoder()

# feature matrix and target; train itself is left untouched
X = train.drop('Survived',axis = 1)
y = train['Survived']

# replace the string categories by integer codes
X['Sex'] = encoder.fit_transform(X['Sex'])
X['Embarked'] = encoder.fit_transform(X['Embarked'])

# one indicator column per Embarked code, first code dropped as the baseline;
# dtype=int keeps the indicators numeric (0/1)
dummies_enbarked = pd.get_dummies(X['Embarked'],prefix='Emb',drop_first=True,dtype=int)
# join() returns a new frame, so the result has to be assigned back;
# otherwise the indicator columns never reach X.
# `columns=` replaces the positional axis argument that pandas 2 removed.
X = X.drop(columns='Embarked').join(dummies_enbarked)

X.head()

### Training model

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# hold out 28% of the training data for evaluation; fixed seed for repeatability
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.28,random_state = 101)

# the features are not scaled, so give the solver more room than the default
# iteration cap to converge
lg = LogisticRegression(max_iter = 1000)
lg.fit(X_train,y_train)

# class predictions for the held-out rows
y_pred = lg.predict(X_test)

### Evaluating results

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# metrics of the hold-out predictions in y_pred against the true labels y_test
conf = confusion_matrix(y_test,y_pred)
report = classification_report(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)

# a single print call; sep='\n' yields the same line breaks as separate prints
print(conf,
      '\n',
      '==========================Classification report=========================',
      report,
      'Prediction accuracy:{0}'.format(acc),
      sep='\n')

### Tuning parameters

In [None]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module no longer exists in current scikit-learn.
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[1,0.1,0.01,0.001,0.0001],'penalty':['l2','l1']}
# liblinear is selected explicitly because it supports both the 'l1' and the
# 'l2' penalty of the grid; the current default solver rejects 'l1'.
grid = GridSearchCV(LogisticRegression(solver='liblinear'),param_grid=param_grid,verbose = 6)
grid = grid.fit(X_train,y_train)

# Build the test feature matrix the same way the training features were
# built: integer codes from the sorted category values found in train (the
# ordering LabelEncoder uses), then indicator columns for Embarked.
test_features = test.drop(columns='PassengerId')
for column in ('Sex', 'Embarked'):
    levels = sorted(train[column].dropna().unique())
    test_features[column] = pd.Categorical(test_features[column], categories=levels).codes
embarked_dummies = pd.get_dummies(test_features['Embarked'], prefix='Emb', dtype=int)
# reindex() aligns the columns (names and order) with what the model was
# fitted on: extra columns are discarded, absent indicator columns become 0.
test_features = (test_features.drop(columns='Embarked')
                 .join(embarked_dummies)
                 .reindex(columns=X_train.columns, fill_value=0))

predicted_class = pd.DataFrame(grid.predict(test_features))
predicted_proba = pd.DataFrame(grid.predict_proba(test_features))

### Prediction

In [None]:
# concat aligns on the index: the PassengerId index is reset so it matches the
# 0..n-1 index of the prediction frames even if test's index has gaps.
# `axis` is passed by keyword; positional use is not accepted by pandas 2.
prediction = pd.concat(
    [test['PassengerId'].reset_index(drop=True), predicted_class, predicted_proba],
    axis=1,
)

prediction.columns = ['PassengerId','Predicted Class','Prob to be a 0', 'Prob to be a 1']

# round the probabilities to two decimals; DataFrame.round replaces the
# deprecated element-wise applymap
probability_columns = ['Prob to be a 0', 'Prob to be a 1']
prediction[probability_columns] = prediction[probability_columns].round(2)


In [None]:
# preview the first five rows of the prediction table
prediction.head(n=5)