**Test with Logistic Regression, 
part1: data analysis and visualization
part2: data clean up
part3: training data splitting and testing
part4: prediction with testing data**
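> **Note:** the predictions agree with `gender_submission.csv` about 95% of the time, yet the real Kaggle submission scores only 77%. A likely explanation is that `gender_submission.csv` is Kaggle's sample submission (a simple gender-based baseline) rather than the true test labels, so the 95% measures agreement with that baseline, not actual accuracy.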

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**Data Analysis**

In [None]:
train = pd.read_csv('../input/train.csv')
train.head()

In [None]:
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Sex',data=train,palette='RdBu_r')

In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Pclass',data=train,palette='rainbow')

In [None]:
sns.distplot(train['Age'].dropna(),kde=False,color='darkred',bins=30)

In [None]:
sns.countplot(x='SibSp',data=train)

In [None]:
train['Fare'].hist(color='green',bins=40,figsize=(8,4))

In [None]:
import cufflinks as cf
cf.go_offline()

In [None]:
train['Fare'].iplot(kind='hist',bins=30,color='green')

**Data Cleaning**

In [None]:
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass',y='Age',data=train,palette='winter')

In [None]:
# Mean age of first-class passengers, used later to impute missing ages.
# Select the 'Age' column before averaging: calling DataFrame.mean() on the
# whole frame also tries to average the text columns (Name, Sex, Ticket, ...)
# and raises TypeError on pandas >= 2.0.
Pclass1_mean = train.loc[train['Pclass'] == 1, 'Age'].mean()
Pclass1_mean

In [None]:
# Mean age of second-class passengers, used later to impute missing ages.
# Select the 'Age' column before averaging: calling DataFrame.mean() on the
# whole frame also tries to average the text columns (Name, Sex, Ticket, ...)
# and raises TypeError on pandas >= 2.0.
Pclass2_mean = train.loc[train['Pclass'] == 2, 'Age'].mean()
Pclass2_mean

In [None]:
# Mean age of third-class passengers, used later to impute missing ages.
# Select the 'Age' column before averaging: calling DataFrame.mean() on the
# whole frame also tries to average the text columns (Name, Sex, Ticket, ...)
# and raises TypeError on pandas >= 2.0.
Pclass3_mean = train.loc[train['Pclass'] == 3, 'Age'].mean()
Pclass3_mean

In [None]:
def impute_age(cols):
    """Return a passenger's age, filling a missing value with the class mean.

    Intended for row-wise use, e.g.
    ``df[['Age', 'Pclass']].apply(impute_age, axis=1)``.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass).

    Returns
    -------
    The original age when it is present. Otherwise ``Pclass1_mean`` for
    class 1, ``Pclass2_mean`` for class 2 and ``Pclass3_mean`` for any other
    class. These notebook-level means are only looked up when an age is
    missing, so they must be defined before such a row is processed.
    """
    # Read the two values by position explicitly. Plain ``cols[0]`` on a
    # label-indexed row relies on an integer-key fallback that pandas has
    # deprecated (FutureWarning since 2.1) in favour of label lookup.
    # Inputs without ``.iloc`` (list, tuple, ndarray) are indexed directly.
    values = cols.iloc if hasattr(cols, 'iloc') else cols
    age, pclass = values[0], values[1]

    if not pd.isnull(age):
        return age

    if pclass == 1:
        return Pclass1_mean
    if pclass == 2:
        return Pclass2_mean
    return Pclass3_mean

In [None]:
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)

In [None]:
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
train.drop('Cabin',axis=1,inplace=True)

In [None]:
sns.countplot(x='Embarked',data=train,palette='RdBu_r')

In [None]:
# Fill the missing embarkation ports with 'S'. Assign the result back to the
# frame instead of calling fillna(inplace=True) on the selected column: that
# chained in-place form warns on pandas 2.2 and leaves `train` unchanged when
# Copy-on-Write is active.
train['Embarked'] = train['Embarked'].fillna('S')

In [None]:
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

**Converting categorical features**

In [None]:
train.info()

In [None]:
sex = pd.get_dummies(train['Sex'],drop_first=True)
embark = pd.get_dummies(train['Embarked'],drop_first=True)

In [None]:
train.drop(['PassengerId','Sex','Embarked','Name','Ticket'],axis=1,inplace=True)

In [None]:
train = pd.concat([train,sex,embark],axis=1)

In [None]:
train.head()

**LogisticRegression**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived',axis=1), 
                                                    train['Survived'], test_size=0.30, 
                                                    random_state=101)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The features are not scaled (Fare and Age span much larger ranges than the
# dummy columns), so the default budget of 100 iterations can end before the
# solver converges. A larger max_iter lets the fit finish.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train,y_train)

In [None]:
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print (classification_report(y_test, predictions))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

**Evaluate Test Dataset**

In [None]:
evaluate = pd.read_csv('../input/test.csv')
evaluate.head()

In [None]:
sns.heatmap(evaluate.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
evaluate['Age'] = evaluate[['Age','Pclass']].apply(impute_age,axis=1)

In [None]:
evaluate.drop('Cabin',axis=1,inplace=True)

In [None]:
# Mean fare of first-class passengers in the evaluation set, used to impute
# missing fares. Select the 'Fare' column before averaging: DataFrame.mean()
# on the whole frame also tries to average the text columns (Name, Sex,
# Ticket, Embarked) and raises TypeError on pandas >= 2.0.
Pclass1_fmean = evaluate.loc[evaluate['Pclass'] == 1, 'Fare'].mean()
Pclass1_fmean

In [None]:
# Mean fare of second-class passengers in the evaluation set, used to impute
# missing fares. Select the 'Fare' column before averaging: DataFrame.mean()
# on the whole frame also tries to average the text columns (Name, Sex,
# Ticket, Embarked) and raises TypeError on pandas >= 2.0.
Pclass2_fmean = evaluate.loc[evaluate['Pclass'] == 2, 'Fare'].mean()
Pclass2_fmean

In [None]:
# Mean fare of third-class passengers in the evaluation set, used to impute
# missing fares. Select the 'Fare' column before averaging: DataFrame.mean()
# on the whole frame also tries to average the text columns (Name, Sex,
# Ticket, Embarked) and raises TypeError on pandas >= 2.0.
Pclass3_fmean = evaluate.loc[evaluate['Pclass'] == 3, 'Fare'].mean()
Pclass3_fmean

In [None]:
def impute_fare(cols):
    """Return a passenger's fare, filling a missing value with the class mean.

    Intended for row-wise use, e.g.
    ``df[['Fare', 'Pclass']].apply(impute_fare, axis=1)``.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Fare, Pclass).

    Returns
    -------
    The original fare when it is present. Otherwise ``Pclass1_fmean`` for
    class 1, ``Pclass2_fmean`` for class 2 and ``Pclass3_fmean`` for any
    other class. These notebook-level means are only looked up when a fare
    is missing, so they must be defined before such a row is processed.
    """
    # Read the two values by position explicitly. Plain ``cols[0]`` on a
    # label-indexed row relies on an integer-key fallback that pandas has
    # deprecated (FutureWarning since 2.1) in favour of label lookup.
    # Inputs without ``.iloc`` (list, tuple, ndarray) are indexed directly.
    values = cols.iloc if hasattr(cols, 'iloc') else cols
    fare, pclass = values[0], values[1]

    if not pd.isnull(fare):
        return fare

    if pclass == 1:
        return Pclass1_fmean
    if pclass == 2:
        return Pclass2_fmean
    return Pclass3_fmean

In [None]:
evaluate['Fare'] = evaluate[['Fare','Pclass']].apply(impute_fare,axis=1)

In [None]:
sns.heatmap(evaluate.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
df_result=pd.DataFrame(evaluate['PassengerId'],columns=['PassengerId'])
df_result.head()

In [None]:
sex = pd.get_dummies(evaluate['Sex'],drop_first=True)
embark = pd.get_dummies(evaluate['Embarked'],drop_first=True)

In [None]:
evaluate.drop(['PassengerId','Sex','Embarked','Name','Ticket'],axis=1,inplace=True)

In [None]:
evaluate = pd.concat([evaluate,sex,embark],axis=1)

In [None]:
evaluate.head()

In [None]:
# Pass the features by name, in the column order the model was fitted on.
# If the evaluation set is missing a training column (for example a dummy
# column for a category that does not occur in it), this fails with a
# KeyError instead of silently feeding misaligned features to the model.
evaluation_result = logmodel.predict(evaluate[X_train.columns])

In [None]:
evaluation_result

In [None]:
df_result['Survived'] = pd.Series(evaluation_result, index=df_result.index)
df_result.head()

In [None]:
gender_submission = pd.read_csv('../input/gender_submission.csv')
gender_submission.head()

In [None]:
# Compare the model's predictions with the 'Survived' column of
# gender_submission.csv.
# NOTE(review): gender_submission.csv is presumably Kaggle's sample submission
# (a gender-based baseline), not the true test labels -- confirm. If so, this
# report measures agreement with that baseline and is not an accuracy
# estimate, which would explain the gap to the real leaderboard score.
print (classification_report(gender_submission['Survived'] , df_result['Survived']))

In [None]:
# output data for submission in Kaggle
# df_result.to_csv('result.csv',index=False)