In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

**Part 1 - Data Cleaning and Visualization**

In [None]:
# Load train.csv from the relative input directory into a DataFrame
data = pd.read_csv(filepath_or_buffer='../input/train.csv')

In [None]:
# Preview the first five rows (the default row count for head)
data.head(n=5)

In [None]:
# Summary statistics for the frame (pandas describes numeric columns by default)
data.describe()

In [None]:
# Column dtypes and non-null counts, a first look at where values are missing
data.info()

In [None]:
# Count missing values per column across the whole frame; looking at only the
# first 20 rows of the boolean mask cannot show how many nulls each column has
data.isnull().sum()

#Age has a few Nulls and Cabin has a lot of Nulls

In [None]:
# Draw the null mask as a heatmap so the missing cells stand out column by column
sns.heatmap(data=data.isnull(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Use seaborn's 'whitegrid' style for subsequent plots
sns.set_style(style='whitegrid')

In [None]:
# Bar chart of row counts for each value of Survived
sns.countplot(data=data, x='Survived')

In [None]:
# Survived counts, split by Sex
sns.countplot(data=data, x='Survived', hue='Sex', palette='PRGn')

In [None]:
# Survived counts, split by Pclass
sns.countplot(data=data, x='Survived', hue='Pclass')

In [None]:
# Histogram of the known ages (nulls dropped), 30 bins, no KDE overlay.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot is
# the suggested replacement -- confirm the installed seaborn version before switching.
sns.distplot(a=data['Age'].dropna(), bins=30, kde=False)

In [None]:
# Row counts for each value of SibSp
sns.countplot(data=data, x='SibSp')

In [None]:
# Histogram of Fare with 40 bins on a wide, short canvas
data['Fare'].hist(figsize=(10, 4), bins=40)

In [None]:
# Age distribution within each Pclass, drawn on a 10x10 figure
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x='Pclass', y='Age')

In [None]:
def impute_age(cols):
    """Return the age for one row, substituting a per-class value when it is missing.

    Parameters
    ----------
    cols : sequence
        A row whose first element is the age and whose second element is the
        passenger class, e.g. one row of ``data[['Age', 'Pclass']]`` as passed
        by ``DataFrame.apply(..., axis=1)``. Lists and tuples work too.

    Returns
    -------
    The original age when it is not null; otherwise 38 for class 1, 29 for
    class 2 and 24 for any other class.
    """
    # Unpack by position rather than indexing with cols[0] / cols[1]: integer
    # keys on a Series labelled 'Age'/'Pclass' rely on a positional fallback
    # that pandas has deprecated. Iteration yields the values in order for a
    # Series, list or tuple alike; *_ tolerates extra trailing elements.
    age, pclass, *_ = cols

    if not pd.isnull(age):
        return age

    # Fixed fill-in ages per class.
    # NOTE(review): presumably read off the Age-by-Pclass boxplot -- confirm.
    if pclass == 1:
        return 38
    if pclass == 2:
        return 29
    return 24

In [None]:
# Fill the missing ages row by row using each row's (Age, Pclass) pair
data['Age'] = data.loc[:, ['Age', 'Pclass']].apply(impute_age, axis='columns')

In [None]:
# Redraw the null-mask heatmap to see what is still missing after the Age fill
sns.heatmap(data=data.isnull(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Cabin has too many missing values to be useful, so remove the column entirely
data.drop(columns=['Cabin'], inplace=True)

In [None]:
# Remove any rows that still contain at least one missing value
data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Convert the categorical columns into dummy indicator columns, dropping the
# first level of each so the remaining indicators are not redundant
sex, embark = (
    pd.get_dummies(data[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)

In [None]:
# Append the indicator columns to the right of the existing frame
data = pd.concat(objs=[data, sex, embark], axis='columns')

In [None]:
# Drop all non-numerical columns
data.drop(columns=['Name', 'Sex', 'Ticket', 'Embarked'], inplace=True)

In [None]:
# PassengerId is essentially the index of the DataFrame, so remove it
data.drop(columns=['PassengerId'], inplace=True)

Now the DataFrame is ready to plug into the machine learning algorithm. We removed the following:
1. Categorical columns
2. Multicollinearity introduced by converting categorical variables into dummy indicator variables (1s and 0s) — handled by dropping the first dummy level of each.
3. Numerical columns that don't add value (e.g. IDs that are essentially the index)

**Also notice, Pclass is a categorical column (1,2,3)... we could also convert that into two 1/0 columns.**

In [None]:
# Preview the first five rows of the cleaned frame
data.head(n=5)

**Part 2 - Machine Learning: Logistic Regression with sklearn**

In [None]:
# Separate the target column from the feature columns
Y = data['Survived']
X = data.drop(columns=['Survived'])

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with the library's default hyperparameters
logmodel = LogisticRegression()

In [None]:
# Fit the classifier on the training portion
logmodel.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out rows
predictions = logmodel.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Text report comparing the held-out labels with the model's predictions
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns)
print(confusion_matrix(y_true=y_test, y_pred=predictions))

In [None]:
# Wrap the predictions in a frame and tag each row with where it came from
pred = pd.DataFrame({'Survived': predictions}).assign(model='Logistic Reg')

In [None]:
# Put the true labels in a frame with a fresh 0..n-1 index (the old index is
# discarded) and tag each row with where it came from
y_test_df = pd.DataFrame(y_test).reset_index(drop=True)
y_test_df['model'] = 'Test Data'

In [None]:
# Stack predictions on top of the true labels. DataFrame.append was removed in
# pandas 2.0; pd.concat performs the same row-wise stacking and keeps each
# frame's own index, exactly as append did.
pred_test = pd.concat([pred, y_test_df])
pred_test.tail()

In [None]:
# Side-by-side Survived counts for the model's predictions and the test labels
plt.figure(figsize=(10, 8))
sns.countplot(data=pred_test, x='Survived', hue='model')