In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Titanic training set from the notebook's relative input directory.
df = pd.read_csv(filepath_or_buffer='../input/titanic/titanic_train.csv')

In [None]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

# Exploratory Data Analysis 

Check out the missing data

### Missing Data 

In [None]:
# Fraction of missing values per column. This is far easier to read than the
# full row-by-row boolean mask, which is visualised in the heatmap instead.
df.isnull().mean()

In [None]:
# Heatmap of the null mask: each highlighted cell is a missing value.
sns.heatmap(data=df.isna(), yticklabels=False)

Roughly 20 percent of the Age data is missing, a proportion likely small enough to fill in reasonably with some form of imputation. The Cabin column, by contrast, is missing too much data to be useful at a basic level. We'll probably drop it later, or convert it into another feature such as "Cabin Known: 1 or 0".

In [None]:
# Use seaborn's white-grid theme, then count passengers per survival outcome.
sns.set_style(style='whitegrid')
sns.countplot(data=df, x='Survived')

In [None]:
# Survival counts split by passenger sex.
sns.countplot(data=df, x='Survived', hue='Sex', palette='RdBu_r')

In [None]:
# Survival counts split by ticket class.
sns.countplot(data=df, x='Survived', hue='Pclass', palette='rainbow')

In [None]:
# Histogram of the known ages. `histplot` replaces the deprecated `distplot`;
# the automatic bin choice may differ slightly from the old function's.
sns.histplot(x=df['Age'].dropna(), kde=False)

In [None]:
# Same distribution drawn with pandas' own histogram helper, semi-transparent.
age_column = df['Age']
age_column.hist(alpha=0.3)

In [None]:
# Number of passengers for each siblings/spouses-aboard count.
sns.countplot(data=df, x='SibSp', palette='rainbow')

In [None]:
# Fare distribution in 40 bins on a wide, short canvas.
fare_column = df['Fare']
fare_column.hist(figsize=(8, 4), bins=40)

In [None]:

# NOTE(review): cufflinks is a third-party package imported mid-notebook;
# presumably it is what provides the `.iplot` method used below — confirm it
# is installed in the target environment.
import cufflinks as cf
# presumably switches cufflinks to offline plotting — TODO confirm against its docs
cf.go_offline()
# Histogram of the Fare column with 30 bins, drawn in green.
df['Fare'].iplot(kind='hist',bins=30,color='green')

### Data Cleaning

We want to fill in the missing age data instead of just dropping the rows where age is missing. One way to do this is to fill in the mean age of all the passengers (imputation). However, we can be smarter about this and check the typical age by passenger class. For example:

In [None]:
# Large canvas for the per-class age box plot.
plt.figure(figsize=(12, 7))
sns.boxplot(data=df, x='Pclass', y='Age', palette='winter')

We can see that the wealthier passengers in the higher classes tend to be older, which makes sense. We'll use the typical (median) age of each class, read off the box plot (about 37, 29 and 24 for classes 1, 2 and 3), to impute missing Age values based on Pclass.

In [None]:
def impute_age(cols):
    """Return a passenger's age, substituting a class-based value when missing.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Holds the age first and the passenger class second, e.g. one row of
        ``df[['Age', 'Pclass']]``.

    Returns
    -------
    The original age when it is not null. Otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class value (the per-class ages taken from
    the Age-by-Pclass box plot in this notebook).
    """
    # Read the two values by position through iteration instead of
    # cols[0] / cols[1]: integer keys on a label-indexed Series are deprecated
    # as positional lookups in recent pandas.
    Age, Pclass = tuple(cols)[:2]

    if not pd.isnull(Age):
        return Age

    # Class 3 and anything unexpected fall through to the default of 24.
    return {1: 37, 2: 29}.get(Pclass, 24)

In [None]:
# Fill each missing Age row by row, using that row's passenger class.
age_and_class = df[['Age', 'Pclass']]
df['Age'] = age_and_class.apply(impute_age, axis='columns')

In [None]:
# Re-draw the null mask to confirm that Age no longer has gaps.
sns.heatmap(df.isna(), yticklabels=False)

Drop cabin column as most of the data is missing

In [None]:
# Remove the mostly-empty Cabin column from the frame in place.
df.drop(columns='Cabin', inplace=True)

In [None]:
# Null mask once more, now without the Cabin column.
sns.heatmap(data=df.isna(), yticklabels=False)

In [None]:
# First five rows after the Cabin column has been removed.
df.head(n=5)

In [None]:
# Count the nulls still present in each column.
df.isnull().sum()

In [None]:
# Discard, in place, every row that still contains at least one null.
df.dropna(axis=0, how='any', inplace=True)

### Converting Categorical Features 

We'll need to convert categorical features to dummy variables using pandas! Otherwise our machine learning algorithm won't be able to directly take in those features as inputs.

In [None]:
# Summarise columns, non-null counts and dtypes to spot the columns that are
# still non-numeric and need encoding.
df.info()

In [None]:
# One-hot encode Sex and Embarked, dropping the first level of each so the
# resulting indicator columns are not redundant.
sex, embark = (
    pd.get_dummies(df[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)

In [None]:
# Swap the raw categorical/text columns for their dummy-encoded versions.
# Without this concat the `sex` and `embark` indicator frames are never
# attached, and the model would be trained without Sex and Embarked.
df = pd.concat(
    [
        df.drop(['Sex', 'Embarked', 'PassengerId', 'Name', 'Ticket'], axis=1),
        sex,
        embark,
    ],
    axis=1,
)

In [None]:
# First five rows of the frame that will feed the model.
df.head(n=5)

## Building a Logistic Regression model

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the Survived target, then hold out 30% of the
# rows for testing with a fixed seed.
features = df.drop('Survived', axis=1)
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=101,
)

### Training and Predicting 

In [None]:
from sklearn.linear_model import LogisticRegression

# The features are unscaled (e.g. Age, Fare), so the solver may not converge
# within scikit-learn's default iteration budget; allow more iterations.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)

In [None]:
# Predicted survival label for every held-out passenger.
predictions = logmodel.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Keep the confusion matrix under its own name: it is a 2-D table of counts,
# not an accuracy value, and `accuracy` is assigned the actual score later.
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out passengers whose label was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

### Evaluation 

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out set.
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)