Titanic Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Read the passenger table that every later cell works on.
data_path = '../input/titanic_data.csv'
titanic_df = pd.read_csv(data_path)

In [3]:
# Column names, dtypes and non-null counts (shows which columns have gaps).
titanic_df.info()

In [4]:
# Peek at the first rows of the raw table.
titanic_df.head()

In [5]:
# Summary statistics for the numeric columns.
titanic_df.describe()

In [6]:
# Summary statistics restricted to the object-dtype columns.
titanic_df.describe(include=['O'])

In [7]:
titanic_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

In [8]:
titanic_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

In [9]:
titanic_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).count()

In [10]:
titanic_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).sum()

In [11]:
sns.countplot(x='Survived',data=titanic_df)

In [12]:
titanic_df['Survived'].value_counts()

In [13]:
titanic_df['Sex'].value_counts()

In [14]:
sns.countplot(x='Survived',data=titanic_df,hue='Sex')

In [15]:
titanic_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

In [16]:
titanic_df['Pclass'].value_counts()

In [17]:
sns.countplot(x='Survived', data=titanic_df, hue='Pclass')

In [18]:
titanic_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

In [19]:
# Histogram of the known ages (missing values dropped first), 30 bins, no KDE curve.
# NOTE(review): seaborn documents distplot as deprecated in favour of histplot/displot;
# confirm the installed seaborn version before relying on it.
sns.distplot(titanic_df['Age'].dropna(), kde=False, bins=30)

In [20]:
# Row count per SibSp value.
sns.countplot(x='SibSp',data=titanic_df)

In [21]:
# Mean of Survived for each SibSp value.
titanic_df[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean()

In [22]:
# Row count per embarkation port.
sns.countplot(x='Embarked',data=titanic_df)

In [23]:
# Mean of Survived for each embarkation port.
titanic_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

In [24]:
# Wide figure so the Fare histogram (30 bins, no KDE curve) is readable.
plt.figure(figsize=(20,7))
sns.distplot(titanic_df['Fare'],kde=False, bins=30)

In [25]:
# Age distribution within each passenger class.
sns.boxplot(x='Pclass',y='Age',data=titanic_df)

In [26]:
# Mean age per passenger class, computed before any imputation.
# NOTE(review): presumably the basis for the fallback ages hard-coded in input_age — confirm.
titanic_df[['Pclass', 'Age']].groupby(['Pclass'], as_index=False).mean()

In [27]:
# CLEANING THE DATA

In [28]:
# Plot the boolean missing-value mask of the whole frame (one heatmap column per
# DataFrame column); row tick labels and the colour bar are switched off.
sns.heatmap(titanic_df.isnull(),yticklabels=False,cbar=False)

In [29]:
def input_age(cols):
    """Return a passenger's age, substituting a class-based default when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order ``(Age, Pclass)``, for example one row of
        ``titanic_df[['Age', 'Pclass']]`` as handed over by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The original age when it is not null; otherwise 38 for class 1, 30 for
    class 2 and 25 for any other class value.
    """
    # A row passed by DataFrame.apply is a Series labelled 'Age'/'Pclass'.
    # Integer keys on such a Series are a deprecated positional fallback in
    # pandas, so positions are read through .iloc when it is available.
    # Plain sequences (list, tuple, ndarray) keep using ordinary indexing.
    values = cols.iloc if hasattr(cols, "iloc") else cols
    age = values[0]
    pclass = values[1]

    if not pd.isnull(age):
        return age

    # Fallback ages per passenger class.
    if pclass == 1:
        return 38
    if pclass == 2:
        return 30
    return 25

In [30]:
titanic_df['Age'] = titanic_df[['Age','Pclass']].apply(input_age,axis=1)

In [31]:
titanic_df[['Pclass', 'Age']].groupby(['Pclass'], as_index=False).mean()

In [32]:
# Fill missing embarkation ports with 'S'.
# NOTE(review): 'S' is presumably the most frequent port — confirm against the counts.
# The result is assigned back to the column. Calling fillna(..., inplace=True) on
# titanic_df['Embarked'] works on an intermediate Series, which pandas warns about
# and which does not update titanic_df when Copy-on-Write is active.
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

In [33]:
# Plot the missing-value mask again to see which columns still have gaps.
missing_mask = titanic_df.isnull()
sns.heatmap(missing_mask, cbar=False, yticklabels=False)

In [34]:
# Create dummies

In [35]:
# Indicator columns for Sex and Embarked; drop_first=True omits the first category
# of each so the remaining indicators are not redundant with one another.
sex = pd.get_dummies(titanic_df['Sex'], drop_first=True)
embark = pd.get_dummies(titanic_df['Embarked'],drop_first=True)

In [36]:
# Append the indicator columns to the frame.
# NOTE(review): this rebinds titanic_df, so running the cell twice appends the
# indicator columns a second time.
titanic_df = pd.concat([titanic_df,sex,embark],axis=1)

In [37]:
# Check that the new indicator columns are present.
titanic_df.head()

In [38]:
#titanic_df.drop(['PassengerId','Sex','Embarked','Fare','Age','Name','Cabin','Ticket'],axis=1,inplace=True)

In [39]:
titanic_df.drop(['Sex','Embarked','Fare','Age','Name','Cabin','Ticket'],axis=1,inplace=True)

In [40]:
titanic_df.head()

In [41]:
type(titanic_df)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(titanic_df.drop('Survived',axis=1), 
                                                    titanic_df['Survived'], test_size=0.30, 
                                                    random_state=101)

In [43]:
X_train.shape, y_train.shape, X_test.shape

In [44]:
# Logistic regression with scikit-learn's default settings.
logmodel = LogisticRegression()

In [45]:
# PassengerId stays in X_train/X_test for the submission file but is removed
# here so the model is not fitted on it.
logmodel.fit(X_train.drop('PassengerId',axis=1), y_train)

In [46]:
# Predicted Survived labels for the held-out rows (same column removal as in fit).
Predictions_log = logmodel.predict(X_test.drop('PassengerId',axis=1))

In [47]:
Predictions_log

In [48]:
# Fitted coefficients, one per feature column used in fit.
logmodel.coef_

In [49]:
# Precision / recall / F1 on the held-out rows.
print(classification_report(y_test,Predictions_log))

In [50]:
# Confusion matrix on the held-out rows (true labels first, predictions second).
print(confusion_matrix(y_test,Predictions_log))

In [51]:
# Pair each logistic-regression coefficient with the feature it belongs to.
# The names come from the same frame layout the model was fitted on
# (X_train without PassengerId), so names and coefficients line up one-to-one.
# Deriving them from titanic_df.columns would keep 'Survived' in the list and
# shift every coefficient onto the wrong feature.
feature_names = X_train.drop('PassengerId', axis=1).columns
coeff_df = pd.DataFrame({
    'Features': feature_names,
    'Coefficient Estimate': logmodel.coef_[0],
})

# preview
coeff_df

In [52]:
# Fitted intercept term of the logistic model.
logmodel.intercept_

In [53]:
# Logistic regression accuracy on the TRAINING rows (X_train / y_train), not on
# the held-out X_test — read it as a fit check, not as a generalisation estimate.
logmodel.score(X_train.drop('PassengerId',axis=1), y_train)

In [54]:
submission_log = pd.DataFrame({
        "PassengerId": X_test['PassengerId'],
        "Survived": Predictions_log
    })

In [55]:
submission_log.head()

In [56]:
submission_log.to_csv('titanic_log.csv',index=False)

In [57]:
#RANDOM FOREST

In [58]:
# 100-tree random forest. random_state is fixed (same value as the train/test
# split) so the fitted forest, its predictions and the scores are reproducible
# across kernel restarts; without it every run builds a different forest.
random_forest = RandomForestClassifier(n_estimators=100, random_state=101)

In [59]:
# Fit on the training rows; PassengerId is removed so it is not used as a feature.
random_forest.fit(X_train.drop('PassengerId',axis=1), y_train)

In [60]:
# Predicted Survived labels for the held-out rows (same column removal as in fit).
Predictions_random_forest = random_forest.predict(X_test.drop('PassengerId',axis=1))

In [None]:
Predictions_random_forest

In [None]:
# One prediction per held-out row.
Predictions_random_forest.shape

In [None]:
# Precision / recall / F1 on the held-out rows.
print(classification_report(y_test,Predictions_random_forest))

In [64]:
# Confusion matrix on the held-out rows (true labels first, predictions second).
print(confusion_matrix(y_test,Predictions_random_forest))

In [65]:
# Accuracy on the TRAINING rows (X_train / y_train), not on the held-out X_test.
random_forest.score(X_train.drop('PassengerId',axis=1), y_train)

In [66]:
submission_random_forest = pd.DataFrame({
        "PassengerId": X_test['PassengerId'],
        "Survived": Predictions_random_forest
    })

In [67]:
submission_random_forest.to_csv('titanic_rf.csv',index=False)