In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# 1. Let us first load the data and check what it contains

In [1]:
# Locations of the Kaggle Titanic competition files.
train_path = '/kaggle/input/titanic/train.csv'
test_path = '/kaggle/input/titanic/test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Keep an independent copy of the raw test set: PassengerId is dropped from
# `test` further down but is still needed when writing the submission file.
test_full = test.copy(deep=True)

train.head()

Check for missing data

In [1]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
train.info()

Let us drop the column 'Cabin' since it contains a lot of missing data. Name, Ticket and PassengerId are essentially identifiers for each passenger rather than predictive features, so we drop these as well.

In [1]:
# Columns removed from both splits: Cabin is mostly missing, the rest are
# per-passenger identifiers.
unused_columns = ['Cabin', 'Name', 'Ticket', 'PassengerId']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

# Re-check dtypes and non-null counts after the drop.
train.info()

In [1]:
# Print per-column dtypes and non-null counts for the test split as well.
test.info()

Let us fill the null values of Age with the median and the null values of Embarked with the mode. For Fare in the test data, we also fill the null values with the median.

In [1]:
# Impute missing values by assigning the filled column back to the frame.
# Calling fillna(inplace=True) on an attribute-accessed column (train.Age...)
# is chained assignment: it is deprecated in pandas 2.x and does not modify
# the parent frame once copy-on-write is enabled.

# Age: median of the respective split.
train['Age'] = train['Age'].fillna(train['Age'].median())
test['Age'] = test['Age'].fillna(test['Age'].median())

# Fare: median *fare* of the test split (the median age was used here before,
# which imputed a value on the wrong scale).
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Embarked: most frequent port; mode() returns a Series, so take its first entry.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].mode()[0])

Then, let us convert the categorical values into numbers. We use LabelEncoder for this.
We have two features to transform: Sex and Embarked

In [1]:
# Encode each categorical column with an encoder fitted on the training data
# only, then reuse that same mapping for the test data. Re-fitting on the test
# split could assign different integers to the same category whenever the two
# splits do not contain exactly the same set of values.
for column in ('Sex', 'Embarked'):
    labeler = LabelEncoder()
    train[column] = labeler.fit_transform(train[column])
    # transform() raises ValueError on a category unseen during fit, which
    # surfaces a train/test mismatch instead of silently mis-encoding it.
    test[column] = labeler.transform(test[column])

Separate the independent and dependent variables.

In [1]:
# Split the training frame into the feature matrix and the label vector.
target_column = 'Survived'
X_train = train.drop(columns=target_column)
y_train = train[target_column]

# 2. Visualizations

In [1]:
# Pairwise correlations between all columns of the training frame.
corr = train.corr()
# Only the strength of each relationship is plotted, not its sign.
abs_corr = corr.abs()

plt.figure(figsize=(10, 6))
sns.heatmap(abs_corr, annot=True, cmap='Blues')

### Takeaway: Sex and Pclass have the strongest correlation with Survival.

I am planning to use Logistic Regression for this problem, so let us check whether multicollinearity exists among the features using the Variance Inflation Factor (VIF).

In [1]:
# variance_inflation_factor regresses one column on all the others and does
# not add an intercept itself. Without a constant column in the design matrix
# the R^2 it uses is uncentered, which inflates the reported VIFs, so append
# an explicit constant before computing them.
vif_design = X_train.assign(const=1.0).values.astype(float)

# The constant is the last column, so indices 0..n_features-1 still line up
# with X_train.columns; the constant's own VIF is not reported.
vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(vif_design, i) for i in range(X_train.shape[1])
]
vif["features"] = X_train.columns
vif

### Takeaway: Since no feature has a VIF above 10, we can confidently retain all features.

# 3. Model

First we scale the data and run a quick and dirty model.

In [1]:
# Standardize the training features: learn the per-column statistics first,
# then apply them. X_train is rebound to the array returned by the scaler.
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)

In [1]:
# Baseline logistic regression with default settings; fit() returns the
# estimator itself, so construction and fitting can be chained.
lr = linear_model.LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the training data (not a held-out estimate).
lr.score(X_train, y_train)

In [1]:
# The model was fitted on standardized features, so the test features must go
# through the same already-fitted scaler before predicting. Predicting on the
# raw frame feeds the model values on a completely different scale.
X_test = scale.transform(test)
predictions = lr.predict(X_test)

# PassengerId was dropped from `test`, so take it from the untouched copy.
output = pd.DataFrame({'PassengerId': test_full.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)