In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as py
import plotly.express as go

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

In [None]:
train.head()

Oh look it's the Titanic Dataset!

# Data exploration

In [None]:
train.info()

Except this time there are 100k rows that have been generated. Hopefully this means that people can't cheat their way into getting an accuracy of 1 on the test set.

Right off the bat we can see that Age, Ticket, Fare, Cabin, and Embarked have missing values. Seems like this dataset follows the patterns of the original pretty closely. 

In [None]:
# Missing values per column: absolute count and share of all rows.
# The share is computed from len(train) and scaled to 0-100 so that the
# 'Percent' column really is a percentage rather than a fraction of a
# hard-coded row count.
is_na = pd.DataFrame(train.isna().sum(), columns=['Number'])
is_na['Percent'] = 100 * is_na['Number'] / len(train)
is_na = is_na.sort_values('Percent', ascending=False)
is_na

The column with the most missing values is Cabin (67.86% of data points missing). In the original dataset this was a good column for analysis because there were only about 1,000 data points; we will see later on if it's the same here.

In [None]:
print("The number of unique values in the column Cabin are:", train['Cabin'].nunique())

In [None]:
train['Cabin']

It seems like the Cabin column has a letter followed by numbers. While the numbers seem useless, the letter could be important. We could try isolating the letters and seeing what we find.

In [None]:
# Keep only the letters of each Cabin value. regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave the column unchanged.
train['Cabin_alpha'] = train['Cabin'].str.replace('[^a-zA-Z]', '', regex=True)
test['Cabin_alpha'] = test['Cabin'].str.replace('[^a-zA-Z]', '', regex=True)

In [None]:
train['Cabin_alpha'].value_counts()

In [None]:
train['Cabin_alpha'].isna().sum()

In [None]:
# Label passengers without a cabin value as 'NA'. The result is assigned back
# to the column: calling fillna(inplace=True) on a column selection is a
# chained assignment that does not update the frame under pandas Copy-on-Write.
train['Cabin_alpha'] = train['Cabin_alpha'].fillna('NA')
test['Cabin_alpha'] = test['Cabin_alpha'].fillna('NA')

In [None]:
train.groupby('Cabin_alpha')['Pclass'].value_counts()

In [None]:
train.groupby('Cabin_alpha')['Survived'].mean().sort_values()

In [None]:
go.bar(train.groupby('Cabin_alpha')['Survived'].mean().sort_values())

Seems like the updated Cabin column would be a good feature since the different cabin values seem to have variable survival rates.

In [None]:
print("The number of unique values in the column Ticket are:", train['Ticket'].nunique())

The ticket column is similar to the Cabin column, it has letters followed by numbers, or just numbers. Let's try to break this down.

In [None]:
# Keep only the letters of each Ticket value. regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default.
train['Ticket_alpha'] = train['Ticket'].str.replace('[^a-zA-Z]', '', regex=True)
test['Ticket_alpha'] = test['Ticket'].str.replace('[^a-zA-Z]', '', regex=True)

In [None]:
# Keep only the digits of each Ticket value. regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# leave letters in the column and break the later integer conversion.
train['Ticket_num'] = train['Ticket'].str.replace('[^0-9]', '', regex=True)
test['Ticket_num'] = test['Ticket'].str.replace('[^0-9]', '', regex=True)

In [None]:
# Tickets that are missing, or that contained no letters (empty string after
# stripping), both get the placeholder 'NA'. Results are assigned back instead
# of using inplace=True on a column selection, which is a chained assignment
# that does not update the frame under pandas Copy-on-Write.
train['Ticket_alpha'] = train['Ticket_alpha'].fillna('NA').replace({'': 'NA'})
test['Ticket_alpha'] = test['Ticket_alpha'].fillna('NA').replace({'': 'NA'})

In [None]:
# Tickets that are missing, or that contained no digits (empty string after
# stripping), both get the placeholder '0' so the column can be cast to int.
# Results are assigned back instead of using inplace=True on a column
# selection, which does not update the frame under pandas Copy-on-Write.
train['Ticket_num'] = train['Ticket_num'].fillna('0').replace({'': '0'})
test['Ticket_num'] = test['Ticket_num'].fillna('0').replace({'': '0'})

In [None]:
train.head()

In [None]:
# Convert the digit strings of the ticket number to integers in both frames.
for frame in (train, test):
    frame['Ticket_num'] = frame['Ticket_num'].astype(int)

In [None]:
train[['Ticket_alpha','Ticket_num']].dtypes

Now our Ticket and Cabin columns are sorted. We can drop the original columns.

In [None]:
# The raw Cabin and Ticket columns are superseded by the derived columns.
for frame in (train, test):
    frame.drop(columns=['Cabin', 'Ticket'], inplace=True)

# EDA

In [None]:
# One row per (Pclass, Cabin_alpha) pair with the number of passengers in it.
x = (
    train.groupby('Pclass')['Cabin_alpha']
    .value_counts()
    .rename('Counts')
    .reset_index()
)

In [None]:
# Share (in percent) of each cabin letter within its passenger class.
# The values are assigned positionally, so they rely on `x` having been built
# from the same groupby/value_counts in the same row order.
cabin_counts = train.groupby('Pclass')['Cabin_alpha'].value_counts()
class_totals = cabin_counts.groupby(level=0).transform('sum')
x['Percentage'] = (100 * cabin_counts / class_totals).values

In [None]:
go.bar(x,x='Pclass',y='Percentage',color='Cabin_alpha')

The NA class dominates the 2nd and 3rd Passenger Class. This could mean that people in those classes did not get a Cabin and there's no available record for them.

In [None]:
# Bar chart of how many passengers travelled in each class.
class_counts = train['Pclass'].value_counts()
fig = go.bar(class_counts)
fig.update_layout(
    title='Number of passengers in each Class',
    xaxis_title='Pclass',
    yaxis_title='Number of passengers',
)
fig.show()

There isn't too much disparity within the counts of the classes, which is good because there will not be an imbalance in the data.

In [None]:
# Count passengers per (Pclass, Sex) and express each count as a percentage
# of its class. Pclass is made categorical before it is used for plotting.
sex_counts = train.groupby('Pclass')['Sex'].value_counts()
y = sex_counts.rename('Counts').reset_index()
class_sizes = sex_counts.groupby(level=0).transform('sum')
y['Percentage'] = (100 * sex_counts / class_sizes).values
y['Pclass'] = y['Pclass'].astype('category')

In [None]:
go.bar(y,x='Pclass',y='Percentage',color='Sex')

Classes 1 and 2 have the same male-female ratio but Class 3 has significantly more males.

In [None]:
z = pd.DataFrame(train.groupby(['Pclass','Sex'])['Survived'].mean()).reset_index()

In [None]:
go.bar(z,x='Pclass',y='Survived',color='Sex',barmode='group')

* In all the classes, it is very clear that a significant amount of women survived, as opposed to men. On average, over 70% of the females survived, whereas only 20% of the males survived.
* Class also plays an important role as more men from Class 1 survive as opposed to Classes 2 and 3.

In [None]:
sns.boxplot(x=train['Survived'],y=train['Age'])

The continuous feature Age does not differ too much between the two groups in the plot above. The median age of people who survived is higher, which indicates that older people probably had a higher survival rate.

To try to dive into this further, I will create age bands with 3 labels: Child, Adult, and Old. Children are aged 0-15 (inclusive), Adults are older than 15 and up to 50, and Old is any age above 50.

In [None]:
# Bucket the known ages into three bands and plot the survival rate per band.
# Rows with a missing Age, Survived or Sex value are excluded.
bins = [0, 15, 50, 200]
labels = ['Child', 'Adult', 'Old']
age = train[['Age', 'Survived', 'Sex']].dropna()
age['age_band'] = pd.cut(age['Age'], bins=bins, labels=labels, include_lowest=True)
survival_by_band = age.groupby(['age_band'])['Survived'].mean()
go.bar(survival_by_band)


Children and Old people have a higher chance of survival as compared to adults, which makes sense.

In [None]:
# Show the frequency of every category in the three categorical columns.
separator = '-' * 50
for i in ('Embarked', 'Sex', 'Pclass'):
    counts = train[i].value_counts()
    print('Value counts for column', i, 'are:')
    print(counts)
    print(separator)

The categorical columns have a good number of values for each unique item, which will be good when a model is being made

In [None]:
train['Name'].head(10)

The original Titanic dataset had titles (Mr,Mrs,Officer etc) attached with the names. Since this dataset doesn't, there isn't any insight to be gained other than gender, so this column is useless.

In [None]:
train.drop(columns = 'Name',inplace=True)

In [None]:
train.head()

# Missing Values

In [None]:
# Missing values per column: absolute count and share of all rows.
# The share is computed from len(train) and scaled to 0-100 so that the
# 'Percent' column really is a percentage rather than a fraction of a
# hard-coded row count.
is_na = pd.DataFrame(train.isna().sum(), columns=['Number'])
is_na['Percent'] = 100 * is_na['Number'] / len(train)
is_na = is_na.sort_values('Percent', ascending=False)
is_na

In [None]:
train['Age'].hist()

In [None]:
# Summary statistics of Age. The first label previously ended in ';' instead of ':'.
print('Age mean:', train['Age'].mean())
print('Age median:', train['Age'].median())
print('Age skew:', train['Age'].skew())

In [None]:
# Mean and median Age for survivors vs. non-survivors.
age_by_survival = train.groupby('Survived')['Age']
print(age_by_survival.mean())
print('-' * 50)
print(age_by_survival.median())

While I feel like this would be an interesting way to approach this problem, it doesn't make sense because we cannot fill in the test values based on survival. So I will look at other methods.

In [None]:
# Mean and median Age within each passenger class.
age_by_class = train.groupby('Pclass')['Age']
print(age_by_class.mean())
print('-' * 50)
print(age_by_class.median())

In [None]:
# Fill each missing Age with the mean Age of the passenger's class.
# groupby().transform('mean') broadcasts the per-class mean (NaNs are ignored
# when averaging) back onto every row, so no per-class index bookkeeping or
# hard-coded list of class values is needed.
# Each frame is imputed from its own class means, as before.
for df in [train, test]:
    class_mean_age = df.groupby('Pclass')['Age'].transform('mean')
    df['Age'] = df['Age'].fillna(class_mean_age)

The above code fills each missing Age with the mean Age of the passenger's class (1/2/3), computed separately for the train and test sets.

In [None]:
train['Age'].skew()

Since we are imputing the means, the skew remains about the same

In [None]:
# Fill missing Embarked values with the most frequent port of the same frame.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is a chained assignment that does not update the frame under
# pandas Copy-on-Write.
for df in [train, test]:
    most_common_port = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(most_common_port)

Filling Embarked column with the mode, which is port S

In [None]:
# Histogram of Fare together with its mean and median.
fare = train['Fare']
fare.hist()
print('Mean:', fare.mean())
print('Median:', fare.median())

The Fare column is highly skewed to the right

In [None]:
train.groupby('Pclass')['Fare'].mean()

In [None]:
train.groupby('Pclass')['Fare'].median()

I will impute the missing values the same way I did with Age, except using the median of each class instead of the mean, since Fare is highly skewed.

In [None]:
# Fill each missing Fare with the median Fare of the passenger's class.
# groupby().transform('median') broadcasts the per-class median (NaNs are
# ignored) back onto every row, so no per-class index bookkeeping or
# hard-coded list of class values is needed.
# Each frame is imputed from its own class medians, as before.
for df in [train, test]:
    class_median_fare = df.groupby('Pclass')['Fare'].transform('median')
    df['Fare'] = df['Fare'].fillna(class_median_fare)

In [None]:
# Missing values per column after imputation: absolute count and share of all
# rows. The share is computed from len(train) and scaled to 0-100 so that the
# 'Percent' column really is a percentage rather than a fraction of a
# hard-coded row count.
is_na = pd.DataFrame(train.isna().sum(), columns=['Number'])
is_na['Percent'] = 100 * is_na['Number'] / len(train)
is_na = is_na.sort_values('Percent', ascending=False)
is_na

# Feature Engineering

In [None]:
train.head()

In [None]:
train.head()

In [None]:
train.groupby('Cabin_alpha')['Fare'].describe()

In [None]:
# Correlation heatmap of the numeric columns. The frame still contains string
# columns at this point; pandas >= 2.0 no longer drops them silently in corr()
# and raises instead, so the numeric columns are selected explicitly.
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True, cmap='crest')

From the correlation table above:- 
* Survived has a negative relationship with Pclass, which we saw earlier in the bar plots
* Survived has a positive relationship with Fare, which also makes sense since people in higher classes pay more
* Sibling/Parent columns by themselves have very little correlation with survival so maybe this can be looked into further

In [None]:
# Family size on board = siblings/spouses + parents/children.
for frame in (train, test):
    frame['Family_members'] = frame['SibSp'] + frame['Parch']

In [None]:
# Alone is 1 for passengers with no family members on board, otherwise 0.
for frame in (train, test):
    frame['Alone'] = (frame['Family_members'] == 0).astype(int)
    

In [None]:
# Correlation heatmap including the new family features. String columns are
# excluded explicitly because pandas >= 2.0 raises on them in corr() instead
# of dropping them silently.
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True, cmap='crest')

In [None]:
train.columns

In [None]:
# Bar chart of each numeric feature's correlation with Survived.
# String columns are excluded explicitly because pandas >= 2.0 raises on them
# in corr() instead of dropping them silently. The title typo is fixed too.
numeric_corr = train.drop(columns=['PassengerId']).select_dtypes(include='number').corr()
fig = go.bar(numeric_corr.loc['Survived', 'Pclass':'Alone'])
fig.update_layout(
    title='Correlation of features with Survival',
    xaxis_title='Features',
    yaxis_title='Correlation value',
)
fig.show()

In [None]:
# Skewness of the numeric columns. String columns are excluded explicitly
# because pandas >= 2.0 raises on them in skew() instead of skipping them.
train.select_dtypes(include='number').skew()

There are only 2 continuous columns, Age and Fare. Fare is highly skewed while Age is already fairly symmetric. Thus, I will take the log of the Fare column to make it more symmetrical. 

In [None]:
# Replace Fare by its natural logarithm in both frames.
# NOTE(review): re-running this cell applies the log a second time.
for frame in (train, test):
    frame['Fare'] = np.log(frame['Fare'])

In [None]:
train['Fare'].skew()

Problem sort of solved!

In [None]:
# Columns present in only one of the two frames.
train_cols = set(train.columns)
test_cols = set(test.columns)
print(train_cols - test_cols)
print(test_cols - train_cols)

* This piece of code tells us that in the train df, Survived is a column that isn't there in the test df.
* Similarly, test df has the Name column which is not in the train df

In [None]:
test.drop(columns='Name',inplace=True)

In [None]:
train.groupby('Cabin_alpha')['Fare'].describe().T

In [None]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [None]:
train[train.dtypes[train.dtypes == object].index].nunique()

In [None]:
# Integer-encode the two letter columns.
# The encoder is fitted on the labels of train and test together:
# LabelEncoder.transform raises on labels it has not seen, so fitting on train
# alone fails as soon as test contains a letter combination absent from train.
# When every test label also occurs in train, the codes are unchanged.
for i in ['Cabin_alpha', 'Ticket_alpha']:
    lb.fit(pd.concat([train[i], test[i]]))
    train[i] = lb.transform(train[i])
    test[i] = lb.transform(test[i])

In [None]:
train.head()

In [None]:
# One-hot encode the remaining string columns.
# The two frames are encoded independently, so test is then aligned to train's
# feature columns (everything except the target): a dummy column missing from
# test is added as all zeros, one that exists only in test is dropped, and the
# column order matches train.
train = pd.get_dummies(train)
feature_columns = [col for col in train.columns if col != 'Survived']
test = pd.get_dummies(test).reindex(columns=feature_columns, fill_value=0)

In [None]:
# Drop the Sex_male dummy column from both frames.
for frame in (train, test):
    frame.drop(columns=['Sex_male'], inplace=True)

In [None]:
import statsmodels.api as sm

In [None]:
def forward_selection(data, target, significance_level=0.05):
    """Greedy forward feature selection based on OLS p-values.

    Starting from an empty model, each round fits one OLS regression (with an
    intercept) per remaining column, adding that column to the features
    selected so far. The column with the smallest p-value is kept if that
    p-value is below ``significance_level``; otherwise selection stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate feature columns. Must be castable to float.
    target : array-like
        Response variable passed to ``sm.OLS``.
    significance_level : float, default 0.05
        A candidate is only accepted if its p-value is strictly below this.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    # OLS needs a purely numeric design matrix; boolean dummy columns mixed
    # with floats would otherwise end up as an object array.
    data = data.astype(float)
    candidates = data.columns.tolist()
    best_features = []
    while len(best_features) < len(candidates):
        # Keep the original column order so ties are broken deterministically
        # (set difference ordering varies between runs).
        remaining_features = [col for col in candidates if col not in best_features]
        # An explicit float dtype is required: without it the empty Series is
        # object-typed on pandas >= 2.0 and idxmin() is not supported.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            design = sm.add_constant(data[best_features + [new_column]])
            model = sm.OLS(target, design).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

In [None]:
features = forward_selection(train.drop(columns=['PassengerId','Survived']),train['Survived']) 

In [None]:
features

In [None]:
train.head()

In [None]:
id_col = test['PassengerId']

In [None]:
# PassengerId is an identifier, not a feature; remove it from both frames.
train.drop(columns='PassengerId', inplace=True)
test.drop(columns='PassengerId', inplace=True)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression(max_iter=1000)

In [None]:
rfc = RandomForestClassifier(max_depth=10, min_samples_leaf=10, min_samples_split=100)

In [None]:
# Separate the target (Survived) from the feature matrix.
y = train['Survived']
X = train.drop(columns='Survived')

In [None]:
X.head()

In [None]:
 # Fit the logistic regression on all features except the three listed below.
 excluded_features = ['Ticket_alpha', 'Ticket_num', 'Family_members']
 lr.fit(X.drop(columns=excluded_features), y)

In [None]:
cv_means = cross_val_score(lr,X.drop(columns=['Ticket_alpha','Ticket_num','Family_members']),y)

In [None]:
np.mean(cv_means)

In [None]:
# Build the submission: one prediction per test passenger, indexed by id.
# The id header is spelled 'PassengerId' to match the dataset's own column
# name (it was previously written as 'PassengerID').
model_inputs = test.drop(columns=['Ticket_alpha', 'Ticket_num', 'Family_members'])
submission = pd.DataFrame({'PassengerId': id_col, 'Survived': lr.predict(model_inputs)})
submission = submission.set_index('PassengerId')

In [None]:
submission.to_csv('submission.csv')

This scored 0.79 on the leaderboards, which is good for a simple model!