In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test tables, then print each table's
# (rows, columns) shape on its own line.
train_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')
print(train_data.shape, test_data.shape, sep='\n')

In [None]:
# Preview the first rows of the raw training table.
train_data.head()

## Initial Data Cleaning:
Let's do first things first. We store the `PassengerId` values from the test set, since they are needed for the submission file, and drop `PassengerId` from the features. `Survived` is the dependent (target) variable. In this competition, the fact that a value such as age or cabin is missing is itself significant, so we have to replace missing values tactfully in order to keep that information.

In [None]:
# Keep what the submission and the models need before reshaping the frames:
# the test ids (for the submission file) and the training target.
test_ids = test_data['PassengerId'].tolist()
train_survived = train_data['Survived']

# Remove the target from the features and tag every row with its origin so the
# combined table can be split back into train and test later.
train_data = train_data.drop(columns='Survived')
train_data = train_data.assign(split='train')
test_data = test_data.assign(split='test')

# Stack train on top of test (test columns aligned to the train column order)
# with a fresh 0..n-1 index.
aligned_test = test_data.loc[:, train_data.columns]
total_data = pd.concat([train_data, aligned_test], ignore_index=True)

In [None]:
# Count the missing values in each column of the combined train+test table.
total_data.isna().sum()

In [None]:
# Replace missing values with explicit sentinels instead of dropping rows:
# -999 for the numeric columns and 'UNK' for the text columns.
missing_sentinels = {
    'Age': -999,
    'Cabin': 'UNK',
    'Ticket': 'UNK',
    'Embarked': 'UNK',
    'Fare': -999,
}
for column_name, sentinel in missing_sentinels.items():
    total_data[column_name] = total_data[column_name].fillna(sentinel)

In [None]:
# Preview the first rows of the combined table.
total_data.head()

In [None]:
# PassengerId is not used as a feature; remove the column from the combined table.
total_data = total_data.drop(columns=['PassengerId'])

In [None]:
# Show the distinct values of Pclass (the indicator loop in a later cell covers 1, 2 and 3).
total_data.Pclass.unique()

In [None]:
# Show the distinct values of Sex.
total_data.Sex.unique()

In [None]:
# Show every distinct Ticket value as a plain list.
# NOTE(review): this prints the whole list of unique values; presumably long — consider a sample instead.
list(total_data.Ticket.unique())

In [None]:
# Build 0.0/1.0 indicator columns from the raw categorical and ticket columns.
# Every comparison yields a boolean Series; multiplying by 1.0 turns it into floats.

# One indicator per passenger class, sex and embarkation value.
for pclass in range(1, 4):
    total_data['Is_Pclass_' + str(pclass)] = total_data['Pclass'].eq(pclass) * 1.0
for sex in ('male', 'female'):
    total_data['Is_sex_' + sex] = total_data['Sex'].eq(sex) * 1.0
for port in ('S', 'C', 'Q'):
    total_data['Is_class_' + port] = total_data['Embarked'].eq(port) * 1.0

# Flags for values that were missing and replaced by the 'UNK' sentinel.
total_data['Is_ticket_unknown'] = total_data['Ticket'].eq('UNK') * 1.0
total_data['Passenger_class_unknown'] = total_data['Embarked'].eq('UNK') * 1.0
total_data['No_cabin'] = total_data['Cabin'].eq('UNK') * 1.0

# Ticket-text features: all-digit tickets and literal (case-sensitive) substring
# matches. A column with several needles is the sum of the individual matches.
tickets = total_data['Ticket']
total_data['ticket_only_digit'] = tickets.str.isdigit() * 1.0
ticket_markers = [
    ('paris_bound', ['Paris']),
    ('soton_bound', ['SOTON', 'STON']),
    ('PC_in_ticket', ['PC', 'P.C.']),
    ('A_in_ticket', ['A']),
    ('CA_in_ticket', ['CA']),
    ('wc_in_ticket', ['W./C.']),
    ('SC_in_ticket', ['SC']),
]
for column_name, needles in ticket_markers:
    total_data[column_name] = sum(
        tickets.str.contains(needle, regex=False) * 1.0 for needle in needles
    )
#total_data['PP_in_ticket'] = total_data['Ticket'].apply(lambda x: ('P.P.' in x)*1.0)
#total_data['fcc_in_ticket'] = total_data['Ticket'].apply(lambda x: ('F.C.C.' in x)*1.0)
#total_data['soc_in_ticket'] = total_data['Ticket'].apply(lambda x: ('S.O.C.' in x)*1.0)

# Number of relatives travelling with the passenger (siblings/spouses + parents/children).
total_data['family_number'] = total_data['SibSp'] + total_data['Parch']

The interaction features between sex and passenger class are very important, so let's create them.

In [None]:
# Scratch check: a chained comparison multiplied by 1.0 evaluates to a float
# (1.0 here, since 2 < 3 < 4). The same pattern is used for the age buckets below.
# Note that this binds the notebook-level name `x`.
x = 3
((x<4) and (x>2)) *1.0

In [None]:
# Bucket Age into half-open intervals [lower, upper) and emit one 0.0/1.0
# indicator column per bucket. The first bucket, [-999, 0), catches the -999
# sentinel used for missing ages.
cut_points = [-999,0,16,60,100]
label_names = ["Missing","Child","Adult","Senior"]
ages = total_data['Age']
for label, lower, upper in zip(label_names, cut_points[:-1], cut_points[1:]):
    total_data['Is_Age_'+label] = ((ages >= lower) & (ages < upper)) * 1.0

In [None]:
# Swap the -999 missing-age sentinel for 18 in the numeric Age column.
# The Is_Age_Missing indicator was computed before this line, so it still marks those rows.
total_data['Age'] = total_data['Age'].replace(-999,18)

In [None]:
# Interaction features: element-wise products of the 0.0/1.0 indicator columns,
# so each result is 1.0 only when both conditions hold for the passenger.
total_data['first_class_female'] = total_data['Is_Pclass_1']*total_data['Is_sex_female']
total_data['first_class_child'] = total_data['Is_Pclass_1']*total_data['Is_Age_Child']
total_data['second_class_child'] = total_data['Is_Pclass_2']*total_data['Is_Age_Child']
# Fixed: this used Is_Pclass_1, which made the column an exact duplicate of
# first_class_female instead of flagging second-class women.
total_data['second_class_female'] = total_data['Is_Pclass_2']*total_data['Is_sex_female']
# "Adult" here means "not flagged as Child", so it also covers passengers whose
# age was missing or falls in the Senior bucket.
total_data['Third_class_adult'] = total_data['Is_Pclass_3']*(1-total_data['Is_Age_Child'])
# Family size (SibSp + Parch) for men, 0 for everyone else.
total_data['male_with_family'] = total_data['Is_sex_male']*total_data['family_number']

In [None]:
# Check the (rows, columns) shape of the combined table after feature creation.
total_data.shape

In [None]:
# List the column names of the combined table after feature creation.
total_data.columns

In [None]:
# Show the distinct Embarked values (including the 'UNK' sentinel if any value was missing).
total_data['Embarked'].unique()

In [None]:
# The raw categorical/text columns have been encoded as indicator columns,
# so remove the originals from the combined table.
encoded_source_columns = ['Pclass','Sex','Ticket','Embarked','Cabin']
total_data = total_data.drop(columns=encoded_source_columns)

In [None]:
# List the remaining columns after the raw categorical columns were dropped.
total_data.columns

In [None]:
# Show every distinct Name value as a plain list.
# NOTE(review): this prints the whole list of unique names; presumably very long — consider a sample instead.
list(total_data['Name'].unique())

To me, the name feature does not currently seem to contain any valuable information, so we are going to drop it.

In [None]:
# Name is not used as a feature; remove the column from the combined table.
total_data = total_data.drop(columns=['Name'])

In [None]:
# Check the (rows, columns) shape of the combined table after dropping Name.
total_data.shape

In [None]:
# List the columns that remain in the combined table.
total_data.columns

In [None]:
import matplotlib.pyplot as plt
# Histogram of the observed ages only.
# The -999 sentinel was already replaced by 18 in an earlier cell, so filtering
# on `Age != -999` no longer excludes anything and the imputed rows would show
# up as a spike at 18. The Is_Age_Missing indicator still identifies those rows.
observed_ages = total_data.loc[total_data['Is_Age_Missing'] == 0, 'Age']
plt.hist(observed_ages)
plt.title('Distribution of observed ages (train + test)')
plt.xlabel('Age')
plt.ylabel('Number of passengers')
plt.show()

In [None]:
# Split the engineered table back into its train and test parts using the
# origin tag, and remove the tag column from both.
origin = total_data['split']
train_data = total_data[origin.eq('train')].drop(columns='split')
test_data = total_data[origin.eq('test')].drop(columns='split')

In [None]:
# List the feature columns that will be fed to the models.
train_data.columns

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import classification_report as creport
from sklearn.model_selection import train_test_split as tts
from xgboost import XGBClassifier as XGBC
from sklearn.svm import NuSVC

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state makes the split repeatable.
# Note: X_test/Y_test are this validation hold-out, not the competition test set (that is `test_data`).
X_train,X_test,Y_train,Y_test = tts(train_data,train_survived,test_size = 0.2,random_state = 8080)

In [None]:
# Random forest baseline: fit on the training split, then report precision /
# recall / F1 on the training and hold-out splits plus the out-of-bag accuracy.
# random_state is fixed (same seed as the hold-out split) because the forest is
# otherwise seeded differently on every run, so the metrics below and the
# submission produced from `rfc` would not be reproducible.
rfc = RFC(n_estimators = 128,max_depth = 6,criterion = 'entropy',
          min_samples_split = 5, max_features = 15,
          #class_weight = 'balanced',#commenting out as this decreased accuracy.
          oob_score = True,n_jobs = -1,
          random_state = 8080)
rfc.fit(X_train,Y_train)
Y_pred_train = rfc.predict(X_train)
Y_pred_test = rfc.predict(X_test)
print("for the train data:")
print(creport(Y_train,Y_pred_train))
print("for the test data:")
print(creport(Y_test,Y_pred_test))
print("the oob score is:")
print(rfc.oob_score_)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier as ETC
# Extra-trees model: fit on the training split, then report precision /
# recall / F1 on the training and hold-out splits plus the out-of-bag accuracy.
# bootstrap=True is required for oob_score. random_state is fixed (same seed as
# the hold-out split) because the ensemble is otherwise seeded differently on
# every run, which makes the printed metrics non-reproducible.
etc = ETC(n_estimators = 128,max_depth = 6,criterion = 'entropy',min_samples_split = 30,
          #class_weight = 'balanced',#commenting out as this decreased accuracy.
          bootstrap = True,
          oob_score = True,n_jobs = -1,
          random_state = 8080)
etc.fit(X_train,Y_train)
Y_pred_train = etc.predict(X_train)
Y_pred_test = etc.predict(X_test)
print("for the train data:")
print(creport(Y_train,Y_pred_train))
print("for the test data:")
print(creport(Y_test,Y_pred_test))
print("the oob score is:")
print(etc.oob_score_)

In [None]:
from lightgbm import LGBMClassifier as LGBC
# LightGBM with default hyper-parameters: fit on the training split and print a
# classification report for the training and hold-out splits.
lgbc = LGBC()
lgbc.fit(X_train, Y_train)
Y_pred_train = lgbc.predict(X_train)
Y_pred_test = lgbc.predict(X_test)
lgbm_reports = (
    ("for the train data:", Y_train, Y_pred_train),
    ("for the test data:", Y_test, Y_pred_test),
)
for heading, actual, predicted in lgbm_reports:
    print(heading)
    print(creport(actual, predicted))
In [None]:
# Pair every training feature name with the importance the fitted random
# forest assigned to it, one row per feature.
feature_importances = pd.DataFrame({
    'features': X_train.columns,
    'feature_importance': rfc.feature_importances_,
})

Keep only the important features and retrain the random forest model.

In [None]:
# Display the feature / importance table built from the random forest.
feature_importances

In [None]:
# Names of the features whose random-forest importance is above 0.001.
important_columns = feature_importances.loc[
    feature_importances['feature_importance'] > 0.001, 'features'
].tolist()

In [None]:
"""
#commenting it out as for us random forest is the best model.
#this is not a good model actually as this performed much worse: 0.78013.
#probably this is caused by the fact that we reduced too many features.
rfc = RFC(n_estimators = 128,max_depth = 10,
          #criterion = 'entropy',
          min_samples_split = 15,
          #class_weight = 'balanced',#commenting out as this decreased accuracy.
          oob_score = True,n_jobs = -1)
rfc.fit(X_train[important_columns],Y_train)
Y_pred_train = rfc.predict(X_train[important_columns])
Y_pred_test = rfc.predict(X_test[important_columns])
print("for the train data:")
print(creport(Y_train,Y_pred_train))
print("for the test data:")
print(creport(Y_test,Y_pred_test))
print("the oob score is:")
print(rfc.oob_score_)
"""

In [None]:
# Regularised XGBoost model: fit on the training split and print a
# classification report for the training and hold-out splits.
xgbc = XGBC(
    n_estimators=900,
    learning_rate=0.1,
    max_depth=6,
    reg_lambda=100,
    reg_alpha=5,
    scale_pos_weight=1.33,
    n_jobs=-1,
)
xgbc.fit(X_train, Y_train)
Y_pred_train = xgbc.predict(X_train)
Y_pred_test = xgbc.predict(X_test)
xgb_reports = (
    ("for the train data:", Y_train, Y_pred_train),
    ("for the test data:", Y_test, Y_pred_test),
)
for heading, actual, predicted in xgb_reports:
    print(heading)
    print(creport(actual, predicted))

In [None]:
# Load the sample submission and print its column names, which the submission files below mirror.
submission = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv')
print(submission.columns)

In [None]:
# Predict the competition test set with the random forest and write the
# submission file (PassengerId + predicted Survived, no index column).
test_prediction = rfc.predict(test_data)
first_submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': test_prediction,
})
first_submission.to_csv('third_randomforest_submission.csv', index=False)

In [None]:
# Predict the competition test set with the LightGBM model and write the
# submission file (PassengerId + predicted Survived, no index column).
test_prediction = lgbc.predict(test_data)
first_submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': test_prediction,
})
first_submission.to_csv('first_lgbm_submission.csv', index=False)