In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pwd

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

Import Data

In [None]:
train = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/train.csv")
print(train.shape)
train.head()

In [None]:
train.tail()

In [None]:
test = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")
print(test.shape)
test.head()

In [None]:
submission = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv")
submission.head()

In [None]:
# Take an independent copy of the id column: a 'Survived' column is assigned to
# this frame later, and writing into a column selection of `submission` would
# raise pandas' SettingWithCopyWarning.
sub_id = submission[['PassengerId']].copy()

Let's first check the ratio of missing values in each column, then handle the rows that contain them.

In [None]:
# Find the missing values as a percentage in each column
def findMissingPercent(training):
    """Report the percentage of null values in every column of a frame.

    Parameters
    ----------
    training : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Columns 'column_name' and 'percent_missing', sorted from the most to
        the least incomplete column, with a fresh 0..n-1 index.
    """
    n_rows = len(training)
    null_counts = training.isnull().sum()

    report = pd.DataFrame({
        'column_name': training.columns,
        'percent_missing': null_counts * 100 / n_rows,
    })

    return (
        report
        .sort_values(by='percent_missing', ascending=False)
        .reset_index(drop=True)
    )

missing = findMissingPercent(train)
missing

In [None]:
missing_test = findMissingPercent(test)
missing_test

Feature Analysis

#F1 Cabin

In [None]:
print("The Number of Unique Elements : {}".format(len(train['Cabin'].unique())))
pd.DataFrame(train['Cabin'].value_counts()).head()

#There are many distinct cabin values, so I keep only the first character and put null rows into a new "Other" class ("XX"). 
# That way we end up with far fewer categories. 

In [None]:
# Treat a missing cabin as its own category ("XX") and make sure the column is string-typed
train['Cabin'] = train['Cabin'].fillna("XX").astype(str)

# Keep only the first character of the cabin code ("X" for the filled-in rows)
train['Cabin_fd'] = train['Cabin'].str[:1]

# Show how many rows fall into each first-character category
train['Cabin_fd'].value_counts()

In [None]:
# Same treatment for the test set: missing cabins become "XX", stored as strings
test['Cabin'] = test['Cabin'].fillna("XX").astype(str)

# Keep only the first character of the cabin code ("X" for the filled-in rows)
test['Cabin_fd'] = test['Cabin'].str[:1]

# Show how many rows fall into each first-character category
test['Cabin_fd'].value_counts()

#F2 Ticket

In [None]:
# I could not find any idea for the ticket no. I'll skip for now :) 
print("The Number of Unique Elements : {}".format(len(train['Ticket'].unique())))
pd.DataFrame(train['Ticket'].value_counts()).head()

#F3 Age

In [None]:
print("The Number of Unique Elements : {}".format(len(train['Age'].unique())))
pd.DataFrame(train['Age'].value_counts()).head()

# I will fill the null rows in the Age column using the mean age of each Pclass and Sex group. 

In [None]:
# Generate a reference for missing age value 
missing_age = train.groupby(['Pclass', 'Sex']).agg({'Age':'mean'})
missing_age

In [None]:
# Keep known ages; replace a missing age with the mean age of the passenger's
# (Pclass, Sex) group taken from the `missing_age` reference table.
train['Age_filled'] = train.apply(
    lambda passenger: (
        passenger['Age']
        if not np.isnan(passenger['Age'])
        else missing_age.loc[(passenger['Pclass'], passenger['Sex']), 'Age']
    ),
    axis=1,
)

In [None]:
# Same imputation for the test set, still using the group means computed on the
# training data (`missing_age`).
test['Age_filled'] = test.apply(
    lambda passenger: (
        passenger['Age']
        if not np.isnan(passenger['Age'])
        else missing_age.loc[(passenger['Pclass'], passenger['Sex']), 'Age']
    ),
    axis=1,
)

#F4 Embarked

In [None]:
print("The Number of Unique Elements : {}".format(len(train['Embarked'].unique())))
pd.DataFrame(train['Embarked'].value_counts()).head()

# There are small number of categories. I will fill the missing data with the most frequent element "S". 
train['Embarked']

In [None]:
# Fill missing values with the most frequent value of the column. 
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode().iloc[0])

In [None]:
# Apply the same method for test set. 
test['Embarked'] = test['Embarked'].fillna(train['Embarked'].mode().iloc[0])

#F5 Fare 

In [None]:
# Plot the histogram 
train.hist(column='Fare', bins = 50)

In [None]:
train.head()

In [None]:
# Generate a reference for missing Fare value 
missing_fare = train.groupby(['Pclass', 'Embarked']).agg({'Fare':'mean'})
missing_fare

In [None]:
# Keep known fares; replace a missing fare with the mean fare of the passenger's
# (Pclass, Embarked) group taken from the `missing_fare` reference table.
train['Fare_filled'] = train.apply(
    lambda passenger: (
        passenger['Fare']
        if not np.isnan(passenger['Fare'])
        else missing_fare.loc[(passenger['Pclass'], passenger['Embarked']), 'Fare']
    ),
    axis=1,
)
train.head()

In [None]:
# Same imputation for the test set, still using the group means computed on the
# training data (`missing_fare`).
test['Fare_filled'] = test.apply(
    lambda passenger: (
        passenger['Fare']
        if not np.isnan(passenger['Fare'])
        else missing_fare.loc[(passenger['Pclass'], passenger['Embarked']), 'Fare']
    ),
    axis=1,
)
test.head()

The missing values are now handled. New features could be added in the next section, but I skip that part for now. First, I will create a baseline model. 

In [None]:
train.head()

#Find the correlation between all features and target

In [None]:
# Correlate numeric columns only: the frame still holds string columns
# (e.g. Sex, Cabin, Embarked), and pandas >= 2.0 raises on them instead of
# silently dropping them. Selecting by dtype works on every pandas version.
train.select_dtypes(include='number').corr(method ='pearson')
# There is a correlation for Fare (positive) and Age (positive) and Pclass (negative)
# We may use this information to generate new features. 

In [None]:

# calculate the correlation matrix on the numeric columns only; pandas >= 2.0
# raises on string columns (e.g. Sex, Cabin, Embarked) instead of dropping them
corr = train.select_dtypes(include='number').corr(method ='pearson')

# plot the heatmap
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns)

Prepare the Training Data, Test and Validation Data

In [None]:
# Model inputs: the imputed Age/Fare columns, the numeric columns, and the raw
# categorical columns (Sex, Cabin_fd, Embarked) that are one-hot encoded below.
X = train[['Pclass','Sex','Age_filled','SibSp', 'Parch','Fare_filled', 'Cabin_fd','Embarked']]
# Double brackets keep the target as a one-column DataFrame rather than a Series.
y = train[['Survived']]
print("Training Data Feature Shape {}".format(X.shape))
print("Training Data Target Shape {}".format(y.shape))

# NOTE: this rebinds `submission` (until now the contents of sample_submission.csv)
# to the test-set feature frame, selected with the same columns as X.
submission = test[['Pclass','Sex','Age_filled','SibSp', 'Parch','Fare_filled', 'Cabin_fd','Embarked']]

print("Submission Data Shape {}".format(submission.shape))


Deal With Categorical Features 

In [None]:
X.head()

In [None]:
def applyOneHot(X,col):
    """Fit a one-hot encoder on one column and append the indicator columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing `col`. The caller's frame is not modified; the
        original `col` is still present in the returned frame.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    tuple
        ``(encoder, frame)``: the fitted ``OneHotEncoder`` (to be reused on
        other data) and `X` with the encoded columns appended on the right.
    """
    # handle_unknown='ignore': categories not seen during fit do not raise at transform time
    enc_dow = OneHotEncoder(handle_unknown='ignore')
    encoded = enc_dow.fit_transform(X[[col]]).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on versions that predate it.
    if hasattr(enc_dow, 'get_feature_names_out'):
        encoded_names = enc_dow.get_feature_names_out([col])
    else:
        encoded_names = enc_dow.get_feature_names([col])

    # Reuse X's index so the concat below pairs rows one-to-one even when X
    # does not carry a default 0..n-1 index (e.g. after filtering or a split).
    enc_df_dow = pd.DataFrame(encoded, columns=encoded_names, index=X.index)

    print("Encoded Shape is {} for {} \n".format(enc_df_dow.shape, col))

    # append the encoded columns next to the existing ones
    X = pd.concat([X, enc_df_dow], axis = 1 )

    return enc_dow, X


one_sex, X = applyOneHot(X,'Sex')
del X['Sex']
X.head()

In [None]:
one_cabin, X = applyOneHot(X,'Cabin_fd')
del X['Cabin_fd']

one_embark, X = applyOneHot(X,'Embarked')
del X['Embarked']

X.head()

In [None]:
# Apply one-hot encoding for the test data. 

def encodeTestSet(enc, df, col):
    """Encode one column with an already fitted encoder and append the result.

    Parameters
    ----------
    enc : OneHotEncoder
        Encoder previously fitted on the same column of the training data.
    df : pandas.DataFrame
        Frame containing `col`. The caller's frame is not modified; the
        original `col` is still present in the returned frame.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        `df` with the encoded columns appended on the right.
    """
    encoded = enc.transform(df[[col]]).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on versions that predate it.
    if hasattr(enc, 'get_feature_names_out'):
        encoded_names = enc.get_feature_names_out([col])
    else:
        encoded_names = enc.get_feature_names([col])

    # Reuse df's index so the concat below pairs rows one-to-one even when df
    # does not carry a default 0..n-1 index.
    enc_df_dow_test = pd.DataFrame(encoded, columns=encoded_names, index=df.index)
    print("Encoded Shape is {} \n".format(enc_df_dow_test.shape))

    # append the encoded columns next to the existing ones
    df = pd.concat([df, enc_df_dow_test], axis = 1 )
    return df 

# Encode Gender with the encoder fitted on the training 'Sex' column
submission = encodeTestSet(one_sex, submission, 'Sex')

# Encode Cabin (first character) with the encoder fitted on the training data
submission = encodeTestSet(one_cabin, submission, 'Cabin_fd')

# Encode Embarked with the encoder fitted on the training data
submission = encodeTestSet(one_embark, submission, 'Embarked')

# Drop the raw categorical columns now that their encoded versions are appended
del submission['Cabin_fd']
del submission['Embarked']
del submission['Sex']

submission.head()

In [None]:
# Apply test train and validation split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=121)

print("Training Ratio   : {}%".format(X_train.shape[0]*100 /X.shape[0]))
print("Test Ratio       : {}%".format(X_test.shape[0]*100 /X.shape[0] ))

# We obtained an 85%-15% split. 

In [None]:
# There is no imbalance between ones and zeros in both dataset. 
print(y_test[y_test['Survived'] == 0].shape[0]*100 / y_test.shape[0])
print(y_train[y_train['Survived'] == 0].shape[0]*100 / y_train.shape[0])

Define a RF Model

In [None]:
# I will test the number of estimators and max_depth. 
# You can also use GridSearch or RandomSearch in this step. 

n_estimators = [50,100,500]
max_depth= [5,10,20]

finalScores = pd.DataFrame(columns = ['n_estimators','max_depth', 'mean_accuracy', 'std_accuracy'])

def performRF(finalScores, X_train=X_train, y_train=y_train, n_splits = 5):
    """Score a grid of random forests with stratified k-fold cross-validation.

    Every combination of the module-level lists ``n_estimators`` and
    ``max_depth`` is evaluated with accuracy. One row per combination (the
    two grid values, the mean fold score, the standard deviation of the fold
    scores) is added, in that order, under the columns of ``finalScores``.

    Parameters
    ----------
    finalScores : pandas.DataFrame
        Results table with four columns. It is not modified in place.
    X_train, y_train
        Features and target handed to ``cross_val_score``.
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        The rows of ``finalScores`` followed by the newly scored rows,
        indexed from 0.
    """
    # The fold count comes from the caller; the shuffle is seeded so every
    # grid point is scored on the same folds.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # scikit-learn expects a 1-D target; flattening avoids a conversion
    # warning on every fit when y_train is a one-column DataFrame.
    y_flat = np.ravel(y_train)

    total = len(n_estimators) * len(max_depth)
    rows = []
    cnt = 1

    for i in n_estimators: 
        for j in max_depth: 

            # Seeded so the table is reproducible from one run to the next.
            rf = RandomForestClassifier(
                n_estimators=i,
                max_depth=j,
                random_state=42
            )

            score = cross_val_score(rf, X_train, y_flat, cv= kf, scoring="accuracy")
            rows.append([i, j, score.mean(), score.std()])

            print("No {} completed. (Out of {} )".format(cnt, total))

            cnt = cnt + 1 

    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row is quadratic anyway: build the new rows once and concatenate.
    new_scores = pd.DataFrame(rows, columns=finalScores.columns)
    if finalScores.empty:
        return new_scores
    return pd.concat([finalScores, new_scores], ignore_index=True)

finalScores = performRF(finalScores, X_train=X_train, y_train=y_train, n_splits = 5)

In [None]:
finalScores

#Perform Random Forest Model

In [None]:
# create an RF Model (seeded so the metrics below are reproducible across runs)
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features= 10,
    random_state=42
)

# Fit the model. y_train is a one-column DataFrame; flatten it to the 1-D
# target scikit-learn expects to avoid a DataConversionWarning.
rf.fit(X_train, y_train.values.ravel())


y_pred = rf.predict(X_test)
y_pred_tr = rf.predict(X_train)

# save prediction probabilities (column 1 = probability of the positive class)
preds = rf.predict_proba(X_test)[:,1]


print("Training Accuracy : {}".format(accuracy_score(y_train,y_pred_tr)))
print("Test Accuracy : {}".format(accuracy_score(y_test,y_pred)))
print("-- ")
print("Test Recall : {}".format(recall_score(y_test,y_pred)))
print("Test Confusion Matrix : \n {}".format(confusion_matrix(y_test,y_pred)))


# We obtained a low accuracy with RF. Let's optimize it with more model parameters. 


In [None]:
def plotROC(y_test, preds, name):
    """Print ROC AUC scores and draw the ROC curve of a model next to a baseline.

    Parameters
    ----------
    y_test
        True binary labels.
    preds
        Predicted scores for the positive class, one per label.
    name : str
        Label used for the model in the printed summary and the legend.

    The baseline assigns a constant score of 0 to every sample.
    """
    # constant-score baseline with as many entries as there are predictions
    baseline_scores = [0] * len(preds)

    # area under the curve for the baseline and for the model
    baseline_auc = roc_auc_score(y_test, baseline_scores)
    model_auc = roc_auc_score(y_test, preds)

    print('Random Guess: ROC AUC={0:.2f}'.format(baseline_auc)) 
    print(name + ': ROC AUC={0:.2f}'.format(model_auc))

    # points of both curves (thresholds are not needed)
    baseline_fpr, baseline_tpr, _ = roc_curve(y_test, baseline_scores)
    model_fpr, model_tpr, _ = roc_curve(y_test, preds)

    # dashed line for the baseline, dotted markers for the model
    plt.plot(baseline_fpr, baseline_tpr, linestyle='--', label='Random Guess')
    plt.plot(model_fpr, model_tpr, marker='.', label=name)

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    plt.legend()
    plt.show()
    
plotROC(y_test , preds, 'Random Forest')

#Perform LightGBM

In [None]:
# I will test num_leaves and max_depth. 
# You can also use GridSearch or RandomSearch in this step. 

num_leaves = [200,500]
max_depth= [3,5,10]

    
finalScores = pd.DataFrame(columns = ['num_leaves','max_depth', 'mean_accuracy', 'std_accuracy'])

def performLGBM(finalScores, X_train=X_train, y_train=y_train, n_splits = 5,
                num_leaves=num_leaves,max_depth =max_depth ):
    """Score a grid of LightGBM classifiers with stratified k-fold cross-validation.

    Every combination of ``num_leaves`` and ``max_depth`` is evaluated with
    accuracy. One row per combination (the two grid values, the mean fold
    score, the standard deviation of the fold scores) is added, in that
    order, under the columns of ``finalScores``.

    Parameters
    ----------
    finalScores : pandas.DataFrame
        Results table with four columns. It is not modified in place.
    X_train, y_train
        Features and target handed to ``cross_val_score``.
    n_splits : int
        Number of stratified folds.
    num_leaves, max_depth : iterable of int
        Candidate values for the corresponding ``LGBMClassifier`` arguments.

    Returns
    -------
    pandas.DataFrame
        The rows of ``finalScores`` followed by the newly scored rows,
        indexed from 0.
    """
    # The fold count comes from the caller; the shuffle is seeded so every
    # grid point is scored on the same folds.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # A 1-D target avoids a conversion warning on every fit when y_train is a
    # one-column DataFrame.
    y_flat = np.ravel(y_train)

    total = len(num_leaves) * len(max_depth)
    rows = []
    cnt = 1

    for i in num_leaves: 
        for j in max_depth: 

            lg = lgb.LGBMClassifier(
                num_leaves=i,
                max_depth=j
            )

            score = cross_val_score(lg, X_train, y_flat, cv= kf, scoring="accuracy")
            rows.append([i, j, score.mean(), score.std()])

            print("No {} completed. (Out of {} )".format(cnt, total))

            cnt = cnt + 1 

    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row is quadratic anyway: build the new rows once and concatenate.
    new_scores = pd.DataFrame(rows, columns=finalScores.columns)
    if finalScores.empty:
        return new_scores
    return pd.concat([finalScores, new_scores], ignore_index=True)

finalScores = performLGBM(finalScores, X_train=X_train, y_train=y_train, n_splits = 5)

In [None]:
finalScores

In [None]:
# build the lightgbm model
# n_estimators is the scikit-learn wrapper's own argument for the number of
# boosting rounds; the native alias num_iterations is only accepted through
# **kwargs and can make LightGBM warn that it overrides the argument.
lgbm = lgb.LGBMClassifier(num_leaves=200,max_depth=3,learning_rate = 0.1, n_estimators = 1000)
# y_train is a one-column DataFrame; flatten it to the 1-D target the estimator expects
lgbm.fit(X_train, y_train.values.ravel())


y_pred = lgbm.predict(X_test)
y_pred_tr = lgbm.predict(X_train)

# save prediction probabilities (column 1 = probability of the positive class)
preds = lgbm.predict_proba(X_test)[:,1]


print("Training Accuracy : {}".format(accuracy_score(y_train,y_pred_tr)))
print("Test Accuracy : {}".format(accuracy_score(y_test,y_pred)))
print("-- ")
print("Test Recall : {}".format(recall_score(y_test,y_pred)))
print("Test Confusion Matrix : \n {}".format(confusion_matrix(y_test,y_pred)))
print("--\n")

# We obtained a low accuracy with LightGBM. Let's optimize it with more model parameters. 

plotROC(y_test , preds, 'LightGBM')

I obtained similar results with RF and LightGBM. I will use the LightGBM model for the test-set predictions. 

Also, you can observe the most important features. This can help us to improve our model. 

In [None]:
#import warnings
#warnings.simplefilter(action='ignore', category=FutureWarning)

def plotImp(model, X , num = 20):
    """Draw a horizontal bar chart of a fitted model's largest feature importances.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    X : pandas.DataFrame
        Frame whose columns name the features, in the model's feature order.
    num : int
        Number of top-ranked features to show.

    Note: ``sns.set`` changes seaborn's global font scale, so plots drawn
    afterwards are affected as well.
    """
    importances = pd.DataFrame({
        'Value': model.feature_importances_,
        'Feature': X.columns,
    })

    plt.figure(figsize=(40, 21))
    sns.set(font_scale = 4)

    # rank by importance and keep the first `num` rows
    ranked = importances.sort_values(by="Value", ascending=False)
    top_features = ranked.iloc[0:num]

    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()


plotImp(lgbm, X_train , num = 30)

In [None]:
import shap
#shap.initjs()

# compute SHAP values
explainer = shap.Explainer(lgbm, X_train)
shap_values = explainer(X_train)

In [None]:
# summarize the effects of all the features
shap.plots.beeswarm(shap_values , max_display = 20)

In [None]:
shap.plots.bar(shap_values , max_display = 20)

Prepare Submission File

In [None]:
submission_test = lgbm.predict(submission) 

In [None]:
sub_id['Survived'] = submission_test

In [None]:
sub_id.head()

In [None]:
sub_id.to_csv("submission__.csv", index = False, encoding = 'utf-8')

Parts of this code are adapted from the link below.

#ref : https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/