In [None]:
# basic library
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

##### Data Read

In [None]:
# load the train and test CSV files, using the first column of each as the index
df_train, df_test = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in (
        '../input/tabular-playground-series-may-2022/train.csv',
        '../input/tabular-playground-series-may-2022/test.csv',
    )
)

In [None]:
# check the shape (rows, columns) of the training set
df_train.shape

In [None]:
# check the shape (rows, columns) of the testing set
df_test.shape

In [None]:
# count the fully duplicated rows in the training set
print(f"Duplicated sample: {df_train.duplicated().sum()}")

In [None]:
# report the number of missing values for every column of the training set
for column_name, missing_count in df_train.isna().sum().items():
    print(f"Missing value for {column_name}: {missing_count}")

#### Statistical analysis

##### Distribution

In [None]:
# train set: bucket every column name by its dtype
int_list, float_list, object_list = [], [], []
for column_name, column_dtype in df_train.dtypes.items():
    if column_dtype == 'int64':
        bucket = int_list
    elif column_dtype == 'float64':
        bucket = float_list
    else:
        # everything that is neither int64 nor float64 goes to the object bucket
        bucket = object_list
    bucket.append(column_name)

In [None]:
# show which training columns landed in each dtype bucket
print(
    f"int type features: {int_list}",
    f"float type features: {float_list}",
    f"object type features: {object_list}",
    sep = "\n",
)

In [None]:
# test set: bucket every column name by its dtype
int_list_test, float_list_test, object_list_test = [], [], []
for column_name, column_dtype in df_test.dtypes.items():
    if column_dtype == 'int64':
        bucket = int_list_test
    elif column_dtype == 'float64':
        bucket = float_list_test
    else:
        # everything that is neither int64 nor float64 goes to the object bucket
        bucket = object_list_test
    bucket.append(column_name)

In [None]:
# show which testing columns landed in each dtype bucket
print(
    f"int type features: {int_list_test}",
    f"float type features: {float_list_test}",
    f"object type features: {object_list_test}",
    sep = "\n",
)

The train and test sets should contain the same features; the only difference is the target column, which appears only in the train set.

##### int type

In [None]:
# report min, max and number of distinct values for each int-type column of the train set
for column in int_list:
    sorted_values = sorted(df_train[column].unique())
    print(f"{column}, min: {sorted_values[0]}, max: {sorted_values[-1]}, number of value: {len(sorted_values)}")

Judging by the values of target, this is a binary classification problem.

In [None]:
# report min, max and number of distinct values for each int-type column of the test set
for column in int_list_test:
    sorted_values = sorted(df_test[column].unique())
    print(f"{column}, min: {sorted_values[0]}, max: {sorted_values[-1]}, number of value: {len(sorted_values)}")

In [None]:
# compare the distinct values of each int-type feature between the two data sets
for i in int_list_test: # using test set to get rid of the target column
    train_values = set(df_train[i].unique().tolist())
    test_values = set(df_test[i].unique().tolist())
    print(f"Different value of {i}: {len(train_values) - len(test_values)}")
    # A count difference of 0 does not prove the value sets are equal, so also
    # list the values that occur in only one of the two sets.
    only_in_train = sorted(train_values - test_values)
    only_in_test = sorted(test_values - train_values)
    if only_in_train or only_in_test:
        print(f"    only in train: {only_in_train}, only in test: {only_in_test}")

The check shows that the number of unique values differs between the train and test sets for these columns: f_08, f_09, f_10, f_11, f_13, f_14, f_15, f_16.

Possible way to deal with this: remove those data so that the train and test sets stay consistent.

##### Visualisation of the data distribution

A bar chart is used for each int-type column.

In [None]:
# train set
# Draw one bar chart per int-type feature showing how often each value occurs.
ncols = 3
# ceil (not round) so that a partially filled last row still gets its own axes;
# round() would allocate too few axes for e.g. 13 features and raise IndexError.
nrows = max(1, math.ceil(len(int_list) / ncols))
# squeeze=False keeps `axes` a 2-D array even when the grid has a single row
fig, axes = plt.subplots(nrows, ncols, figsize=(16, round(nrows*16/ncols)), squeeze=False)
ax = axes.ravel()
for i, column in enumerate(int_list):
    # occurrences of each distinct value, ordered by value
    counts = df_train[column].value_counts().sort_index()
    ax[i].bar(x = counts.index, height = counts.to_numpy())
    ax[i].set_title(column)
# hide the grid cells that have no feature to show
for unused_ax in ax[len(int_list):]:
    unused_ax.set_visible(False)

For f_07 to f_18, the skewness and distribution are similar. f_29 is a binary feature as well, but its distribution is different and imbalanced. For f_30, the values range from 0 to 2 and the distribution is quite even.

For target, the distribution is quite balanced.

In [None]:
# test set
# Draw one bar chart per int-type feature showing how often each value occurs.
ncols = 3
# ceil (not round) so that a partially filled last row still gets its own axes;
# round() would allocate too few axes for e.g. 13 features and raise IndexError.
nrows = max(1, math.ceil(len(int_list_test) / ncols))
# squeeze=False keeps `axes` a 2-D array even when the grid has a single row
fig, axes = plt.subplots(nrows, ncols, figsize=(16, round(nrows*16/ncols)), squeeze=False)
ax = axes.ravel()
for i, column in enumerate(int_list_test):
    # occurrences of each distinct value, ordered by value
    counts = df_test[column].value_counts().sort_index()
    ax[i].bar(x = counts.index, height = counts.to_numpy())
    ax[i].set_title(column)
# hide the grid cells that have no feature to show
for unused_ax in ax[len(int_list_test):]:
    unused_ax.set_visible(False)

Comparing the train and test sets, the distributions are quite similar. This can be checked again once the feature engineering (FE) is done, since some values are missing from each set.

##### float type

Use the describe function to summarise the float columns.

In [None]:
# summary statistics of the float-type columns of the training set
df_train[float_list].describe()

In [None]:
# summary statistics of the float-type columns of the testing set
df_test[float_list_test].describe()

In [None]:
# Compare the extremes of each float feature between the two sets as a
# train / test ratio. The division aligns on the column names, so both lists
# are expected to hold the same features.
# min()/max() are taken directly instead of running the much heavier
# describe() four times only to read two of its rows.
df_comparing = pd.DataFrame({
    'max_ratio': df_train[float_list].max() / df_test[float_list_test].max(),
    'min_ratio': df_train[float_list].min() / df_test[float_list_test].min(),
})
df_comparing

Comparing the two data sets, f_03, f_04, f_21, f_24 and f_25 have a ratio that differs by more than 10%, in either the maximum or the minimum.

##### visualisation of float data

In [None]:
# train vs test: overlaid histograms of every float-type feature
ncols = 3
nrows = max(1, math.ceil(len(float_list) / ncols))
# squeeze=False keeps `axes` a 2-D array even when the grid has a single row
fig, axes = plt.subplots(nrows, ncols, figsize=(16, round(nrows*16/ncols)), squeeze=False)
ax = axes.ravel()
for i in range(len(float_list)):
    # Default binning is computed separately for each data set, so the bin
    # edges of the two histograms can differ.
    # alpha < 1 keeps the histogram drawn first visible underneath the second;
    # the labels feed the legend so the two sets can be told apart.
    ax[i].hist(df_train[float_list[i]], alpha = 0.5, label = 'training')
    ax[i].hist(df_test[float_list_test[i]], alpha = 0.5, label = 'testing')
    ax[i].legend()
    ax[i].set_title(float_list[i])
# hide the grid cells that have no feature to show
for unused_ax in ax[len(float_list):]:
    unused_ax.set_visible(False)

In [None]:
# let's switch to kernel density estimation (KDE) plots
# train vs test: one KDE curve per data set for every float-type feature
ncols = 3
nrows = max(1, math.ceil(len(float_list) / ncols))
# squeeze=False keeps `axes` a 2-D array even when the grid has a single row
fig, axes = plt.subplots(nrows, ncols, figsize=(16, round(nrows*16/ncols)), squeeze=False)
ax = axes.ravel()
for i in range(len(float_list)):
    # plot the distribution for both train and test set
    # (`fill` replaces the deprecated `shade` keyword of seaborn.kdeplot)
    sns.kdeplot(data = df_train, x = float_list[i], label = 'training', color = 'b', fill = False, ax = ax[i])
    sns.kdeplot(data = df_test, x = float_list_test[i], label = 'testing', color = 'r', fill = False, ax = ax[i])
    # show the legend for the labels
    ax[i].legend()
    ax[i].set_title(float_list[i])
# hide the grid cells that have no feature to show
for unused_ax in ax[len(float_list):]:
    unused_ax.set_visible(False)

According to the kernel density estimation plots, there is no visible difference between the train and test sets.

The histograms, however, do show some differences.

These may be caused by the range of the values, for example for f_24 and f_25.

Let's check the target distribution across these float features.

In [None]:
# kernel density estimation (KDE) plots of every float-type feature, split by target
# train set
ncols = 3
nrows = max(1, math.ceil(len(float_list) / ncols))
# squeeze=False keeps `axes` a 2-D array even when the grid has a single row
fig, axes = plt.subplots(nrows, ncols, figsize=(16, round(nrows*16/ncols)), squeeze=False)
ax = axes.ravel()
for i in range(len(float_list)):
    # show the distribution according to the target
    # (`fill` replaces the deprecated `shade` keyword of seaborn.kdeplot)
    sns.kdeplot(data = df_train, x = float_list[i], hue = 'target', fill = True, ax = ax[i])
    ax[i].set_title(float_list[i])
# hide the grid cells that have no feature to show
for unused_ax in ax[len(float_list):]:
    unused_ax.set_visible(False)

Some features show a slight difference between the target classes, i.e. they are slightly imbalanced. This could be tackled by balancing the data, or it can be ignored because the distributions of the training and testing sets are quite similar. Hopefully the probability distribution seen by the model is balanced and no FE is needed for this section.

##### Correlation

In [None]:
# heatmap of the pairwise correlation between the numeric columns of the train set
plt.figure(figsize = (15,8))
# Only numeric columns can be correlated: f_27 holds strings, and recent pandas
# versions raise an error instead of silently skipping such a column.
sns.heatmap(df_train.select_dtypes(include = 'number').corr(), annot = False);

According to the heatmap, f_28 is correlated with some of the other features.

The correlation coefficients of some features and of the target are extracted below to check for multicollinearity.

##### f_07

In [None]:
# five features with the strongest absolute linear correlation to f_07
# Restrict to numeric columns (f_27 holds strings) and correlate against the
# single column of interest instead of building the full correlation matrix.
numeric_train = df_train.select_dtypes(include = 'number')
(
    numeric_train.drop(columns = 'f_07')   # drop the trivial self-correlation explicitly
    .corrwith(numeric_train['f_07'])
    .abs()
    .sort_values(ascending = False)
    .head(5)
    .rename('f_07')
)

##### f_28

In [None]:
# five features with the strongest absolute linear correlation to f_28
# Restrict to numeric columns (f_27 holds strings) and correlate against the
# single column of interest instead of building the full correlation matrix.
numeric_train = df_train.select_dtypes(include = 'number')
(
    numeric_train.drop(columns = 'f_28')   # drop the trivial self-correlation explicitly
    .corrwith(numeric_train['f_28'])
    .abs()
    .sort_values(ascending = False)
    .head(5)
    .rename('f_28')
)

##### f_30

In [None]:
# five features with the strongest absolute linear correlation to f_30
# Restrict to numeric columns (f_27 holds strings) and correlate against the
# single column of interest instead of building the full correlation matrix.
numeric_train = df_train.select_dtypes(include = 'number')
(
    numeric_train.drop(columns = 'f_30')   # drop the trivial self-correlation explicitly
    .corrwith(numeric_train['f_30'])
    .abs()
    .sort_values(ascending = False)
    .head(5)
    .rename('f_30')
)

##### target

In [None]:
# five features with the strongest absolute linear correlation to target
# Restrict to numeric columns (f_27 holds strings) and correlate against the
# single column of interest instead of building the full correlation matrix.
numeric_train = df_train.select_dtypes(include = 'number')
(
    numeric_train.drop(columns = 'target')   # drop the trivial self-correlation explicitly
    .corrwith(numeric_train['target'])
    .abs()
    .sort_values(ascending = False)
    .head(5)
    .rename('target')
)

Multicollinearity was found among some features. Although linear regression may not be applied, it still needs to be addressed to help the model performance.

#### Feature engineering

The first feature to handle is f_27: it is an object-type column and has to be converted into numbers before the ML part.

##### f_27 (object)

In [None]:
# number of distinct f_27 values in the training set (a missing value would count as one)
df_train['f_27'].nunique(dropna = False)

In [None]:
# number of distinct f_27 values in the testing set (a missing value would count as one)
df_test['f_27'].nunique(dropna = False)

The number of unique values differs between the train and test sets.

Let's take a look at the data and find a pattern that can be used as a feature.

In [None]:
# display the raw f_27 column to look for a pattern in its values
df_train['f_27']

Each sequence is made of 10 letters. The first FE step is to assign a number to each letter and take the sum of those numbers.

Let's assign A = 0, B = 1, C = 2, ..., Y = 24, Z = 25 accordingly.

E.g. ABABABABAB -> 5

Let's build a dictionary to calculate the sum and create a new column called f_27_FE

In [None]:
# map every uppercase letter to its position in the alphabet: 'A' -> 0, ..., 'Z' -> 25
f_27_dict = {chr(ord('A') + position): position for position in range(26)}

In [None]:
# Encode every f_27 string of the training set as the sum of its letter values
# (looked up in f_27_dict).
# One pass with Series.map replaces growing a Series through pd.concat once per
# row: each concat copied everything collected so far, which made the loop
# quadratic in the number of rows.
# The result is an int64 Series that shares the index of df_train.
f_27_FE = (
    df_train['f_27']
    .map(lambda sequence: sum(f_27_dict[char] for char in sequence))
    .astype('int64')
)

In [None]:
# Build the engineered training frame: df_train without the raw f_27 column,
# plus the numeric f_27_FE column appended at the end.
assert len(f_27_FE) == len(df_train), "f_27_FE must hold one value per training row"
# drop() returns a new frame, so df_train itself stays untouched
new_train = df_train.drop(columns = 'f_27')
# Assign by position (plain array) rather than by index label: label alignment
# would silently produce NaN unless the ids of df_train are exactly 0..n-1.
new_train['f_27_FE'] = f_27_FE.to_numpy()

In [None]:
# preview the first rows of the training set after the FE
new_train.head()

This FE has to iterate over every row of the data set, so it can take a long time.

Let's check the value range and the distribution after the FE.

In [None]:
# report min, max and number of distinct values of the engineered column (train set)
fe_values = sorted(new_train['f_27_FE'].unique())
print(f"f_27_FE, min: {fe_values[0]}, max: {fe_values[-1]}, number of value: {len(fe_values)}")

In [None]:
# check the distribution of f_27_FE in the training set
# value_counts gives the occurrences of each distinct value directly. The
# former melt + groupby().sum() also "summed" the string `variable` column
# (string concatenation) on recent pandas versions.
f = new_train['f_27_FE'].value_counts().sort_index()
plt.bar(x = f.index, height = f.to_numpy())
plt.title('f_27_FE (train)');

Let's work for the testing set and perform the same FE.

In [None]:
# Encode every f_27 string of the testing set as the sum of its letter values
# (looked up in f_27_dict).
# One pass with Series.map replaces growing a frame through pd.concat once per
# row: each concat copied everything collected so far, which made the loop
# quadratic in the number of rows.
# The result is kept as a one-column DataFrame whose column label is 0,
# because the cell that joins it to the test set reads `f_27_FE[0]`.
f_27_FE = (
    df_test['f_27']
    .map(lambda sequence: sum(f_27_dict[char] for char in sequence))
    .astype('int64')
    .to_frame(name = 0)
)

In [None]:
# Build the engineered testing frame: df_test without the raw f_27 column,
# plus the numeric f_27_FE column appended at the end.
assert len(f_27_FE) == len(df_test), "f_27_FE must hold one value per testing row"
# drop() returns a new frame, so df_test itself stays untouched
new_test = df_test.drop(columns = 'f_27')
# f_27_FE is a one-column frame (column label 0). Assigning its values by
# position makes re-indexing it with the test ids unnecessary.
new_test['f_27_FE'] = f_27_FE[0].to_numpy().astype('int64')

In [None]:
# report min, max and number of distinct values of the engineered column (test set)
fe_values = sorted(new_test['f_27_FE'].unique())
print(f"f_27_FE, min: {fe_values[0]}, max: {fe_values[-1]}, number of value: {len(fe_values)}")

After the FE, the number of distinct f_27_FE values in the testing set is the same as in the training set.

Let's take a look at the distribution.

In [None]:
# check the distribution of f_27_FE in the testing set
# value_counts gives the occurrences of each distinct value directly. The
# former melt + groupby().sum() also "summed" the string `variable` column
# (string concatenation) on recent pandas versions.
f = new_test['f_27_FE'].value_counts().sort_index()
plt.bar(x = f.index, height = f.to_numpy())
plt.title('f_27_FE (test)');

The distributions for the training and testing sets are similar; both are bell-shaped.

The only difference is that the minimum value of the range is shifted by one in the training set.

Possible FE: remove that one value from the training set so that both sets share the same data distribution.

Turned into a sum of numbers, the feature can be used for a first round of ML training and prediction, and the result can guide further modification.

In [None]:
# save the engineered frames as CSV so the FE does not have to be repeated
for engineered_frame, csv_name in ((new_train, 'new_train.csv'), (new_test, 'new_test.csv')):
    engineered_frame.to_csv(csv_name)

#### Simple Binary Classification

##### Data Slicing

In [None]:
# separate the predictor variable (target) from the training features
y = new_train['target']
X = new_train.drop(columns = 'target')

##### Logistic Regression

Using the logistic regression to perform a preliminary prediction.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# hold out part of the training data for validation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
# raise the iteration cap above the default and fit on the training split
model = LogisticRegression(max_iter = 10000).fit(X_train, y_train)
# predict the labels of the held-out split
pred = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# confusion matrix of the held-out labels (y_test) against the predictions (pred);
# precision and recall can be read off its cells
confusion_matrix(y_test, pred)

According to the confusion matrix, the model did not perform well.

In [None]:
from sklearn.metrics import precision_score, recall_score
# compute both metrics on the held-out split, then report them
precision = precision_score(y_test, pred)  # TP / (TP + FP)
recall = recall_score(y_test, pred)  # TP / (TP + FN)
print(f'Precision: {precision}')
print(f'Recall : {recall}')

As the precision and recall are quite similar, let's look at the F1 score.

In [None]:
from sklearn.metrics import f1_score
# F1 score of the predictions on the held-out split
f1_score(y_test, pred)

Let's run the prediction on the final test set and see how the metrics and the model performance hold up on real instances.

In [None]:
# predict the target for the engineered test set with the fitted model
pred_test = model.predict(new_test)

In [None]:
# assemble the output table (test ids with their predicted target) and write it to CSV
df_output = pd.DataFrame({'id': new_test.index, 'target': pred_test})
df_output.to_csv('df_output_logistic_v1.csv', index = False)

After submission, the score was 0.62809, which is quite close to the precision.

As a next step, PCA and other models can be considered to improve the performance.