In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Downloading the data

In [None]:
# Load the training table and echo its (rows, columns) shape.
train_csv_path = "../input/tabular-playground-series-nov-2021/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.shape

We have 60000 instances with 100 features.

### Exploring the correlation in the data

In [None]:
# Correlation of every feature column with the target
# (the 'id' and 'target' columns themselves are left out).
corr = train_data.drop(columns=['target', 'id']).corrwith(train_data["target"])

# One bar per feature, in the original column order.
plt.figure(figsize=(50, 10))
plt.bar(corr.index, corr)
plt.show()

In [None]:
# Same correlations, ordered from most negative to most positive.
sorted_corr = corr.sort_values()

plt.figure(figsize=(50, 10))
plt.bar(sorted_corr.index, sorted_corr)
# Dashed guide lines at the +0.03 and -0.03 correlation levels.
for level in (0.03, -0.03):
    plt.axhline(y=level, color='red', linestyle='--')
plt.show()


From the correlations, we can see that they are distributed very symmetrically on the positive and negative sides.

We will define a threshold and drop the features whose absolute correlation with the target is lower than 0.03.

In [None]:
# Keep only the rows (features) whose absolute correlation exceeds the threshold.
threshold = 0.03
corr_df = pd.DataFrame(corr)

strong_enough = corr_df[0].abs() > threshold
filter_data = corr_df.loc[strong_enough]

In [None]:
# Names of the features that passed the threshold, as a plain list.
filter_id = filter_data.index.tolist()

In [None]:
# Let's create a function that return filtered features column
def filter_index(corr, thresholds):
    """Return the labels of the features whose correlation is strong enough.

    Parameters
    ----------
    corr : pandas.Series or anything accepted by ``pd.DataFrame``
        Correlation of each feature with the target, indexed by feature name.
    thresholds : float
        Cut-off; a feature is kept when ``abs(correlation) > thresholds``
        (strictly greater).

    Returns
    -------
    list
        Index labels of the retained features, in their original order.
    """
    corr_df = pd.DataFrame(corr)
    # An unnamed Series becomes column 0, but a named Series becomes a column
    # carrying that name; fall back to the first column instead of raising
    # KeyError on `corr_df[0]`.
    values = corr_df[0] if 0 in corr_df.columns else corr_df.iloc[:, 0]

    kept = values[values.abs() > thresholds]
    return list(kept.index)

# Let's process the training data.
We will first try using the whole dataset, without applying the threshold.

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the row id and the label.
X = train_data.drop(columns=['id', 'target'])
# Label column, cast to integers.
y = train_data['target'].astype('int')

# Hold out 20% of the rows; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

In [None]:
# Shape of the training split: (rows, feature columns).
X_train.shape

In [None]:
# Preview the first few training labels.
y_train.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean/std from the training split and keep the
# standardized copy under a name instead of discarding it. Only a few rows
# are previewed so the cell does not echo the whole matrix.
scaler = StandardScaler()

X_train_scale = scaler.fit_transform(X_train)
X_train_scale[:5]


### Let's build some models.


Try using logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# This model is later evaluated on standardized features, so it has to be
# trained on standardized features as well. `scaler` has already been fitted
# on X_train, so it is only applied (not re-fitted) here.
# random_state pins the data shuffling of the 'sag' solver so reruns give the
# same model.
log_reg = LogisticRegression(solver='sag', random_state=2)
log_reg.fit(scaler.transform(X_train), y_train)

In [None]:
# Reuse the statistics learned from the training split: re-fitting the scaler
# on the held-out rows would put them on a different scale than the training
# data and would overwrite the training statistics stored in `scaler`.
X_test_scale = scaler.transform(X_test)

In [None]:
# Hard class predictions for the scaled held-out split.
y_hat_lr = log_reg.predict(X_test_scale)

In [None]:
# Getting probability for each prediction: predict_proba returns one column
# per class and column index 1 is kept as the score
# (presumably the positive class, target == 1 — confirm via log_reg.classes_).
y_score_lr = log_reg.predict_proba(X_test_scale)[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt

def showing_score(y_true, y_predic):
    """Print the confusion matrix and the accuracy of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predic : array-like
        Predicted labels, aligned with ``y_true``.
    """
    # Use the parameter name exactly as declared in the signature; reading a
    # differently spelled name here would raise NameError on every call.
    print("The confusion matrix")
    print(confusion_matrix(y_true, y_predic))
    print()
    print("Accuracy score: ", accuracy_score(y_true, y_predic))
    
def plotting_roc(y_true, y_score):
    """Draw the ROC curve for binary scores on the current matplotlib figure.

    ``y_true`` holds the true labels (label 1 is treated as the positive
    class) and ``y_score`` the scores for that class. The area under the curve
    is reported in the legend. The function does not call ``plt.show()``.
    """
    false_pos, true_pos, _ = roc_curve(y_true, y_score, pos_label=1)
    area = auc(false_pos, true_pos)

    plt.plot(false_pos, true_pos, label="ROC line (AUC = %.2f)" % area)
    # Dashed reference line: y values 0 and 1 plotted against x = 0 and 1.
    plt.plot([0, 1], color='red', linestyle='--')

    plt.xlim(0, 1.05)
    plt.ylim(0, 1.05)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC curve")
    plt.legend(loc="lower right")
    
    


In [None]:
# ROC curve of the all-features model on the held-out split.
plotting_roc(y_test, y_score_lr)

### This unfiltered logistic regression model gives AUC = 0.74

Let's try to filter the data using the function written above

In [None]:
# Feature names whose absolute correlation with the target exceeds 0.01.
filter_id = filter_index(corr, thresholds=0.01)

In [None]:
# Restrict the feature table to the selected columns.
X_filter = X[filter_id]

In [None]:
# (rows, selected feature columns) after filtering.
X_filter.shape

We use thresholds = 0.01 and as a result

We filtered the data down to 77 features

In [None]:
# 80/20 split with the same seed as the unfiltered run, now on the filtered columns.
X_filter_train, X_filter_test, y_filter_train, y_filter_test = train_test_split(
    X_filter,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Fit the scaler on the training split only, then apply those same statistics
# to the held-out split so both are on the scale the model is trained on.
X_train_fil_scal = scaler.fit_transform(X_filter_train)
X_test_fil_scal = scaler.transform(X_filter_test)

In [None]:
# random_state pins the data shuffling done by the 'sag' solver so that
# reruns reproduce the same coefficients.
log_reg_fil = LogisticRegression(solver='sag', random_state=2)

log_reg_fil.fit(X_train_fil_scal, y_filter_train)

In [None]:
# Scores on the training split itself (the rows the model was fitted on),
# followed by the corresponding ROC curve.
y_score_train = log_reg_fil.predict_proba(X_train_fil_scal)[:, 1]

plotting_roc(y_filter_train, y_score_train)

In [None]:
# Scores (column index 1 of predict_proba) for the scaled held-out split.
y_score_filter = log_reg_fil.predict_proba(X_test_fil_scal)[:, 1]

In [None]:
# Echo the held-out scores as the cell output for a quick look.
y_score_filter

In [None]:
# ROC curve of the filtered-features model on the held-out split.
plotting_roc(y_filter_test, y_score_filter)

Below is my attempt at trying different threshold values.


In [None]:
from sklearn.base import clone

def filter_plot(X, y, filter):
    """Train and evaluate a logistic regression on a subset of feature columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Full feature table.
    y : array-like
        Labels aligned with the rows of ``X``.
    filter : list
        Column labels of ``X`` to keep.

    Returns
    -------
    LogisticRegression
        The fitted model.

    The ROC curve on the 20% held-out split is drawn and shown as a side
    effect.
    """
    X_filter = X.loc[:, filter]

    X_filter_train, X_filter_test, y_filter_train, y_filter_test = train_test_split(
        X_filter, y, random_state=2, test_size=0.2
    )

    # A scaler local to this call: its statistics come from the training split
    # only, and the notebook-level `scaler` is left untouched.
    local_scaler = StandardScaler()
    X_train_fil_scal = local_scaler.fit_transform(X_filter_train)
    X_test_fil_scal = local_scaler.transform(X_filter_test)

    # random_state pins the 'sag' solver's shuffling for repeatable results.
    log_reg_fil = LogisticRegression(solver='sag', random_state=2)
    log_reg_fil.fit(X_train_fil_scal, y_filter_train)

    y_score_filter = log_reg_fil.predict_proba(X_test_fil_scal)[:, 1]

    plotting_roc(y_filter_test, y_score_filter)
    plt.show()

    # Return the fitted estimator itself: sklearn's clone() builds a fresh,
    # unfitted copy, so cloning here would throw away the training just done.
    return log_reg_fil

In [None]:
# Stricter selection: features with absolute correlation above 0.05.
filter_id_005 = filter_index(corr, thresholds=0.05)

In [None]:
# Train/evaluate on the 0.05-threshold columns; the returned model is echoed
# as the cell output.
filter_plot(X, y, filter_id_005)

We got a worse result (AUC = 0.72) when using a threshold of 0.05

In [None]:
# Repeat the experiment with the looser 0.01 threshold: print how many
# features pass it and keep the model object returned by filter_plot.
filter_id_001 = filter_index(corr, thresholds=0.01)
print(len(filter_id_001))
log_reg_filter_001 = filter_plot(X, y, filter_id_001)

We got a better result when using a threshold of 0.01

In [None]:
# Making a prediction on the test.csv data

# Column selection for the submission, using the 0.01 correlation threshold.
filter_ind = filter_index(corr, thresholds=0.01)

# Load test.csv from the competition input directory.
test_data = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv")


In [None]:
# Row identifiers of the test set; they are reused as the submission's id column.
test_data['id']

In [None]:
# NOTE: head() is not the last expression of the cell, so Jupyter does not
# render its result here.
test_data.head()
# Keep only the selected feature columns. This rebinds X_test, which earlier
# held the held-out part of the training data.
X_test = test_data.loc[:, filter_ind]
X_test.shape


In [None]:
scaler.fit_transform(X_test)


In [None]:
y_hat = log_reg_fil.predict_proba(X_test)

In [None]:
# Show the predicted probabilities as a table, one column per class.
pd.DataFrame(y_hat)

In [None]:
# Score for each test row: column index 1 of the predicted probabilities.
y_score_test = y_hat[:, 1]

# Two-column table indexed by 'id', so the id is written as the first CSV
# column when the frame is saved with its index.
submission_csv = pd.DataFrame(
    {'id': test_data['id'], 'target': y_score_test},
    columns=['id', 'target'],
).set_index('id')

In [None]:
# Preview the first rows of the submission table.
submission_csv.head()

In [None]:
# saving to csv file (the 'id' index is written as the first column)
submission_csv.to_csv("./submission.csv")


### Here is the sample submission.

In [None]:
# Load the sample submission file provided with the competition data.
sample = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

In [None]:
# Preview the sample submission to compare its layout with ours.
sample.head()

In [None]:
# Scratch cell: a bare float literal that only echoes its own value.
1.470031e-01

In [None]:
# Scratch cell: a bare float literal that only echoes its own value.
9.941410e-01