## COMPETITION LINK 
https://www.kaggle.com/c/tabular-playground-series-nov-2021/overview
### My kaggle account https://www.kaggle.com/jokkojja

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.svm import LinearSVC # faster then SVC
import multiprocessing
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Number of CPUs reported by the OS; passed as `n_jobs` to the scikit-learn
# calls further down so cross-validation can use every core.
MAX_CORE_COUNT = multiprocessing.cpu_count()

# DATA EXPLORATION

In [None]:
# Directory holding the competition CSV files. Defined once so the location
# can be changed in a single place instead of in every read_csv call.
DATA_DIR = '../input/tabular-playground-series-nov-2021'

# Labelled training rows and the unlabelled rows to predict for the submission.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Print the (rows, columns) shape, then preview the first rows of the frame.
print(f'train shape -  {train.shape}')
train.head()

The train dataset has 600000 rows and 102 columns.

In [None]:
# Summary statistics, transposed so that each feature is a row.
train.describe().transpose()

The standard deviations are huge, so the data needs to be scaled.

In [None]:
# Missing-value count per column, collapsed to the distinct counts that occur.
# A single value of 0 means no column contains NaN.
missing_per_column = train.isna().sum()
missing_per_column.unique()

There are no NaN values in train.

In [None]:
# Distinct label values; two values means a binary classification task.
train['target'].unique()

The target takes two values, so this is a binary classification problem.

In [None]:
# Class balance of the label.
# The series is passed by keyword: seaborn's first positional parameter is
# `data`, so a bare positional series is not interpreted as the x variable
# by current releases (older ones only emitted a FutureWarning).
ax = sns.countplot(x=train.target)
ax.set_title('Count of target types')
ax.set_xlabel('Target')
ax.set_ylabel('Count')
plt.show()

The training set is balanced.

In [None]:
# One histogram per feature; the row id and the label are left out.
feature_frame = train.drop(columns=['id', 'target'])
feature_frame.hist(figsize=(30, 30))
plt.show()

The features show bimodal distributions and many outliers.

In [None]:
# Pairwise correlation of all columns, drawn as an un-annotated heat map.
plt.figure(figsize=(20, 12))
correlation_matrix = train.corr()
sns.heatmap(correlation_matrix, annot=False)

There are no medium or strong correlations between the columns.

# BASELINE

In [None]:
# Feature matrix: every column except the row id and the label.
X = train.drop(columns=['id', 'target'])
# Label vector.
y = train['target']

# Hold-out split with scikit-learn's default test size; the fixed
# random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Disabled alternative baseline (dabl), kept for reference:
# sc = dabl.SimpleClassifier().fit(X_train, y_train)
# sc.score(X_test, y_test)
# sc.log_

# Baseline: an untuned decision tree, scored by cross-validated ROC AUC
# on the training split.
simple_clf = DecisionTreeClassifier()
baseline_fold_scores = cross_val_score(
    simple_clf,
    X_train,
    y_train,
    scoring='roc_auc',
    n_jobs=MAX_CORE_COUNT,
)
baseline = np.mean(baseline_fold_scores)

In [None]:
# Report the mean cross-validated ROC AUC of the untuned baseline tree.
print(f'roc auc score baseline - {baseline}')

Make a baseline prediction using a simple model without tuning. Baseline score: mean ROC AUC of 0.5517265092369377.

# MODELING

## RandomForestClassifier

In [None]:
# Untuned random forest, wrapped once per scaler so the two preprocessing
# choices can be compared under identical cross-validation.
rndf_clf = RandomForestClassifier()

standart_pipeline = Pipeline(steps=[
    ('standart_scaler', StandardScaler()),
    ('model', rndf_clf),
])
minmax_pipeline = Pipeline(steps=[
    ('min_max_scaler', MinMaxScaler()),
    ('model', rndf_clf),
])

I decided to use pipelines with two types of scalers: the min-max scaler and the standard scaler.

In [None]:
# Scaler label -> random-forest pipeline using that scaler.
rndf_clf_pipeline_dict = dict(
    standart=standart_pipeline,
    min_max=minmax_pipeline,
)
# Empty container reserved for this model's scores.
random_forest_score_list = dict()

In [None]:
def model_cross_val_score(my_pipeline_dict, X=None, y=None, cv=None):
    """Cross-validate every pipeline, print and return its mean ROC AUC.

    Parameters
    ----------
    my_pipeline_dict : dict
        Maps a label (here: the scaler name) to an estimator or pipeline.
    X, y : array-like, optional
        Data to cross-validate on. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, as before.
    cv : int or cross-validation splitter, optional
        Forwarded to ``cross_val_score``; ``None`` keeps its default.

    Returns
    -------
    dict
        Label -> mean ROC AUC, in the iteration order of ``my_pipeline_dict``.
        Returning the scores lets callers store and compare them instead of
        reading them back from the printed output.
    """
    # Resolve the notebook globals lazily so explicit data never touches them.
    features = X_train if X is None else X
    labels = y_train if y is None else y

    mean_scores = {}
    for key, pipe in my_pipeline_dict.items():
        fold_scores = cross_val_score(pipe, features, labels, scoring='roc_auc',
                                      cv=cv, n_jobs=MAX_CORE_COUNT)
        mean_score = np.mean(fold_scores)
        mean_scores[key] = mean_score
        print(f"mean roc auc score: {mean_score}, for scaler {key}")
    return mean_scores

In [None]:
# Cross-validate the random forest under each scaler in the dict.
model_cross_val_score(rndf_clf_pipeline_dict) 
# Only StandardScaler and MinMaxScaler are compared here.
# Four scalers (StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer)
# would be preferable, but training already takes too long.

## GradientBoostingClassifier

In [None]:
# Gradient boosting with 100 trees and squared-error split quality.
# `loss` is left at its default, which is the same binomial log-loss that
# older releases spelled 'deviance'. The criterion uses the current name
# 'squared_error' instead of the removed alias 'mse'; the old spellings are
# rejected by recent scikit-learn versions.
# NOTE: 'squared_error' requires scikit-learn >= 1.0.
grad_boost_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                            criterion='squared_error')
standart_pipeline = Pipeline([('standart_scaler', StandardScaler()), ('model', grad_boost_clf)])
minmax_pipeline = Pipeline([('min_max_scaler', MinMaxScaler()), ('model', grad_boost_clf)])

In [None]:
# Scaler label -> gradient-boosting pipeline using that scaler.
grad_boost_clf_pipeline_dict = dict(
    standart=standart_pipeline,
    min_max=minmax_pipeline,
)
# Empty container reserved for this model's scores.
grad_boost_score_list = dict()

In [None]:
# Author's note: gradient boosting gives a bad score.
model_cross_val_score(grad_boost_clf_pipeline_dict) 
# Scalers of interest: StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer

## KNeighborsClassifier

In [None]:
# Untuned k-nearest-neighbours classifier, wrapped once per scaler.
k_neigh_clf = KNeighborsClassifier()

standart_pipeline = Pipeline(steps=[
    ('standart_scaler', StandardScaler()),
    ('model', k_neigh_clf),
])
minmax_pipeline = Pipeline(steps=[
    ('min_max_scaler', MinMaxScaler()),
    ('model', k_neigh_clf),
])

In [None]:
# Scaler label -> k-nearest-neighbours pipeline using that scaler.
k_neigh_clf_pipeline_dict = dict(
    standart=standart_pipeline,
    min_max=minmax_pipeline,
)
# Empty container reserved for this model's scores.
k_neigh_score_list = dict()

In [None]:
# model_cross_val_score(k_neigh_clf_pipeline_dict) 
# I decided not to use this model; it is very slow.

## LogisticRegression

In [None]:
# Untuned logistic regression, wrapped once per scaler.
log_reg_clf = LogisticRegression()

standart_pipeline = Pipeline(steps=[
    ('standart_scaler', StandardScaler()),
    ('model', log_reg_clf),
])
minmax_pipeline = Pipeline(steps=[
    ('min_max_scaler', MinMaxScaler()),
    ('model', log_reg_clf),
])

In [None]:
# Scaler label -> logistic-regression pipeline using that scaler.
log_reg_clf_pipeline_dict = dict(
    standart=standart_pipeline,
    min_max=minmax_pipeline,
)
# Empty container reserved for this model's scores.
log_reg_clf_score_list = dict()

In [None]:
# Cross-validate logistic regression under each scaler in the dict.
model_cross_val_score(log_reg_clf_pipeline_dict) 
# Author's note: the best score is obtained with the standard scaler.

## GaussianNB 

In [None]:
# Gaussian naive Bayes, wrapped once per scaler.
gauss_clf = GaussianNB()

standart_pipeline = Pipeline(steps=[
    ('standart_scaler', StandardScaler()),
    ('model', gauss_clf),
])
minmax_pipeline = Pipeline(steps=[
    ('min_max_scaler', MinMaxScaler()),
    ('model', gauss_clf),
])

In [None]:
# Scaler label -> naive-Bayes pipeline using that scaler.
gauss_clf_pipeline_dict = dict(
    standart=standart_pipeline,
    min_max=minmax_pipeline,
)
# Empty container reserved for this model's scores.
gauss_clf_score_list = dict()

In [None]:
# Cross-validate Gaussian naive Bayes under each scaler in the dict.
model_cross_val_score(gauss_clf_pipeline_dict) 
# Author's note: the score is very bad.

## SVC

In [None]:
# Linear support-vector classifier with the iteration cap set to 4000,
# wrapped once per scaler.
svc_clf = LinearSVC(max_iter=4000)

standart_pipeline = Pipeline(steps=[
    ('standart_scaler', StandardScaler()),
    ('model', svc_clf),
])
minmax_pipeline = Pipeline(steps=[
    ('min_max_scaler', MinMaxScaler()),
    ('model', svc_clf),
])

In [None]:
# Scaler label -> linear-SVC pipeline using that scaler.
svc_clf_pipeline_dict = dict(
    standart=standart_pipeline,
    min_max=minmax_pipeline,
)
# Empty container reserved for this model's scores.
svc_clf_score_list = dict()

In [None]:
# Cross-validate the linear SVC under each scaler in the dict.
model_cross_val_score(svc_clf_pipeline_dict) 

Logistic regression with the standard scaler is the best. KNeighborsClassifier is very slow, so I commented out its cell.

# TUNING

We need to tune the hyperparameters of the best model to get the best score.

In [None]:
# Scaler for the tuning stage. It is kept as a standalone object (not inside
# a pipeline) because the same fitted instance is reused later to transform
# the competition test set.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
# NOTE: X_train / X_test are overwritten, so re-running this cell refits the
# scaler on already-scaled data; re-run from the split cell instead.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Estimator to tune, and the grid to search: iteration caps from 1000 to
# 11000 and inverse regularisation strengths C from 0.2 in steps of 0.2.
# The single-valued n_jobs entry is forwarded to every LogisticRegression fit.
model = LogisticRegression()
params = {
    'max_iter': range(1000, 12000, 1000),
    'C': np.arange(0.2, 2, 0.2),
    'n_jobs': [MAX_CORE_COUNT],
}

In [None]:
# Exhaustive 5-fold search scored by ROC AUC.
# n_jobs is set on the search itself so the candidate/fold fits run in
# parallel. The 'n_jobs' entry in the parameter grid only reaches
# LogisticRegression, which uses it for parallelising over classes, so it
# does not speed up this binary problem.
grid = GridSearchCV(model, params, cv=5, scoring='roc_auc', verbose=3,
                    n_jobs=MAX_CORE_COUNT)

In [None]:
# Run the grid search on the scaled training split.
grid.fit(X_train, y_train)
# Author's note: tuning `solver` as well was attempted, but it raised an error.

In [None]:
# Best mean cross-validated ROC AUC found by the search.
grid.best_score_

In [None]:
# Hyper-parameter combination with the best cross-validated score.
grid.best_params_

In [None]:
# Estimator with the winning parameters, used for the final predictions.
best_model = grid.best_estimator_

# Prediction and submission

In [None]:
# Columns that are not model inputs: the row id and the label.
not_need_feature = ['id', 'target']
# Remaining train columns, in their original order.
features = [
    column_name
    for column_name in train.columns
    if column_name not in not_need_feature
]

In [None]:
# Select the model inputs from the competition test set and scale them with
# the scaler that was fitted on the training split.
test_feature_values = test[features].values
x_pred = scaler.transform(test_feature_values)

In [None]:
# The model was selected on ROC AUC, which ranks predictions. Submit the
# probability of the positive class instead of hard 0/1 labels, because
# thresholded labels throw away the ranking information.
# Column 1 of predict_proba corresponds to the second entry of
# best_model.classes_, i.e. the positive class for 0/1 labels.
test['target'] = best_model.predict_proba(x_pred)[:, 1]

In [None]:
# Write the submission file: one row per test id with its prediction and
# no index column.
submission = test[['id', 'target']]
submission.to_csv('tab_comp_log_reg_pred.csv', index=False)