In [None]:
# Author: Pierre Jeanne
# Date Created:  31 May 2021

# Tabular Playground Series - Jun 2021
Each row in the dataset has been labeled with one true `Class`. For each row, you must submit the predicted probabilities that the product belongs to each class label. 



In [None]:
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

# scale the data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import power_transform

# classification model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import log_loss

<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">1: Description of the data
</p>
</div>

In [None]:
# Read the training set and discard the `id` column in one chained expression,
# then report the resulting shape and preview the first three rows.
df_train = pd.read_csv(r'../input/tabular-playground-series-jun-2021/train.csv').drop(columns=['id'])
print(df_train.shape)
df_train.head(3)

In [None]:
# Read the test set and discard its `id` column, then report the shape.
df_test = pd.read_csv(r'../input/tabular-playground-series-jun-2021/test.csv').drop(columns=['id'])
print(df_test.shape)

In [None]:
# Total number of null cells in each frame (per-column null counts, summed).
n_missing_train = sum(df_train.isnull().sum())
print('there is {} missing value in the training set'.format(n_missing_train))
n_missing_test = sum(df_test.isnull().sum())
print('there is {} missing value in the testing set'.format(n_missing_test))

In [None]:
# Check dtypes: print the per-column dtype and non-null count summary of the
# training frame.
df_train.info()

Only integers, except for the target.

In [None]:
# Check distribution with scipy.stats.normaltest.
# Null hypothesis: the sample comes from a normal distribution. A p-value
# below `alpha` therefore REJECTS normality; otherwise the test found no
# evidence against it.
from scipy import stats

alpha = 1e-3  # significance level, constant for every column
# Every column except the last one (presumably the target -- verify column order).
for col in df_train.columns[:-1]:
    k2, p = stats.normaltest(df_train[col])
    if p < alpha:  # null hypothesis rejected
        print("{} is not normally distributed".format(col))
    else:
        print("{} is normally distributed".format(col))

In [None]:
# Transposed summary statistics with a bar on the mean and colour gradients
# on the standard deviation and the median.
(
    df_train.describe().T.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# check target variables
sns.countplot(x='target',data=df_train)
plt.show()

The data are imbalanced.

<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">2: Data preparation
</p>
</div>

## 2.1: Are there any duplicates?

In [None]:
# Drop any column whose label repeats an earlier column label (the first
# occurrence is kept); the shape is printed before and after for comparison.
print(df_train.shape)
is_repeated_label = df_train.columns.duplicated()
df_train = df_train.loc[:, ~is_repeated_label]
print(df_train.shape)

In [None]:
# Delete duplicate rows, if any (drop_duplicates compares whole rows and keeps
# the first occurrence); the shape is printed before and after for comparison.
print(df_train.shape)
df_train = df_train.drop_duplicates()
print(df_train.shape)


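## 2.2: Are there any outliers?
Remove rows containing values more than five standard deviations from their column mean (a threshold of three standard deviations was also tried; both results are listed below).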

In [None]:
# Work on a features-only frame and keep the labels aside as their own Series.
df_train2 = df_train.drop(columns='target')
target = df_train['target']

In [None]:
# Replace outliers by NaN values: a cell is an outlier when its absolute
# z-score, computed against its own column's mean and standard deviation,
# exceeds 5.
raw_values = df_train2.values
z_scores = (raw_values - raw_values.mean(axis=0)) / raw_values.std(axis=0)
is_outlier = np.abs(z_scores) > 5
df_train2 = pd.DataFrame(
    np.where(is_outlier, np.nan, raw_values),
    index=df_train2.index,
    columns=df_train2.columns,
)


With a threshold of 5 standard deviations: 1.84414

With a threshold of 3 standard deviations: 1.85511

In [None]:
# Re-attach the labels to the masked features, then discard every row that
# contains at least one NaN (i.e. at least one masked outlier).
df_train3 = pd.concat([df_train2, target], axis=1).dropna()
df_train3.shape

In [None]:
def stat_df(df):
    """Append row-wise summary statistics of the numeric columns to ``df``.

    All statistics are computed from one snapshot of the numeric input
    columns, so no statistic is polluted by a statistic column appended
    earlier (e.g. ``mean`` no longer averages over ``sum``), and non-numeric
    columns such as a string label never enter the reductions. Columns that
    carry one of the statistic names from a previous call are excluded from
    the snapshot, which makes repeated calls on the same frame idempotent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise row by row. It is modified in place. A genuine
        input column named like one of the statistics (``sum``, ``mean``, ...)
        would be treated as a stale statistic and overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns ``sum``, ``mean``, ``std``,
        ``q01`` ... ``q99``, ``skew`` and ``kurtosis`` added (or refreshed).
    """
    quantile_levels = (
        ('q01', 0.01), ('q05', 0.05), ('q10', 0.1), ('q20', 0.2),
        ('q30', 0.3), ('q40', 0.4), ('q50', 0.5), ('q60', 0.6),
        ('q70', 0.7), ('q80', 0.8), ('q90', 0.9), ('q95', 0.95),
        ('q99', 0.99),
    )
    stat_columns = (
        ['sum', 'mean', 'std']
        + [name for name, _ in quantile_levels]
        + ['skew', 'kurtosis']
    )

    # Snapshot of the raw numeric features: taken once, before any statistic
    # column is written, so every reduction below sees identical input.
    features = (
        df.drop(columns=stat_columns, errors='ignore')
          .select_dtypes(include='number')
    )

    df['sum'] = features.sum(axis=1)
    df['mean'] = features.mean(axis=1)
    df['std'] = features.std(axis=1)
    for name, level in quantile_levels:
        df[name] = features.quantile(q=level, axis=1)
    df['skew'] = features.skew(axis=1)
    df['kurtosis'] = features.kurtosis(axis=1)
    return df

In [None]:
# Shape before the row-wise statistic columns are appended.
df_train3.shape

In [None]:
# Compute the row-wise statistics on the feature columns only: the non-integer
# `target` column must not take part in numeric reductions (recent pandas
# versions raise on mixed numeric/string rows instead of silently skipping
# the column). The labels are re-attached afterwards, aligned on the index.
df_train3 = pd.concat(
    [stat_df(df_train3.drop(columns='target')), df_train3['target']],
    axis=1,
)

In [None]:
# Shape after the row-wise statistic columns have been appended.
df_train3.shape

In [None]:
# df_train3 = pd.concat([df_train3,target],axis =1)
# df_train3.head(5)

In [None]:
from imblearn.over_sampling import SMOTE

# Separate the engineered features from the class labels.
X = df_train3.drop('target',axis=1)
y = df_train3[['target']].values.ravel()
# Use SMOTE to balance the imbalanced classes. A fixed random_state makes the
# synthetic samples, and everything trained on them, reproducible.
# NOTE(review): the oversampling runs before the train/validation split made
# further down, so the hold-out set contains synthetic rows and its log-loss
# is probably optimistic -- consider resampling the training fold only.
df_train_resampled, target_resampled = SMOTE(random_state=0).fit_resample(X, y)

# plot result
df_train_resampled = pd.DataFrame(df_train_resampled, columns=X.columns )
df_target_resampled = pd.DataFrame(target_resampled,columns = ['target'])
# check target variables: every class should now have the same count
sns.countplot(x='target',data=df_target_resampled)
plt.show()

<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">2: Machine learning
</p>
</div>

### 2.1: merge test and train data

In [None]:
# Stack the resampled training features on top of the test features under a
# fresh 0..n-1 index; ntrain marks where the training rows end.
# NOTE(review): in notebook order df_test has not been through stat_df yet, so
# the statistic columns are presumably NaN for the test rows here -- verify.
ntrain = len(df_train_resampled)
ntest = len(df_test)
all_data = pd.concat([df_train_resampled, df_test], ignore_index=True)
print("all_data size is : {}".format(all_data.shape))

### 2.2: scale data

In [None]:
# Fit the standardiser on the combined train+test frame, then transform that
# same frame with the fitted scaler.
scaler = StandardScaler().fit(all_data)
all_data_stand = scaler.transform(all_data)

# max_abs_scaler = MaxAbsScaler()
# all_data_MaxAbsScaler = max_abs_scaler.fit_transform(all_data)

# min_max_scaler = MinMaxScaler()
# all_data_MinMaxScaler = min_max_scaler.fit_transform(all_data)

# Robust = RobustScaler()
# all_data_Robust = Robust.fit_transform(all_data)

# all_data_power = power_transform(all_data)


### 2.3: split the df with all data into the training and testing set

In [None]:
from sklearn.model_selection import train_test_split
# 
# Cut the scaled array back into its training rows (first ntrain) and test
# rows (the remainder).
df_train_scaler, df_test_scaler = all_data_stand[:ntrain], all_data_stand[ntrain:]
# # 
# df_train_MaxAbsScaler = all_data_MaxAbsScaler[:ntrain]
# df_test_MaxAbsScaler = all_data_MaxAbsScaler[ntrain:]
# # 
# df_train_MinMaxScaler = all_data_MinMaxScaler[:ntrain]
# df_test_MinMaxScaler = all_data_MinMaxScaler[ntrain:]
# 
# df_train_Robust = all_data_Robust[:ntrain]
# df_test_Robust = all_data_Robust[ntrain:]
# 
# df_train_power = all_data_power[:ntrain]
# df_test_power = all_data_power[ntrain:]

In [None]:
# Sanity-check the split of the scaled data. The power-transformed arrays are
# only created in commented-out code, so the standard-scaled arrays that are
# actually defined are reported instead.
print(df_train_scaler.shape)
print(df_test_scaler.shape)

In [None]:
# Hold out 10% of the resampled, scaled training rows for validation
# (fixed seed so the split is repeatable).
(X_train_scaler, X_test_scaler,
 y_train_scaler, y_test_scaler) = train_test_split(
    df_train_scaler, target_resampled, test_size=0.1, random_state=0
)

# X_train_MaxAbsScaler, X_test_MaxAbsScaler, y_train_MaxAbsScaler, y_test_MaxAbsScaler =\
# train_test_split(df_train_MaxAbsScaler, target_resampled,test_size = 0.2,random_state=0)

# X_train_MinMaxScaler, X_test_MinMaxScaler, y_train_MinMaxScaler, y_test_MinMaxScaler =\
# train_test_split(df_train_MinMaxScaler, target_resampled,test_size = 0.2,random_state=0)

# X_train_Robust, X_test_Robust, y_train_Robust, y_test_Robust =\
# train_test_split(df_train_Robust, target_resampled,test_size = 0.2,random_state=0)

# X_train_power, X_test_power, y_train_power, y_test_power =\
# train_test_split(df_train_power, target_resampled,test_size = 0.1,random_state=0)

In [None]:
# Three independent, initially empty result containers.
list_score, list_model, list_scaler = [], [], []

In [None]:
from sklearn.ensemble import RandomForestClassifier

# clf = RandomForestClassifier(n_estimators=25)
# clf.fit(X_train_scaler, y_train_scaler)
# clf_probs = clf.predict_proba(X_test_scaler)

In [None]:
# # Setup the parameters and distributions to sample from: param_dist
# parameters = {'bootstrap': [False],
#  'max_depth': [30,40,50,60,70,90],
#  'max_features': ['auto', 'sqrt'],
#  'min_samples_leaf': [1, 2, 3],
#  'min_samples_split': [2, 3,4,5],
#  'n_estimators': [50]}

# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = parameters, n_iter = 50, cv = 3, verbose=10, random_state=42,
#                                n_jobs = -1)
# # Fit the random search model
# rf_random.fit(X_train_Robust, y_train_Robust)

In [None]:
# rf_random.best_params_

In [None]:
# Random forest with the hyper-parameters chosen for this notebook. A fixed
# random_state makes the fitted forest, and therefore the reported log-loss
# and the submission, reproducible across runs.
clf = RandomForestClassifier(n_estimators=220, min_samples_split = 2, min_samples_leaf = 1,
                             max_features = 'sqrt', max_depth = 60, bootstrap = False,
                             random_state = 0)
clf.fit(X_train_scaler, y_train_scaler)
# Class probabilities on the hold-out split, scored in the next cell.
clf_probs = clf.predict_proba(X_test_scaler)

In [None]:
# Log-loss of the forest's predicted probabilities on the hold-out split.
score = log_loss(y_test_scaler, clf_probs)

print("Log-loss of", f" * uncalibrated classifier: {score:.3f}", sep="\n")

### Prediction on df_test

In [None]:
# Append the row-wise statistic columns to the test features.
# NOTE: stat_df writes the new columns into the frame it receives and returns
# that same object, so df_test itself is modified and df_test2 refers to it.
df_test2 = stat_df(df_test)

In [None]:
# Scale the engineered test features with the scaler fitted earlier and
# predict class probabilities for the submission.
# df_test2 (the frame returned by stat_df) is used explicitly instead of
# relying on stat_df having modified df_test in place as a side effect.
X = scaler.transform(df_test2)
# X = power_transform(df_test2)
clf_probs = clf.predict_proba(X)
# predict_proba orders its columns like clf.classes_; the labels below assume
# that order is Class_1 ... Class_9 -- TODO confirm against clf.classes_.
clf_probs = pd.DataFrame(clf_probs,columns=['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9'])
clf_probs

In [None]:
# One-column frame of submission ids: the row position offset by 200000.
# NOTE(review): assumes the test ids are consecutive and start at 200000 (the
# original `id` column was dropped when the test set was loaded) -- verify.
index = pd.DataFrame({'id': 200000 + clf_probs.index})

In [None]:
# Assemble the submission: the id column followed by the nine class
# probability columns, side by side.
final_test= pd.concat([index,clf_probs],axis=1)
# final_test = final_test.set_index('id')
# Write without the DataFrame index so `id` stays the first CSV column.
final_test.to_csv('sub8.csv',index=False)

# Preview the first rows of the file that was just written.
final_test.head()

In [None]:
# Inspect the predicted probabilities of the first test row. clf_probs is a
# DataFrame with string column labels, so `clf_probs[0]` would look for a
# column named 0 and raise KeyError; positional row access needs .iloc.
clf_probs.iloc[0]