In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix, mean_squared_error, mean_squared_log_error, classification_report, balanced_accuracy_score
from sklearn.metrics import log_loss

from sklearn.utils import class_weight
import matplotlib.pyplot as plt

from sklearn.cluster import AgglomerativeClustering, KMeans

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from imblearn.over_sampling import SMOTE

import optuna
from optuna.samplers import TPESampler

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the competition tables from the Kaggle input directory and preview the
# first rows of the training table.
test = pd.read_csv(filepath_or_buffer='../input/tabular-playground-series-may-2021/test.csv')
train = pd.read_csv(filepath_or_buffer='../input/tabular-playground-series-may-2021/train.csv')
train.head()

In [None]:
# Remove the 'id' column from both frames.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

# Number of training rows per value of 'target'.
train['target'].value_counts()

In [None]:
# Encode the class labels as integers and keep them in a one-column frame.
# `pop` removes 'target' from `train` in place, so `train` no longer
# contains the label column after this cell.
lencoder = LabelEncoder()
target = pd.DataFrame({'target': lencoder.fit_transform(train.pop('target'))})

In [None]:
# Bar chart of how many training rows carry each encoded label.
sns.countplot(data=target, x='target')

In [None]:
# Project the training features onto 16 principal components.
# random_state pins the result when scikit-learn selects its randomized SVD
# solver, so the derived features are identical on every Restart & Run All.
pca = PCA(n_components = 16, random_state = 2021)
pca_data = pca.fit_transform(train)

# percentage of variance explained by each component, rounded to one decimal
per_var = np.round(pca.explained_variance_ratio_*100, decimals = 1)
labels = ['PC' + str(x) for x in range (1, len(per_var)+1)]

# plot the percentage of explained variance by principal component
plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label = labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()

# scatter of the first two principal components; the axis labels carry the
# share of variance each one explains
pca_df = pd.DataFrame(pca_data, columns = labels)
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title('PCA')
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))
plt.show()

pca_df.head()

In [None]:
# Fit a seeded 5-cluster k-means on the principal-component scores.
# fit_transform returns the data in scikit-learn's cluster-distance space
# (one column per cluster); those five columns, named k1..k5, are the feature
# table used by the models in the following cells.
km = KMeans(n_clusters=5, init="random", n_init=10, max_iter=300, random_state=4241)

km_pca_df = pd.DataFrame(
    km.fit_transform(pca_df),
    columns=['k1', 'k2', 'k3', 'k4', 'k5'],
)
km_pca_df.head()

In [None]:
# Seeded hold-out split: 33% of the rows go to validation, stratified on
# `target`.
X_train, X_val, y_train, y_val = train_test_split(
    km_pca_df,
    target,
    stratify=target,
    test_size=0.33,
    random_state=2021,
)

In [None]:
# Baseline random forest on the k-means distance features.
# random_state makes the fitted forest (and the metrics derived from it)
# reproducible, in line with the seeded split, k-means and Optuna sampler
# used elsewhere in this notebook.
rf_model = RandomForestClassifier(n_estimators = 50,
                                  criterion = 'entropy',
                                  random_state = 2021)

# y_train is a one-column DataFrame; pass the column itself so the estimator
# receives a 1-D label vector rather than a column vector.
rf_model.fit(X_train, y_train['target'])

# Class-probability rows for the validation set; preview the first ten.
rf_pred = rf_model.predict_proba(X_val)
rf_pred[:10]

In [None]:
# Turn each row of class probabilities into a hard label by taking the index
# of the largest probability, then report validation metrics.
rf_preds = rf_pred.argmax(axis=1)
print(f'MSE Score: {mean_squared_error(y_val,rf_preds)}\n')
print(classification_report(y_val, rf_preds))

# Confusion matrix rendered as an annotated heatmap with integer counts.
rf_confusion = pd.DataFrame(confusion_matrix(y_val, rf_preds))
sns.heatmap(rf_confusion, annot=True, linewidths=.5, fmt="d")

In [None]:
# Distribution of the hard labels predicted by the baseline forest.
sns.countplot(data=pd.DataFrame({'target': rf_preds}), x='target')

In [None]:
# Fixed RandomForestClassifier settings shared by the tuning objective and the
# final model.
params = dict(criterion='entropy', max_features='log2')

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated log loss of a random forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators`` (30 to 50) and ``max_depth``
        (3 to 100).

    Returns
    -------
    float
        Mean log loss across the folds (lower is better). The scorer returns
        the negated loss, so the sign is flipped before returning.
    """
    n_estimators = trial.suggest_int('n_estimators',30,50)
    max_depth = trial.suggest_int('max_depth',3,100)

    # A fixed seed keeps the forest's own randomness identical across trials,
    # so differences in the score come from the sampled hyperparameters rather
    # than from run-to-run noise.
    model = RandomForestClassifier(**params,
            n_estimators = n_estimators,
            max_depth = max_depth,
            random_state = 2021
            )

    # y_train is a one-column DataFrame; hand the estimator the 1-D column.
    nll = cross_val_score(model,X_train,y_train['target'],scoring = 'neg_log_loss', cv = 5).mean()
    return -1*nll

In [None]:
# Minimise the objective with a seeded TPE sampler (a single trial is run),
# then report the number of trials, the best value and the best parameters.
sampler = TPESampler(seed=1111)
study = optuna.create_study(sampler=sampler, direction='minimize')
study.optimize(objective, n_trials=1)

print('numbers of the finished trials:' , len(study.trials))
for outcome in (study.best_value, study.best_params):
    print(outcome)

In [None]:
# Final random forest: the shared `params` plus fixed n_estimators/max_depth.
# random_state makes this model, its validation report and the submission
# generated from it reproducible across runs.
rf = RandomForestClassifier(**params,
                    n_estimators = 30,
                    max_depth = 26,
                    random_state = 2021
                    )

# y_train is a one-column DataFrame; pass the column itself so the estimator
# receives a 1-D label vector rather than a column vector.
rf.fit(X_train, y_train['target'])
preds = rf.predict(X_val)

print('Classification report:\n')
print(classification_report(y_val,preds))

# Confusion matrix rendered as an annotated heatmap with integer counts.
sns.heatmap(pd.DataFrame(confusion_matrix(y_val, preds)), annot=True, linewidths=.5, fmt="d")

In [None]:
# Distribution of the hard labels predicted by the final forest.
sns.countplot(data=pd.DataFrame({'target': preds}), x='target')

In [None]:
# Apply the transformations learned on the training data to the test set.
# transform (not fit_transform) is used on purpose: refitting PCA and k-means
# on the test rows would produce a different component basis and different
# cluster centres/order, so columns k1..k5 would no longer mean the same thing
# as the columns the classifier was trained on. It would also overwrite the
# fitted `pca` and `km` objects.
pca_test = pca.transform(test)
pca_test = pd.DataFrame(pca_test, columns = labels)

km_pca_test = km.transform(pca_test)
km_pca_test = pd.DataFrame(km_pca_test, columns = ['k1','k2','k3','k4','k5'] )
km_pca_test.head()


In [None]:
# Fill the sample submission's class columns with the final forest's predicted
# probabilities for the test features, write the CSV, and preview it.
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

class_columns = ['Class_1','Class_2', 'Class_3', 'Class_4']
test_proba = rf.predict_proba(km_pca_test.to_numpy())
sample_submission[class_columns] = test_proba

sample_submission.to_csv("my_submissionOPT.csv",index = False)
sample_submission.head()