In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and echo the full path of every file found.
INPUT_ROOT = '/kaggle/input'
for folder, _subdirs, names in os.walk(INPUT_ROOT):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing Libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

**Loading Dataset**

In [None]:
train = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/train.csv')
train.head(5)

In [None]:
test = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/test.csv')
test.head(2)

In [None]:
sub = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv')
sub.head(2)

**Shape of Dataset**

In [None]:
train.shape

**Looking at some statistical data**

In [None]:
train.describe()

**Summary of DataFrame**

In [None]:
train.info()

**Checking Missing Values**

In [None]:
train.isnull().sum()

**Handling Categorical Features**

In [None]:
# Encode the class labels as zero-based integers (Class_1 -> 0 ... Class_4 -> 3).
# Recent XGBoost releases reject label sets that do not start at 0, and the
# sorted class order (hence the predict_proba column order) is unchanged.
target_codes = {'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}
encoded_target = train['target'].map(target_codes)
# Series.map() turns any value missing from the mapping into NaN; fail loudly
# instead of letting NaN labels flow into the resamplers and models.
if encoded_target.isnull().any():
    raise ValueError("unexpected value(s) in train['target']; expected Class_1..Class_4 "
                     "(was this cell already run?)")
train['target'] = encoded_target.astype(int)

In [None]:
train.head(2)

**Drop Unnecessary Column**

In [None]:
# The row identifier carries no signal for the models; remove it from both frames.
id_columns = ['id']
train = train.drop(columns=id_columns)
test = test.drop(columns=id_columns)

**Checking Distribution of Dataset**

In [None]:
# Plot the distribution of every column (features and target) on an 8x7 grid.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is its documented replacement.
fig = plt.figure(figsize=(20, 25), facecolor='white')

# At most 52 panels are drawn, as in the original guard; enumerate replaces the
# hand-maintained plot counter.
for plotnumber, column in enumerate(train.columns[:52], start=1):
    ax = fig.add_subplot(8, 7, plotnumber)
    sns.histplot(train[column], kde=True, stat='density', ax=ax)
    ax.set_xlabel(column, fontsize=20)
fig.tight_layout()

**Separating Dependent and Independent Columns**

In [None]:
# Label vector first, then the feature matrix (every column except the label).
y = train['target']
X = train.drop(columns=['target'])

In [None]:
X.head(2)

**Handling Imbalanced Data**

In [None]:
y.value_counts()

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

In [None]:
# Resample the training data with SMOTETomek (imported from imblearn.combine,
# i.e. a combined over-/under-sampling method) using a fixed seed.
# Produces X_res / y_res; X and y themselves are left untouched.
smk = SMOTETomek(random_state=42)
X_res,y_res=smk.fit_resample(X,y)

In [None]:
y_res.value_counts()

In [None]:
# Features excluded from modelling; the next cell removes the same list from `test`.
low_value_features = ['feature_19', 'feature_30', 'feature_31', 'feature_32',
                      'feature_35', 'feature_38', 'feature_39', 'feature_42']

# The original cell dropped these from `X_ros`, which is only created further
# down (RandomOverSampler section), so a top-to-bottom run raised NameError.
# Trim `X` instead: every later resampler, feature selector and PCA fit is
# derived from `X`, so they all end up with the same columns as the trimmed
# `test` frame. `X_res` is trimmed too so the already-resampled frame agrees.
X = X.drop(columns=low_value_features)
X_res = X_res.drop(columns=low_value_features)

In [None]:
# Remove the excluded feature columns from the test frame.
excluded_test_features = [
    'feature_19', 'feature_30', 'feature_31', 'feature_32',
    'feature_35', 'feature_38', 'feature_39', 'feature_42',
]
test = test.drop(columns=excluded_test_features)

In [None]:
X_res.shape

In [None]:
y.shape

**RandomOverSampler to handle imbalanced data**

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Over-sample X / y with imbalanced-learn's RandomOverSampler (default strategy).
# The sampler is named `ros` rather than `os` so the `os` module imported in the
# first cell is not shadowed, and it is seeded so X_ros / y_ros are reproducible.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

In [None]:
y_ros.value_counts()

**Under Sampling**

In [None]:
# NearMiss was already imported alongside SMOTETomek; the import is repeated here.
from imblearn.under_sampling import NearMiss
# Under-sample X / y with NearMiss using its default settings.
# X_ns / y_ns are only inspected in the next cell.
ns=NearMiss()
X_ns,y_ns=ns.fit_resample(X,y)

In [None]:
y_ns.value_counts()

**Some Other Feature Engineering**

In [None]:
X_res.head(2)

In [None]:
col = X_res.columns

In [None]:
X_res.nunique()

**Feature Selection Using ExtraTreesClassifier**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

# Fit an ExtraTrees ensemble on the un-resampled data purely to read off
# feature importances. The forest is randomised, so it is seeded: otherwise the
# importance ranking (and the 0.02 cut-off applied below) changes on every run.
# NOTE: `model` is re-bound to the Keras network in the ANN section.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)

In [None]:
print(model.feature_importances_)

In [None]:
# Importance per feature, indexed by column name; chart the ten largest.
ranked_features = pd.Series(model.feature_importances_, index=X.columns)
top_ten = ranked_features.nlargest(10)
top_ten.plot(kind='barh')
plt.show()

In [None]:
ranked_features.sort_values(ascending=False)

In [None]:
# Keep the names of features whose ExtraTrees importance exceeds 0.02,
# in their original column order.
temp = ranked_features.index[ranked_features > 0.02].tolist()

In [None]:
df = X[temp]
df.head(2)

In [None]:
df_test = test[temp]
df_test.head(2)

**Feature Selection using mutual_info_classif**

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Estimate the mutual information between each feature and the target.
# The estimator is randomised; it is seeded so that the `!= 0` filter applied
# below selects the same columns on every run.
mutual_info = mutual_info_classif(X, y, random_state=42)

In [None]:
mutual_data=pd.Series(mutual_info,index=X.columns)
mutual_data.sort_values(ascending=False)

In [None]:
# Keep every feature whose mutual-information score is non-zero,
# in the original column order.
temp = mutual_data.index[mutual_data != 0].tolist()

In [None]:
df = X[temp]

In [None]:
df.head(2)

**Applying PCA**

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component on the over-sampled features, then plot
# the running total of the explained-variance ratio against the component count.
pca = PCA()
principalComponents = pca.fit_transform(X_ros)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

plt.figure()
plt.plot(cumulative_variance)
plt.title('Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')  # cumulative, per number of components
plt.show()

In [None]:
pca = PCA(n_components=30)
new_data = pca.fit_transform(X_ros)
# This will be the new data fed to the algorithm.
principal_Df = pd.DataFrame(data = new_data)

In [None]:
principal_Df.head(2)

In [None]:
test_data = pca.transform(test)
# This will be the new data fed to the algorithm.
principal_Df_test = pd.DataFrame(data = test_data)

In [None]:
principal_Df_test.head()

**Splitting Dataset into Train and Validation set**

In [None]:
### Only for ANN training
# One-hot encode the *resampled* labels. The features that are split below come
# from X_ros, so the labels must come from y_ros; encoding the original `y`
# yields a different number of rows and train_test_split rejects the pair.
Y = pd.get_dummies(y_ros)
Y.shape

In [None]:
# Create Train & Test Data
# Hold out 20% of the PCA-transformed, over-sampled rows for validation (seeded).
# NOTE(review): both the RandomOverSampler resampling and the PCA fit happen
# before this split, so duplicated minority rows can end up on both sides and
# validation scores may be optimistic — consider splitting first.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(principal_Df, y_ros, test_size=0.2, random_state=1)  ### Change y with "Y" while ANN training

**Standard Scaling**

In [None]:
# Standard Scaling
# Fit on the training split only, then apply the same transform to validation.
# NOTE: the Robust and Quantile cells below re-bind `scaler`, `X_train_scaled`
# and `X_val_scaled`; when all three cells run in order only the last one is
# what the models see.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

**Robust Scaling**

In [None]:
# Alternative scaler: fit on the training split, apply to validation.
# Overwrites the outputs of the StandardScaler cell and is itself overwritten
# by the QuantileTransformer cell when the notebook runs top to bottom.
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

**Quantile Transformer**

In [None]:
# Alternative scaler: fit on the training split, apply to validation.
# This is the last cell to bind `scaler`, so in a top-to-bottom run it is the
# transformer applied to the test components in the next cell and the one whose
# output feeds every model below.
from sklearn.preprocessing import QuantileTransformer
scaler = QuantileTransformer()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
test_scaled = scaler.transform(principal_Df_test)

**Importing all Classification Models**

In [None]:
!pip install catboost

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [None]:
# Candidate models, each paired with the label printed next to its score.
# SVC and MLPClassifier are intentionally left out of the comparison:
#   ('SVM :', SVC()), ('Neural Network :', MLPClassifier())
classifiers = [
    ('DecisionTree :', DecisionTreeClassifier()),
    ('RandomForest :', RandomForestClassifier()),
    ('Naive Bayes :', GaussianNB()),
    ('KNeighbours :', KNeighborsClassifier()),
    ('LogisticRegression :', LogisticRegression()),
    ('ExtraTreesClassifier :', ExtraTreesClassifier()),
    ('AdaBoostClassifier :', AdaBoostClassifier()),
    ('GradientBoostingClassifier: ', GradientBoostingClassifier()),
    ('XGB :', XGBClassifier()),
    ('LGBM :', LGBMClassifier(objective='multiclass', random_state=5)),
    ('Easy :', EasyEnsembleClassifier()),
    ('CatBoost :', CatBoostClassifier(logging_level='Silent')),
]

# One column per model, aligned on the validation index, next to the true labels.
predictions_df = pd.DataFrame({'actual_labels': y_val})

for name, classifier in classifiers:
    # Train on the scaled training split and score on the scaled validation split.
    classifier.fit(X_train_scaled, y_train)
    predictions = classifier.predict(X_val_scaled)
    column_name = name.strip(" :")
    predictions_df[column_name] = predictions
    print(name, accuracy_score(y_val, predictions))

In [None]:
# GPU-backed XGBoost baseline (bound to the name `ETC`), evaluated on the
# validation split with accuracy, confusion matrix and per-class report.
ETC = XGBClassifier(tree_method='gpu_hist')
ETC.fit(X_train_scaled, y_train)
predictions = ETC.predict(X_val_scaled)

validation_reports = (
    ("Accuracy :", accuracy_score),
    ("Confusion Matrix :", confusion_matrix),
    ("Classification :", classification_report),
)
for label, metric in validation_reports:
    print(label, metric(y_val, predictions))

In [None]:
from sklearn.metrics import log_loss
# Validation log-loss of the model fitted in the previous cell (`ETC`).
# The original cell called `sclf`, which is only created in the Stacking section
# further down, so a top-to-bottom run raised NameError here.
y_pred = ETC.predict_proba(X_val_scaled)
log_loss(y_val, y_pred)

In [None]:
# Class probabilities for the transformed test set from the model fitted above
# (`ETC`). `sclf` does not exist yet at this point in a top-to-bottom run; it is
# created in the Stacking section.
y_pred = ETC.predict_proba(test_scaled)

**Hyperparameter Tuning**

In [None]:
grid = {'max_depth': [3,4,5,7,9],'n_estimators':[100, 200, 300,400, 500],'learning_rate':[0.001,0.01,0.1]}

In [None]:
# Exhaustive 5-fold grid search over `grid`, scored on accuracy.
# The base estimator is created here: the original cell referenced `Cat`, which
# is only defined in the Bagging section further down (NameError on Run All).
# Constructor arguments mirror that later definition.
grid_estimator = CatBoostClassifier(verbose=False, task_type="GPU")
gscv = GridSearchCV(estimator=grid_estimator, param_grid=grid, scoring='accuracy', cv=5)
gscv.fit(X_train_scaled, y_train)

In [None]:
print(gscv.best_params_)

In [None]:
# CatBoost with hand-entered hyperparameters, trained on GPU with verbose logging
# and scored on the validation split.
# NOTE(review): presumably these values were copied from gscv.best_params_ of an
# earlier run — confirm, or pass **gscv.best_params_ so they cannot drift.
tuned_model = CatBoostClassifier(learning_rate= 0.1, max_depth= 5, n_estimators= 300, task_type = "GPU",verbose=True)
tuned_model.fit(X_train_scaled, y_train)
predictions = tuned_model.predict(X_val_scaled)
accuracy_score(y_val, predictions)

In [None]:
# Instantiates two estimators without fitting them.
# `ADB` is a GradientBoostingClassifier of depth-1 trees (despite the name);
# `XGB` is re-bound in the Stacking section before it is used.
ADB = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
XGB = XGBClassifier()
# Disabled experiment: fit/score ADB on the validation split.
# ADB.fit(X_train_scaled, y_train)
# predictions = ADB.predict(X_val_scaled)
# accuracy_score(y_val, predictions)

**Stacking**

In [None]:
# stacking
from mlxtend.classifier import StackingClassifier
XGB = XGBClassifier(tree_method = 'gpu_hist')
RFC = RandomForestClassifier()
ETC = ExtraTreesClassifier()

In [None]:
# mlxtend stacking: RandomForest and ExtraTrees are the first-level models and
# XGB is the meta-classifier; use_probas=True is passed so the meta-classifier
# is trained on first-level probabilities. Scored on the validation split.
sclf=StackingClassifier(classifiers=[RFC,ETC], use_probas=True, meta_classifier=XGB)
sclf.fit(X_train_scaled, y_train)
predictions = sclf.predict(X_val_scaled)
accuracy_score(y_val, predictions)

**Bagging**

In [None]:
from sklearn.ensemble import BaggingClassifier
Cat = CatBoostClassifier(verbose=False, task_type = "GPU")

In [None]:
# Bagging ensemble of 200 copies of `Cat` (the CatBoost GPU classifier defined in
# the previous cell — the `bag_xgb` name notwithstanding), each trained on a
# bootstrap sample of half the training rows, with out-of-bag scoring enabled.
# NOTE(review): n_jobs=-1 fits members in parallel while each member requests
# the GPU — confirm this does not contend for or exhaust device memory.
bag_xgb = BaggingClassifier(Cat,
                            n_estimators=200, max_samples=0.5,
                            bootstrap=True, random_state=0,oob_score=True, n_jobs=-1)
bag_xgb.fit(X_train_scaled, y_train)
predictions = bag_xgb.predict(X_val_scaled)
accuracy_score(y_val, predictions)

In [None]:
from sklearn.metrics import log_loss
y_pred = bag_xgb.predict_proba(X_val_scaled)
log_loss(y_val, y_pred)

In [None]:
y_pred = bag_xgb.predict_proba(test_scaled)

In [None]:
# One probability column per class, with the ids from the sample submission
# appended as the final column (aligned on the row index).
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
submission_cat = pd.DataFrame(y_pred, columns=class_columns).assign(id=sub['id'])

In [None]:
submission_cat.to_csv('./result.csv', index=None)

**Model training using ANN**

In [None]:
import tensorflow as tf
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

In [None]:
X_train_scaled.shape

In [None]:
y_train.shape

In [None]:
y_val.shape

In [None]:
# Fully connected softmax classifier: 32 -> 64 -> 128 -> 256 ReLU units with
# batch normalisation after the 64- and 128-unit layers, and a 4-way softmax head.
# The input width is taken from the training matrix instead of being hard-coded
# to 50: after PCA(n_components=30) the matrix has 30 columns, and a fixed
# input_dim=50 makes model.fit reject the data.
n_input_features = X_train_scaled.shape[1]

model = keras.models.Sequential([
    keras.layers.Dense(activation="relu", input_dim=n_input_features, units=32, kernel_initializer="uniform"),
    keras.layers.Dense(activation="relu", units=64, kernel_initializer="uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(activation="relu", units=128, kernel_initializer="uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(activation="relu", units=256, kernel_initializer="uniform"),
    keras.layers.Dense(activation="softmax", units=4, kernel_initializer="uniform")
])

In [None]:
model.summary()

In [None]:
# Training configuration: 50 epochs, Adam with default settings, accuracy metric.
# NOTE(review): categorical_crossentropy is normally used with one-hot targets;
# the split cell says to swap `y` for the one-hot `Y` for ANN training — confirm
# y_train / y_val are one-hot before calling model.fit.
epochs = 50
opt = Adam()
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [None]:
history = model.fit(X_train_scaled, y_train, batch_size=32, epochs=epochs, validation_data=(X_val_scaled,y_val))

In [None]:
# Training vs. validation accuracy per epoch.
accuracy_curves = history.history
for curve_key in ('accuracy', 'val_accuracy'):
    plt.plot(accuracy_curves[curve_key])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
loss_curves = history.history
for curve_key in ('loss', 'val_loss'):
    plt.plot(loss_curves[curve_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Predict on the test set after the same PCA + scaler pipeline the network was
# trained on (`test_scaled`). The raw `test` frame has a different number of
# columns and is on a different scale, so predicting on it either fails on the
# input shape or yields meaningless probabilities.
y_pred = model.predict(test_scaled)

In [None]:
sub = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv')

In [None]:
sub[['Class_1','Class_2','Class_3','Class_4']] = y_pred
sub.head(10)

In [None]:
sub.to_csv('./result.csv', index=None)