In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Data Visualization

In [None]:
orig_train_data = pd.read_csv("data/Tabular Playground Series - May 2021/train.csv")
orig_test_data = pd.read_csv("data/Tabular Playground Series - May 2021/test.csv")

In [None]:
orig_train_data.head()

In [None]:
orig_test_data.head()

### Dropping ID Column

In [None]:
# Remove the `id` column from the training frame.
# Reassigning (instead of `inplace=True`) with `errors="ignore"` keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
orig_train_data = orig_train_data.drop(columns="id", errors="ignore")

In [None]:
# One bar chart per column: how often each distinct value occurs.
for column in orig_train_data.columns:
    # Plot straight from the value_counts() Series. Wrapping it in a DataFrame
    # and looking the counts up by the column's own name breaks on pandas >= 2.0,
    # where the counts column is called "count".
    counts = orig_train_data[column].value_counts()
    sns.barplot(x=counts.index, y=counts.values)
    plt.xlabel("Value")
    plt.ylabel("Count")
    plt.title(column)
    plt.show()

In [None]:
# NOTE(review): the pair plot is left disabled - presumably because plotting
# every column pair is too slow; confirm, then either enable or delete it.
# sns.pairplot(orig_train_data)

In [None]:
# Preview the first rows of the training frame.
orig_train_data.head()

### Review Data Distribution

In [None]:
class_1_data = orig_train_data[orig_train_data.target == "Class_1"]
class_1_data.shape

In [None]:
class_2_data = orig_train_data[orig_train_data.target == "Class_2"]
class_2_data.shape

In [None]:
class_3_data = orig_train_data[orig_train_data.target == "Class_3"]
class_3_data.shape

In [None]:
class_4_data = orig_train_data[orig_train_data.target == "Class_4"]
class_4_data.shape

In [None]:
type(class_1_data), type(class_2_data), type(class_3_data), type(class_4_data)

### Balance the Data

In [None]:
class_2_data = class_2_data.iloc[:8490, :]
class_2_data.shape

In [None]:
class_3_data = class_3_data.iloc[:8490, :]
class_3_data.shape

In [None]:
class_4_data = class_4_data.iloc[:8490, :]
class_4_data.shape

In [None]:
type(class_1_data), type(class_2_data), type(class_3_data), type(class_4_data)

### Combine the Class 1 to 4 Subsets

In [None]:
from sklearn.utils import shuffle

# Stack the four per-class subsets into one frame, then shuffle the rows so
# the classes are interleaved. The shuffle is seeded so that a fresh
# "Restart & Run All" reproduces the same row order (and everything derived
# from it), and `balanced_data` is only ever bound to a DataFrame.
balanced_data = pd.concat([class_1_data, class_2_data, class_3_data, class_4_data])
balanced_data = shuffle(balanced_data, random_state=42).reset_index(drop=True)
balanced_data.head()

### Remove Duplicates

In [None]:
balanced_data = balanced_data[balanced_data.duplicated() == False]
balanced_data.head(5)

In [None]:
target = balanced_data["target"]

In [None]:
balanced_data.drop("target", axis=1, inplace=True)

In [None]:
balanced_data.head()

### Check for Correlated Features

In [None]:
# Pairwise correlation of the numeric columns only: the string `target` column
# cannot be correlated and makes DataFrame.corr() raise on pandas >= 2.0.
corr = orig_train_data.select_dtypes(include="number").corr()

# Set the seaborn style before the axes are created so it applies to this figure.
sns.set_style(style="white")
f, ax = plt.subplots(figsize=(11, 9))

# Hide the upper triangle (including the diagonal); the matrix is symmetric.
# The builtin `bool` replaces the `np.bool` alias, which NumPy has removed.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(10, 250, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

### Apply MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range. `scaler` is fitted here on
# the training features and is reused later in the notebook.
scaler = MinMaxScaler()
balanced_data = pd.DataFrame(
    scaler.fit_transform(balanced_data),
    columns=balanced_data.columns,
    # fit_transform returns a bare array; carry the original index over so the
    # scaled rows keep the same labels as the rows of `target`.
    index=balanced_data.index,
)
# Preview only - avoid rendering the whole frame.
balanced_data.head()

### Prepare and Process Test Data

In [None]:
orig_test_data.head()

In [None]:
orig_test_data_id = orig_test_data.id
orig_test_data.drop("id", axis=1, inplace=True)
orig_test_data_id.head()

In [None]:
orig_test_data.head()

### Scale Test Data

In [None]:
# Apply the scaling learned from the training features to the test features.
# Use transform(), not fit_transform(): refitting here would compute new
# min/max values from the test set, so the same raw value would map to
# different scaled values in train and test.
# NOTE: this cell overwrites `orig_test_data`; run it once per kernel session,
# otherwise the already-scaled data would be scaled a second time.
orig_test_data = pd.DataFrame(
    scaler.transform(orig_test_data), columns=orig_test_data.columns
)
orig_test_data.head(5)

### SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

sgd_pipeline = make_pipeline(SGDClassifier(loss="log"))

sgd_pipeline.fit(X=balanced_data, y=target)

In [None]:
sgd_output = pd.DataFrame(
    sgd_pipeline.predict_proba(balanced_data), columns=sgd_pipeline.classes_
)
sgd_output.head()

### Predicting with SGD Classifier

In [None]:
sgd_output = pd.DataFrame(
    sgd_pipeline.predict_proba(orig_test_data), columns=sgd_pipeline.classes_
)

In [None]:
sgd_output.to_csv("data/Tabular Playground Series - May 2021/sgd_output_submission.csv")

### OneVsRestClassifier

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

ovr_pipeline = make_pipeline(OneVsRestClassifier(SVC(probability=True)))
ovr_pipeline.fit(balanced_data, target)

In [None]:
pd.DataFrame(
    ovr_pipeline.predict_proba(balanced_data), columns=ovr_pipeline.classes_
).head()

### Predicting with OneVsRestClassifier

In [None]:
output = pd.DataFrame(
    ovr_pipeline.predict_proba(orig_test_data), columns=ovr_pipeline.classes_
)

In [None]:
output.to_csv("data/Tabular Playground Series - May 2021/ovr_output_submission.csv")

### Split Data into Train and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    balanced_data, target, test_size=0.2, random_state=42
)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

### DecisionTreeClassifier

In [None]:
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_predict,
    cross_val_score,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# 10-fold cross-validation with a fixed shuffle, so every parameter
# combination is scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Hyper-parameter values to try; every combination is evaluated.
param_grid = {
    "max_features": [10, 20, 40],
    "max_depth": [5, 10, 20],
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [5, 10, 20],
}

dt_clf = GridSearchCV(
    # The tree is seeded because `max_features` makes it draw random feature
    # subsets at each split; without a seed the scores change from run to run.
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
    refit=True,
)

dt_clf.fit(x_train, y_train)

# Best score
print(dt_clf.best_score_)

# Best Estimator
print(dt_clf.best_estimator_)

### RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_predict,
    cross_val_score,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# 10-fold cross-validation with a fixed shuffle, so every parameter
# combination is scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Hyper-parameter values to try; every combination is evaluated.
param_grid = {
    "max_features": [10, 20, 40],
    "max_depth": [5, 10, 20],
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [5, 10, 20],
}

rnd_clf = GridSearchCV(
    # The forest is seeded so that its bootstrap samples and feature subsets -
    # and therefore the reported scores - are the same on every run.
    RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
)

rnd_clf.fit(x_train, y_train)

# Best score
print(rnd_clf.best_score_)

# Best Estimator
print(rnd_clf.best_estimator_)

### ANN

In [None]:
from keras.layers import BatchNormalization, Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import (
    KFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
def encode_target(data):
    """One-hot encode a one-dimensional collection of class labels.

    The distinct labels are sorted, and each input entry becomes a row with a
    single 1.0 in the column of its label. This produces the same array as
    label-encoding followed by dense one-hot encoding, but relies only on
    NumPy, so it does not depend on scikit-learn's `OneHotEncoder` keyword
    arguments, which differ between releases.

    Parameters
    ----------
    data : array-like of labels (list, NumPy array, pandas Series, ...)

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(data), number of distinct labels).
        The array is also printed.
    """
    labels = np.asarray(data).ravel()
    # `classes` holds the sorted distinct labels; `codes` gives, for every
    # entry of `labels`, the position of its label within `classes`.
    classes, codes = np.unique(labels, return_inverse=True)
    # Row k of the identity matrix is the one-hot vector for class k.
    ohe_target = np.eye(len(classes))[codes.ravel()]
    print(ohe_target)
    return ohe_target

#### Label-Encode and One-Hot-Encode the Target Classes

In [None]:
# One-hot encode the full label series; encode_target also prints the result.
ohe_target = encode_target(target)

In [None]:
def ann_model():
    """Build and compile the feed-forward classifier for the full feature set.

    Layer stack: Dense (one unit per column of `balanced_data`, ReLU) ->
    Dropout -> BatchNormalization -> Dense(164, ReLU) -> Dropout ->
    BatchNormalization -> Dense(4, softmax).

    Returns
    -------
    The compiled Sequential model. Its summary is printed on every call.
    """
    model = Sequential()
    model.add(
        Dense(
            balanced_data.shape[1], activation="relu", input_dim=balanced_data.shape[1]
        )
    )
    # Dropout takes the *fraction* of units to drop, which must be below 1.
    # The previous value of 5 is not a valid rate: depending on the Keras
    # version it is rejected or clamped to 1, which drops every activation.
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(164, activation="relu"))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    # Four output units with softmax, trained against one-hot targets.
    model.add(Dense(4, activation="softmax"))

    model.compile(
        loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    model.summary()
    return model

In [None]:
ann_model()

In [None]:
balanced_data.shape, ohe_target.shape

In [None]:
estimator = KerasClassifier(build_fn=ann_model, epochs=10, batch_size=5, verbose=1)
kfold = KFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, balanced_data, ohe_target, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

### Calculate Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

importance = mutual_info_classif(x_train, y_train)
feature_importance = pd.DataFrame(importance, x_train.columns[0 : len(x_train.columns)])

In [None]:
feature_importance[feature_importance.values > 0.000000]

In [None]:
from sklearn.feature_selection import SelectKBest

selectkbest = SelectKBest(mutual_info_classif, k=10).fit(x_train, y_train)
x_train.columns[selectkbest.get_support()]

In [None]:
x_train_ig = selectkbest.transform(x_train)
x_test_ig = selectkbest.transform(x_test)

x_train_ig.shape, x_test_ig.shape

In [None]:
y_train_ig = encode_target(y_train)

In [None]:
y_train_ig.shape

### Retrain ANN with New Data

In [None]:
def ann_model():
    """Build and compile the feed-forward classifier for the selected features.

    Same layer stack as the earlier `ann_model` in this notebook (this
    definition replaces it), but the input width and first layer size come
    from `x_train_ig`.

    Returns
    -------
    The compiled Sequential model. Its summary is printed on every call.
    """
    model = Sequential()
    model.add(
        Dense(x_train_ig.shape[1], activation="relu", input_dim=x_train_ig.shape[1])
    )
    # Dropout takes the *fraction* of units to drop, which must be below 1.
    # The previous value of 5 is not a valid rate: depending on the Keras
    # version it is rejected or clamped to 1, which drops every activation.
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(164, activation="relu"))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    # Four output units with softmax, trained against one-hot targets.
    model.add(Dense(4, activation="softmax"))

    model.compile(
        loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    model.summary()
    return model

In [None]:
ann_model()

In [None]:
estimator = KerasClassifier(build_fn=ann_model, epochs=20, batch_size=1, verbose=1)
kfold = KFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, x_train_ig, y_train_ig, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

In [None]:
# Recorded output of the cross-validation cell above: Baseline: 26.82% (0.88%)