In [None]:
%load_ext nb_black

In [None]:
# d = {"a": 1, "b": 2, "c": 5}

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go

%matplotlib inline

import seaborn as sns

from sklearn import preprocessing
from sklearn import model_selection
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# from sklearn.grid_search import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

###
from catboost import CatBoostClassifier
from lightgbm import LGBMRegressor, LGBMClassifier, Booster
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from datetime import datetime
import warnings

warnings.simplefilter("ignore")

In [None]:
# Global experiment configuration shared by the cells below.
RANDOM_SEED = 2021  # seed handed to the train/test split and to the seeded estimators
PROBAS = True  # NOTE(review): not referenced in the visible cells — confirm it is still needed
FOLDS = 5  # number of cross-validation folds used by the training loop (cv=FOLDS)
N_ESTIMATORS = 1000  # NOTE(review): not referenced in the visible cells — confirm it is still needed

In [None]:
# Read the dataset from the notebook's working directory and preview its first rows.
drug_data = pd.read_csv("drug200.csv")
drug_data.head()

In [None]:
# Separate the target column ("Drug") from the feature columns.
y = drug_data["Drug"]

# The estimators trained further down are fitted directly on X / X_train and
# need numeric input. One-hot encode any object/category columns here, before
# splitting, so train and test share the same columns. get_dummies leaves
# numeric columns untouched, so this is a no-op for an all-numeric frame.
# NOTE(review): presumably drug200.csv contains string-valued columns — confirm.
X = pd.get_dummies(drug_data.drop(columns=["Drug"]), dtype=float)

print(f"X:{X.shape} y: {y.shape} \n")

# Hold out 30% of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_SEED
)
print(f"X_train:{X_train.shape} y_train: {y_train.shape}")
print(f"X_test:{X_test.shape} y_test: {y_test.shape}")

In [None]:
# test = drug_data[len(drug_data):].drop(["Drug"], axis = 1)
# print (f'test:{test.shape}')

In [None]:
# Display the raw frame for visual inspection.
drug_data

In [None]:
# Display the training features produced by the split.
X_train

In [None]:
# LightGBM hyper-parameters (binary objective, GPU device).
# NOTE(review): no visible cell passes this dict to an estimator — confirm it is used.
lgb_params = dict(
    metric="binary_logloss",
    n_estimators=10000,
    objective="binary",
    learning_rate=0.02,
    min_child_samples=150,
    reg_alpha=3e-5,
    reg_lambda=9e-2,
    num_leaves=20,
    max_depth=16,
    colsample_bytree=0.8,
    subsample=0.8,
    subsample_freq=2,
    max_bin=240,
    device="gpu",
)

# CatBoost hyper-parameters (GPU task type, iteration-based overfitting detector).
# NOTE(review): no visible cell passes this dict to an estimator — confirm it is used.
cb_params = dict(
    max_depth=6,
    max_ctr_complexity=5,
    num_trees=50000,
    od_wait=500,
    od_type="Iter",
    learning_rate=0.04,
    min_data_in_leaf=3,
    task_type="GPU",
)


# Keyword arguments unpacked into the RandomForestClassifier built below.
rf_params = dict(max_depth=15, min_samples_leaf=8, random_state=RANDOM_SEED)

In [None]:
# Candidate models. Every estimator with a stochastic component is seeded with
# RANDOM_SEED so that scores are comparable between runs.
cl1 = KNeighborsClassifier(n_neighbors=1)
cl2 = RandomForestClassifier(**rf_params)  # rf_params already carries random_state
cl3 = GaussianNB()
cl4 = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_SEED)

# NOTE(review): task_type="GPU" / device="gpu" presumably need GPU-enabled
# builds of CatBoost and LightGBM — confirm the target environment provides them.
cl5 = CatBoostClassifier(
    task_type="GPU",
    verbose=None,
    logging_level="Silent",
    random_seed=RANDOM_SEED,
)
cl6 = LGBMClassifier(device="gpu", random_state=RANDOM_SEED)

# I used some hyperparameter search (ExtraTrees - Genetic search)
cl7 = ExtraTreesClassifier(
    bootstrap=False,
    criterion="entropy",
    max_features=0.55,
    min_samples_leaf=8,
    min_samples_split=4,
    n_estimators=100,
    random_state=RANDOM_SEED,
)  # Optimized using TPOT
cl8 = MLPClassifier(
    activation="relu",
    alpha=0.1,
    hidden_layer_sizes=(10, 10, 10),
    learning_rate="constant",
    max_iter=2000,
    random_state=RANDOM_SEED,
)

In [None]:
# Display name -> estimator for the models that take part in the training loop.
classifiers = dict(
    zip(
        ("RandomForest", "DecisionTree", "CatBoost", "LGBM", "ExtraTrees"),
        (cl2, cl4, cl5, cl6, cl7),
    )
)

In [None]:
# Logistic regression with default settings.
# NOTE(review): not referenced by the visible cells — presumably used further down; confirm.
mlr = LogisticRegression()

In [None]:
# Accumulators filled by the training loop: per-model CV score arrays and model names.
models_scores_results, models_names = [], []


In [None]:
# Cross-validate every candidate on the training split, then fit it on the
# full training split so it is ready for prediction.
print(">>>> Training started <<<<")

# Empty the accumulators first so that re-running this cell does not stack
# duplicate entries on top of the results of a previous run.
models_scores_results.clear()
models_names.clear()

for key, classifier in classifiers.items():
    scores = model_selection.cross_val_score(
        classifier, X_train, y_train, cv=FOLDS, scoring="accuracy"
    )
    models_scores_results.append(scores)
    models_names.append(key)
    print("[%s] - accuracy: %0.5f " % (key, scores.mean()))

    # fit() trains the instance held in `classifiers` in place, so the
    # dictionary already references the fitted model afterwards.
    classifier.fit(X_train, y_train)

In [None]:
import nltk
import sklearn

# Report the installed versions of the two libraries, one line each.
version_reports = (
    ('The nltk version is {}.', nltk),
    ('The scikit-learn version is {}.', sklearn),
)
for template, module in version_reports:
    print(template.format(module.__version__))


In [None]:
# Display the full feature frame used by the cross-validation cells below.
X

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier

# Baseline estimators (reused by the voting cell below). The two forest models
# are seeded so their plain cross-validation scores are reproducible.
rf = RandomForestClassifier(random_state=RANDOM_SEED)
et = ExtraTreesClassifier(random_state=RANDOM_SEED)
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()
clf_array = [rf, et, knn, svc, rg]

# An integer max_features asks BaggingClassifier for that many columns and is
# rejected when it exceeds the number of available features, so cap it at the
# width of X. Frames with 10 or more columns still use 10.
bagging_max_features = min(10, X.shape[1])

# Compare each estimator's 10-fold score with its bagged counterpart.
for clf in clf_array:
    vanilla_scores = cross_val_score(clf, X, y, cv=10, n_jobs=-1)
    bagging_clf = BaggingClassifier(
        clf, max_samples=0.4, max_features=bagging_max_features, random_state=42
    )
    bagging_scores = cross_val_score(bagging_clf, X, y, cv=10, n_jobs=-1)

    print(
        "Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]".format(
            clf.__class__.__name__, vanilla_scores.mean(), vanilla_scores.std()
        )
    )
    print(
        "Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(
            clf.__class__.__name__, bagging_scores.mean(), bagging_scores.std()
        )
    )

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the five base estimators.
eclf = VotingClassifier(
    estimators=[
        ("Random Forests", rf),
        ("Extra Trees", et),
        ("KNeighbors", knn),
        ("SVC", svc),
        ("Ridge Classifier", rg),
    ],
    voting="hard",
)

# 10-fold accuracy for each base estimator and for the ensemble.
# The loop variable is bound on the first iteration, so no prior `clf`
# assignment is needed; after the loop `clf` refers to `eclf`.
for clf, label in zip(
    [rf, et, knn, svc, rg, eclf],
    [
        "Random Forest",
        "Extra Trees",
        "KNeighbors",
        "SVC",
        "Ridge Classifier",
        "Ensemble",
    ],
):
    scores = cross_val_score(clf, X, y, cv=10, scoring="accuracy")
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

## Ensemble Learning

### Max Voting / Voting Classifier

In [None]:
# Ensemble of Models
estimator = []
estimator.append(
    ("LR", LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=200))
)
estimator.append(("SVC", SVC(gamma="auto", probability=True)))
estimator.append(("DTC", DecisionTreeClassifier()))

In [None]:
# Voting Classifier with hard voting
hard_voting = VotingClassifier(estimators=estimator, voting="hard")
hard_voting.fit(X_train, y_train)
y_pred = hard_voting.predict(X_test)