In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as s
from sklearn import model_selection
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier
import pickle

In [None]:
# Read the raw dataset from the CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Remove the 'Unnamed: 32' column (presumably an empty artefact column of the
# CSV export - TODO confirm). errors='ignore' keeps this cell working on copies
# of the file that do not contain the column instead of raising KeyError.
df = data.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Store the diagnosis labels using pandas' categorical dtype.
df['diagnosis'] = df['diagnosis'].astype('category')

In [None]:
# Target: the diagnosis label column.
Y = df['diagnosis']

# Features: every remaining column except the target and the 'id' column.
X = df.drop(columns=['diagnosis', 'id'])

In [None]:
# Mean-normalise every feature column: subtract the column mean, then divide
# by the column range (max - min). The diagnosis labels are kept separately in Y.
df_norm = X.sub(X.mean()).div(X.max() - X.min())

In [None]:
# Display the column labels of the normalised feature frame.
df_norm.columns

In [None]:
# Working aliases for the model inputs: normalised features and raw labels.
X_norm, Y_norm = df_norm, Y

# Fit a label encoder on the diagnosis labels. The fit call stays the last
# expression so the fitted encoder is still shown as the cell's output.
le = LabelEncoder()
le.fit(Y_norm)

In [None]:
# Encode the diagnosis labels as integers with the fitted encoder.
# Transform the original labels in Y (which Y_norm aliased when the encoder
# was fitted) rather than Y_norm itself: the result is identical on the first
# run, and re-running this cell no longer feeds already-encoded values back
# into the encoder.
Y_norm = le.transform(Y)

In [None]:
# Wrap the encoded labels in a single-column DataFrame and preview its last rows.
Y_norm = pd.DataFrame(data=Y_norm)
Y_norm.tail(n=5)

In [None]:
def FitModel(X, Y, algo_name, algorithm, gridSearchParams, cv):
    """Grid-search ``algorithm``, evaluate it on a hold-out split and pickle it.

    The data is split 80/20 into a training and a test part. A
    ``GridSearchCV`` (scored by accuracy, all CPU cores) is fitted on the
    training part and then used to predict the test part. The fitted search
    object is pickled to a file named ``algo_name`` and the best parameters,
    classification report, accuracy score and confusion matrix are printed.

    Parameters
    ----------
    X : array-like
        Feature matrix (DataFrame or array); converted with ``np.asarray``.
    Y : array-like
        Target labels (Series, single-column DataFrame or array); flattened
        to one dimension before use.
    algo_name : str
        Path of the pickle file the fitted search object is written to.
    algorithm : estimator
        Unfitted estimator handed to ``GridSearchCV`` as ``estimator``.
    gridSearchParams : dict
        Handed to ``GridSearchCV`` as ``param_grid``.
    cv : int or cross-validation splitter
        Handed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    None
        Results are printed and the fitted search is written to disk.
    """
    # Seed NumPy's global RNG so the random train/test split below is the
    # same on every call.
    np.random.seed(10)

    # Accept pandas objects as well as plain arrays, and flatten the target:
    # a single-column DataFrame would otherwise reach the estimators as a
    # 2-D column vector instead of the 1-D label array they expect.
    features = np.asarray(X)
    target = np.ravel(Y)
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=.2)

    grid = GridSearchCV(
        estimator=algorithm,
        param_grid=gridSearchParams,
        cv=cv, scoring='accuracy', verbose=1, n_jobs=-1
    )

    grid_result = grid.fit(x_train, y_train)
    best_params = grid_result.best_params_
    pred = grid_result.predict(x_test)
    cm = confusion_matrix(y_test, pred)

    # Use a context manager so the file handle is flushed and closed even if
    # pickling fails part-way.
    with open(algo_name, 'wb') as model_file:
        pickle.dump(grid_result, model_file)

    print('Best params:', best_params)
    # Start the report on its own line; printed after the label on the same
    # line, its header row is shifted and the table columns no longer line up.
    print('Classification Report:', classification_report(y_test, pred), sep='\n')
    print('Accuracy Score:' + str(accuracy_score(y_test, pred)))
    print('Confusion Matrix: \n', cm)

In [None]:
# Hyper-parameter grid for the SVC search: candidate C and gamma values.
param = {
    'C': [0.1, 1, 100, 1000],
    'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5],
}

# Tune and evaluate an SVC on the normalised features with 5-fold CV.
FitModel(
    X=X_norm,
    Y=Y_norm,
    algo_name='SVC',
    algorithm=SVC(),
    gridSearchParams=param,
    cv=5,
)

In [None]:
# Grid over the number of trees only.
param = {
    'n_estimators': [100, 500, 1000, 2000],
}

# Give the forest an explicit random_state so the grid search is reproducible.
# FitModel seeds NumPy's global RNG, but the search runs with n_jobs=-1 and
# the candidate fits presumably execute in worker processes that do not share
# that seed - an estimator-level seed does not depend on it.
# NOTE(review): this call uses the raw X / Y (unscaled features, categorical
# labels) while the other searches use X_norm / Y_norm - confirm that this is
# intended.
FitModel(X, Y, 'Random Forest', RandomForestClassifier(random_state=10), param, cv=10)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Resample the normalised features and encoded labels with SMOTE (fixed seed).
# NOTE(review): the resampling is applied to the full dataset, before
# FitModel makes its train/test split, so resampled rows can end up in the
# evaluation part - consider resampling the training part only.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(df_norm, Y_norm)
X_res, Y_res = resampled[0], resampled[1]

# Show which container types the sampler returned.
print(*map(type, (X_res, Y_res)))

In [None]:
# Grid over the number of boosting rounds only.
param = {'n_estimators': [100, 500, 1000, 2000]}

# Tune and evaluate XGBoost on the SMOTE-resampled data with 10-fold CV.
FitModel(
    X=X_res,
    Y=Y_res,
    algo_name='XGBoost',
    algorithm=XGBClassifier(),
    gridSearchParams=param,
    cv=10,
)

In [3]:
# Load the pickled, fitted search object from disk. The context manager
# closes the file handle as soon as unpickling finishes.
# NOTE(review): FitModel writes its pickle to the path given by algo_name
# ('XGBoost' in the working directory) while this cell reads 'demo/XGBoost' -
# presumably the file was moved by hand; confirm the path.
# Unpickling executes code embedded in the file, so only load model files
# produced by a trusted run of this notebook.
with open('demo/XGBoost', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [6]:
# Smoke-test the loaded model on two identical rows of 30 features, all set to 1.
sample_rows = np.ones((2, 30), dtype=int)
pred = loaded_model.predict(sample_rows)

In [7]:
# Display the class labels predicted for the two sample rows.
pred

array([1, 1])