# Introduction
In this notebook, I focus on finding the best machine learning (ML) model for predicting `age_group` on the Cascade Cup age dataset.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from pandas import DataFrame
from sklearn.svm import SVC
# Set seed for reproducibility.
# Passed as random_state to train_test_split and to the scikit-learn
# classifiers instantiated below, so repeated runs give the same split/models.
SEED = 123

### Creating a test set and a training set

Since this data set is not ordered, we will do a simple random split to create a training data set and a test data set. (The code below uses `test_size=0.1`, i.e. a 90:10 split.)

In [None]:
# Load the labelled training table; its 'age_group' column is split off as the target below
data = pd.read_csv('/kaggle/input/cascadecup/train_age_dataset.csv')

In [None]:
# Preview the first rows to sanity-check the columns that were loaded
data.head()

In [None]:
# Target vector: the age-group label of every row
Y = data['age_group']
# Feature matrix: every remaining column.
# `columns=` is used instead of the positional axis argument (`drop(label, 1)`),
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
X = data.drop(columns='age_group')

In [None]:
# Split dataset into 90% train, 10% test (test_size=0.1); SEED makes the shuffle repeatable
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.1, random_state=SEED)

# Feature Scaling

Most of the time, a dataset contains features that vary widely in magnitude, units and range.
Since many machine learning algorithms use the Euclidean distance between data points in their computations,
we need to bring all features to the same level of magnitude. This can be achieved by scaling:
the data is transformed so that it fits within a specific scale, such as 0–100 or 0–1.

### Normalize the data

In [None]:
# Learn the per-feature min/max from the training split only
norm = MinMaxScaler()
norm.fit(X_train)

# Apply that same fitted scaling to the training and the testing data
X_train_norm, X_test_norm = (norm.transform(split) for split in (X_train, X_test))

In [None]:
# Summary statistics of the min-max scaled training features
# (the scaler output is wrapped in a DataFrame so that describe() is available)
DataFrame(X_train_norm).describe()

From above, we can see that after normalizing the data, every column has a minimum of 0 and a maximum of 1.

### Standardize the data

In [None]:
# Learn the per-feature mean/std from the training split only
stdscale = StandardScaler()
stdscale.fit(X_train)

# Apply that same fitted scaling to the training and the testing data
X_train_std, X_test_std = (stdscale.transform(split) for split in (X_train, X_test))

In [None]:
# Summary statistics of the standardized training features
# (the scaler output is wrapped in a DataFrame so that describe() is available)
DataFrame(X_train_std).describe()

From above, we can see that after standardizing the data, every column has a standard deviation of approximately 1.

# Model Selection

In [None]:
# Base learners to compare; every estimator that accepts random_state is seeded with SEED
lr = LogisticRegression(max_iter=500, n_jobs=-1, random_state=SEED)
knn = KNN()
svc = SVC(kernel='rbf', probability=True, random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)
dt = DecisionTreeClassifier(random_state=SEED)

# (display name, estimator) tuples, in the order in which they are evaluated
classifiers = list(zip(
    ('Logistic Regression',
     'K Nearest Neighbours',
     'SVM',
     'Random Forest Classifier',
     'Decision Tree'),
    (lr, knn, svc, rf, dt),
))

### Models prediction without any normalization or standardization

In [None]:
# Refit every classifier on the raw (unscaled) features and report test accuracy
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print('{:s} : {:.3f}'.format(clf_name, score))

### Models prediction with Normalized data

In [None]:
# Refit every classifier on the min-max normalized features and report test accuracy
for clf_name, clf in classifiers:
    clf.fit(X_train_norm, y_train)
    y_pred = clf.predict(X_test_norm)
    score = accuracy_score(y_test, y_pred)
    print('{:s} : {:.3f}'.format(clf_name, score))

### Models prediction with Standardized data

In [None]:
# Refit every classifier on the standardized features and report test accuracy
for clf_name, clf in classifiers:
    clf.fit(X_train_std, y_train)
    y_pred = clf.predict(X_test_std)
    score = accuracy_score(y_test, y_pred)
    print('{:s} : {:.3f}'.format(clf_name, score))

In [None]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, LSTM, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Map each distinct training label to an integer index
encoder = LabelEncoder().fit(y_train)
encoded_Y = encoder.transform(y_train)
# Expand the integer indices into one-hot rows (the target format used to fit the network)
dummy_y = np_utils.to_categorical(encoded_Y)

In [None]:
# Encode the test labels with the encoder already fitted on y_train, so that
# train and test share one label -> integer mapping (re-fitting on y_test could
# produce a different mapping if a class were missing from the test split).
encoded_Y_test = encoder.transform(y_test)
# One-hot encode the *encoded* indices. The original passed the raw y_test to
# to_categorical, which ignores the encoding and yields a different column
# layout than dummy_y whenever the raw labels are not already 0..n_classes-1.
# num_classes pins the width to the number of classes seen in training.
dummy_y1 = np_utils.to_categorical(encoded_Y_test, num_classes=len(encoder.classes_))

In [None]:
# Per-sample input shape for the network: one entry per feature column of X
n_features = X.shape[1]
input_shape = [n_features]

In [None]:
# Stop training once val_loss has not improved for 10 consecutive epochs
earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
# Save to 'best' only when val_accuracy reaches a new maximum
mcp_save = ModelCheckpoint('best', save_best_only=True, monitor='val_accuracy', mode='max')
# Multiply the learning rate by 0.1 after 7 epochs without a val_loss improvement
# larger than 1e-4. `min_delta` is the current name of this threshold; the old
# `epsilon` keyword is deprecated in Keras.
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min')

In [None]:
# Small fully connected network for multi-class classification
model = Sequential()
# Normalise the raw input features inside the network
model.add(BatchNormalization(input_shape=input_shape))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
# 4 softmax units: one per one-hot target column (assumes 4 classes — TODO confirm)
model.add(Dense(4, activation='softmax'))
# Compile model.
# Callbacks are a fit()-time argument, not a compile() option, so none are
# passed here; an unknown `callbacks` keyword is rejected by current Keras.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the one-hot targets, holding out 10% of the training rows for validation.
# Callbacks only take effect when handed to fit(): early stopping, best-model
# checkpointing and learning-rate reduction all monitor the validation metrics.
model.fit(X_train, dummy_y,
          validation_split=0.1,
          batch_size=64,
          epochs=30,
          callbacks=[earlyStopping, mcp_save, reduce_lr_loss])

In [None]:
# Sequential.predict_classes was removed from recent Keras/TensorFlow releases;
# argmax over the softmax outputs gives the same 0-based class index and works
# on old and new versions alike.
y_pred = np.argmax(model.predict(X_test), axis=-1)
# Shift to the label values used in y_test — assumes labels are 1..4; TODO confirm
y_pred = y_pred + 1
# print accuracy
print("Accuracy: ", accuracy_score(y_test, y_pred))

# print precision, recall, F1-score per each class/tag
print(classification_report(y_test, y_pred))

# print confusion matrix, check documentation for sorting rows/columns
print(confusion_matrix(y_test, y_pred))

In [None]:
# Baseline: XGBoost classifier with library-default hyperparameters.
# Note: this rebinds `model`, which previously held the Keras network.
model = XGBClassifier()

In [None]:
# Datasets monitored while boosting: entry 0 is the training split, entry 1 the held-out split
eval_set = [(X_train, y_train), (X_test, y_test)]

In [None]:
# Train while logging multiclass error and log-loss on every eval_set entry
model.fit(
    X_train,
    y_train,
    eval_metric=["merror", "mlogloss"],
    eval_set=eval_set,
    verbose=True,
)

# Score the held-out split; predictions are rounded to whole numbers before comparison
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Slow-learning, heavily regularised configuration: many shallow trees.
# `verbosity=1` replaces the former `silent=False`: the `silent` parameter was
# removed from XGBoost in favour of `verbosity`, and passing it now only
# triggers an "unused parameter" warning.
model = XGBClassifier(verbosity=1,
                      scale_pos_weight=1,
                      learning_rate=0.01,
                      colsample_bytree=0.4,
                      subsample=0.8,
                      n_estimators=1000,
                      reg_alpha=0.3,
                      max_depth=4,
                      gamma=10)

In [None]:
# Fit on the training split, logging multiclass error and log-loss for each eval_set entry
model.fit(X_train,y_train, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True)

In [None]:
# Score the held-out split; predictions are rounded to whole numbers before comparison
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Faster configuration: higher learning rate, fewer trees, aggressive subsampling
model = XGBClassifier(
    learning_rate=0.1,
    colsample_bytree=0.4,
    subsample=0.3,
    n_estimators=200,
    max_depth=5,
)

In [None]:
# Fit on the training split, logging multiclass error and log-loss for each eval_set entry
model.fit(X_train,y_train, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True)

In [None]:
# Score the held-out split; predictions are rounded to whole numbers before comparison
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Starting point for the manual hyperparameter sweeps
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [None]:
# Train while logging multiclass error and log-loss on every eval_set entry
model.fit(
    X_train,
    y_train,
    eval_metric=["merror", "mlogloss"],
    eval_set=eval_set,
    verbose=True,
)

# Score the held-out split; predictions are rounded to whole numbers before comparison
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Sweep max_depth over 3, 5, 7, 9 with every other hyperparameter held fixed
for n in range(3, 10, 2):
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=n,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    )
    model.fit(X_train, y_train, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True)
    # Accuracy on the held-out split for this depth
    y_pred = model.predict(X_test)
    predictions = list(map(round, y_pred))
    accuracy = accuracy_score(y_test, predictions)
    print("n", n, "Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Sweep min_child_weight over 1, 3, 5 with max_depth fixed at 9
for n in range(1, 6, 2):
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=9,
        min_child_weight=n,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    )
    model.fit(X_train, y_train, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True)
    # Accuracy on the held-out split for this min_child_weight
    y_pred = model.predict(X_test)
    predictions = list(map(round, y_pred))
    accuracy = accuracy_score(y_test, predictions)
    print("n", n, "Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Sweep n_estimators over 100, 150, ..., 350 with max_depth=9 and min_child_weight=5
for n in range(100, 400, 50):
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=n,
        max_depth=9,
        min_child_weight=5,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    )
    model.fit(X_train, y_train, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True)
    # Accuracy on the held-out split for this number of trees
    y_pred = model.predict(X_test)
    predictions = list(map(round, y_pred))
    accuracy = accuracy_score(y_test, predictions)
    print("n", n, "Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Larger ensemble with a slightly lower learning rate and light L1 regularisation
model = XGBClassifier(
    # boosting schedule
    learning_rate=0.08,
    n_estimators=1500,
    # tree complexity
    max_depth=9,
    min_child_weight=6,
    gamma=0,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularisation
    reg_alpha=0.005,
    # runtime and seeding
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [None]:
# Train while logging multiclass error and log-loss on every eval_set entry
model.fit(
    X_train,
    y_train,
    eval_metric=["merror", "mlogloss"],
    eval_set=eval_set,
    verbose=True,
)

# Score the held-out split; predictions are rounded to whole numbers before comparison
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Refit the same configuration on ALL labelled rows (X, Y)
model.fit(X, Y, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True)
# NOTE: X_test was carved out of X by train_test_split, so the model has now
# been trained on these very rows. The figure below is therefore NOT a
# held-out estimate; it is labelled accordingly so that it is not mistaken
# for one when comparing against the other cells.
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy (optimistic, X_test was part of the training data): %.2f%%" % (accuracy * 100.0))

In [None]:
# Deeper trees with a lower learning rate and a higher min_child_weight
model = XGBClassifier(
    # boosting schedule
    learning_rate=0.06,
    n_estimators=1500,
    # tree complexity
    max_depth=10,
    min_child_weight=8,
    gamma=0,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularisation
    reg_alpha=0.005,
    # runtime and seeding
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [None]:
# Train while logging multiclass error and log-loss on every eval_set entry
model.fit(
    X_train,
    y_train,
    eval_metric=["merror", "mlogloss"],
    eval_set=eval_set,
    verbose=True,
)

# Score the held-out split; predictions are rounded to whole numbers before comparison
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Refit the same configuration on ALL labelled rows (X, Y)
model.fit(X, Y, eval_metric=["merror", "mlogloss"], eval_set=eval_set, verbose=True)
# NOTE: X_test was carved out of X by train_test_split, so the model has now
# been trained on these very rows. The figure below is therefore NOT a
# held-out estimate; it is labelled accordingly so that it is not mistaken
# for one when comparing against the other cells.
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy (optimistic, X_test was part of the training data): %.2f%%" % (accuracy * 100.0))

In [None]:
# Configuration used for the submission: more trees, lower learning rate
model = XGBClassifier(
    # boosting schedule
    learning_rate=0.05,
    n_estimators=2000,
    # tree complexity
    max_depth=10,
    min_child_weight=8,
    gamma=0,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularisation
    reg_alpha=0.01,
    # runtime and seeding
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [None]:
# Train while logging multiclass error and log-loss on every eval_set entry
model.fit(
    X_train,
    y_train,
    eval_metric=["merror", "mlogloss"],
    eval_set=eval_set,
    verbose=True,
)

# Score the held-out split; predictions are rounded to whole numbers before comparison
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Load the unlabelled competition test features
# (presumably the same feature columns as X — verify against the data files)
d = pd.read_csv('/kaggle/input/cascadecup/test_age_dataset.csv')
# Predict with the most recently fitted `model`
y_pred = model.predict(d)
# Start from the sample submission so the id column and row order are kept
submission = pd.read_csv('/kaggle/input/cascadecup/sample_submission.csv')
# Bracket assignment always writes a column. Attribute assignment
# (`submission.prediction = ...`) only works if the column already exists and
# otherwise just sets a Python attribute, leaving the CSV without predictions.
submission['prediction'] = y_pred
submission.to_csv('submission.csv', index=False)