In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let me clone my custom ml-framework

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Import Sklearn libraries
from sklearn import preprocessing, model_selection, feature_selection

# Seed NumPy's global RNG so the notebook's output is stable across runs.
SEED = 2210
np.random.seed(SEED)

# Display and plotting defaults.
pd.options.display.max_colwidth = 100
sns.set_theme(palette="Set3", style="whitegrid")

# Set Matplotlib defaults
%matplotlib inline

## Load and explore the data

In [None]:
# Path of the dataset CSV under the Kaggle input directory; also read again by get_data().
input_file = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(input_file)
# Preview the first five rows.
df.head(5)

In [None]:
# Column dtypes and non-null counts, to spot empty or non-numeric columns.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Per-column minimum, mean and maximum in one frame:
# one row per described column, one column per statistic.
df_min_max = df.describe().loc[["min", "mean", "max"]].T

# Discard the row that belongs to the last column of `df`.
df_min_max = df_min_max.drop(df.columns[-1])

color = "#b753e6"

# Go back to stock matplotlib/seaborn styling, then enlarge lines and axis labels.
sns.reset_defaults()
sns.reset_orig()
sns.set_context(rc={"lines.linewidth": 5, "axes.labelsize": 15})

# One histogram per statistic, side by side.
fig, axs = plt.subplots(ncols=3, figsize=(15, 5))
fig.suptitle("Min-Max distribution")
for ax, col in zip(axs, df_min_max.columns.to_list()):
    sns.histplot(df_min_max[col], bins=5, ax=ax, color=color, alpha=0.8)
plt.show()

In [None]:
# Histogram of every numeric column; binding the result to `_` keeps the axes array out of the cell output.
_ = df.hist(figsize=(20, 15), color="#b753e6", alpha=0.8)

In [None]:
# Number of rows per diagnosis value (class balance).
df['diagnosis'].value_counts()

## Observations
1. All columns have values present and are of type float, except `Unnamed: 32`
2. The `diagnosis` class is the one we have to predict
3. All numerical values are positive, so we can apply a Box-Cox transformer to unskew the data
4. The `Unnamed: 32` column is not required and can be dropped
5. The `id` column can be dropped

In [None]:
# Remove any previous checkout first so the clone below does not fail when this cell is re-run.
! rm -rf ml_framework
# NOTE(review): no branch, tag or commit is specified, so this fetches whatever the
# repository's default branch currently points at — consider pinning for reproducibility.
! git clone https://github.com/maindolaamit/ml_framework.git

In [None]:
import sys
# Location of the cloned framework (presumably the checkout made by the clone cell,
# assuming the notebook runs from /kaggle/working); `mllib` is imported from it below.
lib_dir = os.path.join('/kaggle/working', 'ml_framework')

# Guard the append so that re-running this cell does not add duplicate sys.path entries.
if lib_dir not in sys.path:
    sys.path.append(lib_dir)

from mllib import helper, charts, ml, metrics, features
# from ml_framework import mllib

In [None]:
def get_data():
    """Load the CSV and return the modelling frame plus the fitted label encoder.

    Reads the file at the notebook-level ``input_file`` path, removes the ``id``
    and ``Unnamed: 32`` columns, and replaces the ``diagnosis`` labels with the
    integer codes produced by a ``LabelEncoder``.

    Returns:
        tuple: ``(df, le)`` — the prepared DataFrame and the ``LabelEncoder``
        fitted on the ``diagnosis`` column.
    """
    from sklearn.preprocessing import LabelEncoder

    data = pd.read_csv(input_file).drop(columns=["id", "Unnamed: 32"])

    encoder = LabelEncoder()
    data["diagnosis"] = encoder.fit_transform(data["diagnosis"])

    return data, encoder

# NOTE: this rebinds `df` to the cleaned, label-encoded frame, so the exploration
# cells above would see different columns if re-run after this point.
df, le = get_data()
# Feature frame (everything except the target) and target vector as a NumPy array.
X, y = df.drop("diagnosis", axis=1), df["diagnosis"].values
columns = X.columns.to_list()

# Sanity check of the resulting shapes.
X.shape, y.shape

### Try various transformations and view the performance of classifiers

In [None]:
# Presumably the names of the encoders that the project's NumericalFeatures helper supports.
encoding_list = features.NumericalFeatures.get_encoders_list()
# 5-fold stratified cross-validation without shuffling; reused by the evaluation and tuning cells below.
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=False)
# Stratified 80/20 train/test split.
# NOTE(review): no random_state is passed here, so the split presumably relies on the
# np.random.seed call in the setup cell (and on cell execution order) — confirm.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, stratify=y)

def eval_clf_with_transformations(X, y, columns, encoding_list):
    """Run the classifier benchmark once per encoding and stack the score tables.

    For each name in ``encoding_list`` a ``features.NumericalFeatures`` transformer
    is built for ``columns`` of ``X`` and fitted, then ``ml.evaluate_classifiers``
    is called. The validation and test score frames it returns are tagged with an
    ``encoder`` column and collected.

    NOTE(review): the classifiers are evaluated on the notebook-global
    ``X_train``/``y_train``/``X_test``/``y_test`` split (and the global ``cv``),
    not on the output of ``fit_transform``. As written, every encoding is scored
    on the same split, whatever the transformer produced. That behaviour is kept
    unchanged here; confirm whether the encoded frame was meant to be split and
    evaluated instead.

    Args:
        X: Feature frame handed to ``features.NumericalFeatures``.
        y: Target values. Currently unused; kept so existing callers keep working.
        columns: Column names the encoding is applied to.
        encoding_list: Iterable of encoder names to try.

    Returns:
        tuple: ``(clf_val_df, clf_test_df)`` — the concatenated validation and
        test score frames, or two empty DataFrames when no encoding produced a
        result.
    """
    import warnings

    val_frames, test_frames = [], []

    # Loop over each encoding and collect its results
    for encoding in encoding_list:
        encodings_features = {encoding: (columns, "mode")}
        num = features.NumericalFeatures(X, encodings_features)
        try:
            # The transformed frame is not used downstream (see NOTE(review) above);
            # the call is kept because its ValueError drives the skip logic below.
            num.fit_transform()
            df_val_score, df_test_score, _ = ml.evaluate_classifiers(
                X_train, y_train, X_test, y_test, is_binary=True, cv=cv, sort_by="f1-score"
            )
        except ValueError as ve:
            # box-cox is expected to fail when the data is not strictly positive, so
            # it is skipped quietly. Any other encoder is still skipped (as before),
            # but now with a visible warning instead of disappearing silently.
            if encoding != 'box-cox':
                warnings.warn(
                    f"Skipping encoding {encoding!r}: {ve}", RuntimeWarning, stacklevel=2
                )
            continue

        df_val_score["encoder"] = encoding
        df_test_score["encoder"] = encoding
        val_frames.append(df_val_score)
        test_frames.append(df_test_score)

    # Concatenate once at the end; pd.concat raises on an empty list, so fall back
    # to the empty frames the original implementation returned in that case.
    clf_val_df = pd.concat(val_frames) if val_frames else pd.DataFrame()
    clf_test_df = pd.concat(test_frames) if test_frames else pd.DataFrame()
    return clf_val_df, clf_test_df


# Benchmark every encoder in `encoding_list`, applied to all feature columns.
clf_val_df, clf_test_df = eval_clf_with_transformations(X, y, X.columns.to_list(), encoding_list)

### Let's view the validation and test performance with each transformation

In [None]:
# Five rows with the highest mean validation score.
clf_val_df.sort_values(by='mean_val_score', ascending=False).head(5)

In [None]:
# All test-score rows, best f1-score first.
clf_test_df.sort_values(by='f1-score', ascending=False)

### XGBoost, LightGBM gave the best performances on the Test results
- We will use the min-max scaler as the transformation
- Let's select only a limited set of features from the data and evaluate on them

#### View the information gain of each column

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each feature and the target, labelled by column name.
mi_scores = mutual_info_classif(X, y)
info_gain = pd.Series(mi_scores, index=X.columns)
# Five most informative features.
info_gain.sort_values(ascending=False).head(5)

#### Select best 20 columns with Min-Max Scaler

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# `k` is passed by keyword: current scikit-learn releases accept only `score_func`
# positionally and raise a TypeError for `SelectKBest(mutual_info_classif, 20)`.
select_best = SelectKBest(mutual_info_classif, k=20)
select_best.fit(X, y)
# Names of the 20 selected columns, in their original order.
best_columns = X.columns[select_best.get_support()].to_list()
X_best = X[best_columns]

# Stratified 80/20 split on the selected columns only.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_best, y, test_size=0.2, stratify=y)
clf_val_df, clf_test_df = eval_clf_with_transformations(X, y, best_columns, ['min-max'])

In [None]:
# Score of every feature from the fitted SelectKBest selector, highest first.
best_score_df = pd.Series(select_best.scores_, index=X.columns.to_list()).sort_values(ascending=False)
top_scores = best_score_df.head(10)

# Plot for feature importance
plt.figure(figsize=(20, 8))
plt.style.use('fivethirtyeight')
sns.set_style("white")
sns.barplot(x=top_scores.index, y=top_scores, palette='muted')
# The selector was built with mutual_info_classif, so these are mutual-information
# scores, not Gini importances; the title is labelled accordingly.
plt.title('Importance for the Top 10 Features (mutual information)',
          fontweight='bold')
plt.grid(True, alpha=0.1, color='black')
plt.show()

In [None]:
# Scores returned by the K-best / min-max evaluation above.
clf_test_df

#### Use PCA

In [None]:
from sklearn.decomposition import PCA

# Reduce the selected feature columns to 15 principal components.
# NOTE(review): PCA is fitted on all of X_best, before the train/test split and without
# any scaling step in this cell — confirm both are intended.
pca = PCA(n_components=15)

X_pca = pca.fit_transform(X_best)
# New stratified 80/20 split on the PCA output; this overwrites the previous global split.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_pca, y, test_size=0.2, stratify=y)
# Results land in df_val_score / df_test_score (clf_val_df / clf_test_df are not updated here).
df_val_score, df_test_score, search_grids = ml.evaluate_classifiers(
    X_train, y_train, X_test, y_test, is_binary=True, cv=cv, sort_by="f1-score"
)

In [None]:
# Show the scores from the PCA evaluation above. `clf_test_df` was not reassigned by that
# cell and still holds the earlier K-best results, so displaying it here would be stale.
df_test_score

## Fine Tuning 
Let's fine-tune RandomForest, AdaBoost and XGBoost on the K-best columns

In [None]:
# Fresh stratified 80/20 split on the K-best columns; replaces the PCA-based split made above.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_best, y, test_size=0.2, stratify=y)
# Tune the 'xgb' classifier on the training split (randomized=True — presumably a randomized search; see ml_framework).
xgb_clf = ml.fine_tune_classifier('xgb', X_train, y_train, cv=cv, randomized=True )

In [None]:
# Tune the 'ada' classifier on the same training split and CV folds (randomized=True).
ada_clf = ml.fine_tune_classifier('ada', X_train, y_train, cv=cv, randomized=True )

In [None]:
# Tune the 'rf' classifier; unlike the two calls above this one passes randomized=False.
rf_clf = ml.fine_tune_classifier('rf', X_train, y_train, cv=cv, randomized=False )

### Final Prediction
Let's view the confusion matrix and classification report for the trained models

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# For each tuned model: predict on the held-out split, plot the confusion matrix,
# then print the classification report.
for model in (rf_clf, ada_clf, xgb_clf):
    y_pred = model.predict(X_test)
    plot_title = f'Confusion Matrix - {type(model).__name__}'

    cnf = confusion_matrix(y_test, y_pred)
    charts.plot_confusion_matrix(cnf, le.classes_, plot_title)

    report = classification_report(y_test, y_pred, target_names=le.classes_)
    print(report)

### AdaBoost and RandomForest give 98% accuracy
Let's take AdaBoost as final classifier for prediction

In [None]:
# Repeat the held-out evaluation for the chosen model only.
final_model = ada_clf
y_pred = final_model.predict(X_test)
plot_title = f'Confusion Matrix - {type(final_model).__name__}'

cnf = confusion_matrix(y_test, y_pred)
charts.plot_confusion_matrix(cnf, le.classes_, plot_title)

report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)