In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import skew, norm 
import plotly.express as px
from warnings import filterwarnings as filt

filt('ignore')
plt.rcParams['figure.figsize'] = (12,6)
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later dropped, so use whichever spelling this install provides.
_darkgrid_style = 'seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'seaborn-darkgrid'
plt.style.use(_darkgrid_style)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the CSV and discard the two columns that are not used as features
# ('id' and 'Unnamed: 32'), then preview the first rows.
df = (
    pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
    .drop(columns = ['id', 'Unnamed: 32'])
)
df.head()

In [None]:
# (rows, columns) of the frame after dropping the unused columns.
df.shape

In [None]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

In [None]:
# Class balance of the diagnosis labels. The column is passed by keyword because
# seaborn 0.12+ only accepts `data` positionally; `x=` works on older releases too.
sns.countplot(x = df.diagnosis)

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Encode the target: 'B' -> 0, every other label -> 1.
# The guard makes the cell safe to re-run: once the column is numeric, none of
# its values equals 'B', so an unguarded re-run would collapse every label to 1.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] != 'B').astype(int)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
from pdpbox import pdp
import shap
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.feature_selection import mutual_info_classif

In [None]:
def permImp(x, y):
    """Show permutation importances of a random forest fitted on (x, y).

    A default random forest is trained on the given data, eli5's
    ``PermutationImportance`` is fitted on that same data, and the resulting
    eli5 weights display (labelled with the column names of ``x``) is returned.
    """
    forest = rfc().fit(x, y)
    importance = PermutationImportance(forest).fit(x, y)
    feature_names = x.columns.tolist()
    return eli5.show_weights(importance, feature_names = feature_names)

def isolate(x, y, col):
    """Plot the partial dependence of a random forest on a single feature.

    Fits a default random forest on (x, y), computes the pdpbox isolation
    result for column ``col`` over ``x`` and returns the value of
    ``pdp.pdp_plot`` for it.
    """
    forest = rfc().fit(x, y)
    isolated = pdp.pdp_isolate(
        forest,
        dataset = x,
        model_features = x.columns,
        feature = col,
    )
    return pdp.pdp_plot(isolated, feature_name = col)

def forceplot(x, y, n_class = 0):
    """Return a SHAP force plot for one class of a random forest fitted on (x, y).

    Parameters
    ----------
    x : DataFrame of features; its column names label the plot.
    y : target labels used to fit the forest.
    n_class : index of the class whose SHAP values and expected value are shown.

    Returns
    -------
    Whatever ``shap.force_plot`` returns for the selected class.
    """
    model = rfc().fit(x, y)
    explainer = shap.TreeExplainer(model)
    # `shap_values` is a method: it has to be called on the data. (Subscripting
    # the bound method, as the previous version did, raises TypeError.)
    shap_values = explainer.shap_values(x)
    if isinstance(shap_values, list):
        # one (n_samples, n_features) array per class
        class_shap = shap_values[n_class]
    elif getattr(shap_values, 'ndim', 2) == 3:
        # single array with the class on the last axis
        class_shap = shap_values[:, :, n_class]
    else:
        class_shap = shap_values
    expected_value = explainer.expected_value[n_class]
    return shap.force_plot(expected_value, class_shap, feature_names = x.columns)

def plot_mi(score):
    """Draw a horizontal bar chart of the 'mi_score' column, smallest first.

    ``score`` is a DataFrame with an 'mi_score' column; its index supplies the
    bar labels. Returns the container produced by ``plt.barh``.
    """
    ordered = score.sort_values(by = 'mi_score')
    return plt.barh(ordered.index, ordered['mi_score'])

def mi_score(x, y):
    """Compute, plot and return mutual-information scores of each column of x with y.

    All features are treated as continuous (``discrete_features = False``).
    The scores are plotted with ``plot_mi`` and returned as a one-column
    DataFrame ('mi_score') indexed by feature name, highest score first.
    """
    mi_values = mutual_info_classif(x, y, discrete_features = False)
    table = pd.DataFrame({'mi_score': mi_values}, index = x.columns)
    plot_mi(table)
    return table.sort_values(by = 'mi_score', ascending = False)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

In [None]:
# Separate features from the target, then hold out 20% of the rows as a test
# set, stratified on the target with a fixed seed.
x = df.drop(columns = ['diagnosis'])
y = df['diagnosis']
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size = 0.2,
    random_state = 123,
    stratify = y,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# Permutation importance of every feature, computed on the training split.
permImp(x_train, y_train)

In [None]:
# Pairwise correlation of all columns, including the numerically encoded target.
sns.heatmap(df.corr())

In [None]:
# Ten columns most positively correlated with the target. `diagnosis` itself is
# part of `df`, so it appears in this list with a correlation of 1.
df.corrwith(df.diagnosis).sort_values(ascending = False).head(10)

In [None]:
# 'concave points_worst' against 'texture_worst', coloured by diagnosis.
sns.scatterplot(data = df, x = 'concave points_worst', y = 'texture_worst', hue = 'diagnosis')

In [None]:
# Partial dependence of a random forest on 'concave points_worst' (training split).
isolate(x_train, y_train, 'concave points_worst')

In [None]:
# Summary statistics of the training features.
x_train.describe()

In [None]:
# Mutual-information score of each training feature with the target
# (mi_score also draws the bar chart as a side effect).
mscore = mi_score(x_train, y_train)

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRFClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate, KFold

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler , RobustScaler, MinMaxScaler

In [None]:
def best_model(x, y):
    """Cross-validate several classifiers under several feature scalers.

    Every classifier is evaluated four times with 5-fold cross-validation
    (shuffled, seed 123, F1 scoring): once on the raw features and once behind
    each of StandardScaler, RobustScaler and MinMaxScaler.

    Returns a DataFrame with one row per classifier and one column per scaling
    option ('None', 'std', 'robust', 'minmax') holding the mean test F1.
    """
    xgb.set_config(verbosity=0)
    models = [LogisticRegression(), SVC(), KNeighborsClassifier(), GaussianNB(), rfc(), XGBRFClassifier(), LGBMClassifier()]
    names = ['logistic regg','svm', 'knn', 'naive bayes', 'random forest', 'xgboost', 'lightgb']
    scalers = [None, StandardScaler(), RobustScaler(), MinMaxScaler()]
    scores = [[] for _ in scalers]
    # Same shuffled split for every candidate (fixed seed), so scores are comparable.
    cv = KFold(5, shuffle = True, random_state = 123)
    for model in models:
        for idx, scaler in enumerate(scalers):
            # Build a fresh estimator per scaler. Rebinding `model` here (as the
            # previous version did) nests each new pipeline around the previous
            # one, so 'robust' and 'minmax' were measured on already-scaled data.
            if scaler is None:
                estimator = model
            else:
                estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
            score = cross_validate(estimator, X = x, y = y, cv = cv, scoring = 'f1')['test_score'].mean()
            scores[idx].append(score)
    return pd.DataFrame(scores, columns = names, index = ['None', 'std', 'robust', 'minmax']).T

def clf_report(yt, pred):
    """Print sklearn's classification report for (yt, pred), framed by blank lines."""
    report = classification_report(yt,  pred)
    print('', report, '', sep = '\n')
    
def get_score(xt, yt, xtest, ytest, model, scaler = None):
    """Fit a model (optionally behind a scaler) and report train/test quality.

    The estimator is fitted on (xt, yt). The function then prints the training
    and testing ``score``, prints the classification report for the test
    predictions, and draws the test confusion matrix as an annotated heatmap.
    """
    estimator = Pipeline(steps = [('scaler', scaler), ('model', model)]) if scaler else model
    estimator.fit(xt, yt)
    predictions = estimator.predict(xtest)

    print(' Report '.center(60,'='))
    print()
    print(f"training score  :===>  {estimator.score(xt, yt)}")
    print(f"testing score   :===>  {estimator.score(xtest, ytest)}")
    clf_report(ytest, predictions)

    matrix = confusion_matrix(ytest, predictions)
    sns.heatmap(matrix, fmt = '.1f', annot = True)
    
    
def gridcv(xt, yt, model, params, scaler = None):
    """Grid-search ``params`` for ``model`` with 5-fold CV and F1 scoring.

    When ``scaler`` is given, the model is wrapped in a Pipeline with steps
    'scaler' and 'model', so grid keys must then use the ``model__`` prefix.

    Returns ``(best_estimator, best_params, results)``, where ``results`` holds
    the mean train score, mean test score and parameters of every grid point,
    sorted by mean test score in descending order.
    """
    estimator = model
    if scaler:
        estimator = Pipeline(steps = [('scaler', scaler), ('model', estimator)])
    search = GridSearchCV(
        estimator,
        param_grid = params,
        cv = KFold(5, shuffle = True, random_state = 123),
        scoring = 'f1',
        return_train_score = True,
        verbose = 1,
    )
    search.fit(xt, yt)
    ranked = pd.DataFrame(search.cv_results_).sort_values('mean_test_score', ascending = False)
    report_cols = ['mean_train_score','mean_test_score','params']
    return search.best_estimator_, search.best_params_, ranked[report_cols]

In [None]:
# Mean cross-validated F1 of every classifier / scaler combination on the training split.
best_model(x_train, y_train)

In [None]:
# Hyper-parameter grid for logistic regression. Keys carry the 'model__' prefix
# because gridcv wraps the estimator in a Pipeline whose final step is named
# 'model' whenever a scaler is supplied.
params = {
    'model__C' : [1, 10, 50, 100, 500, 1000],
    'model__solver' : ['lbfgs', 'liblinear'],
    'model__class_weight' : [None, 'balanced'],
    'model__max_iter' : [100,1000]
}
# Returns the refitted best pipeline, its parameters and the per-candidate score table.
clf, best_param, results = gridcv(x_train, y_train, LogisticRegression(), params, scaler = StandardScaler())

In [None]:
# Best five grid points (the table is already sorted by mean test score).
results.head()

In [None]:
# Mean train vs. mean cross-validated F1 for every grid candidate, plotted
# against the candidate's index in the grid-search results table.
# x/y are passed by keyword (seaborn 0.12+ rejects them positionally) and each
# line carries its own label so the legend cannot drift out of sync.
sns.lineplot(x = results.index, y = results.mean_train_score, color = 'blue', label = 'mean_train_score')
sns.lineplot(x = results.index, y = results.mean_test_score, color = 'red', label = 'mean_test_score')
plt.legend()

In [None]:
# Parameter combination selected by the grid search.
best_param

In [None]:
# Final evaluation on the held-out test split with a standard-scaled logistic regression.
# NOTE(review): C = 0.1 is not one of the values searched in the grid above
# ([1, 10, 50, 100, 500, 1000]) and the tuned `clf` / `best_param` are not used
# here -- confirm this manual choice is intentional.
get_score(x_train, y_train, x_test, y_test, LogisticRegression(C = 0.1), scaler = StandardScaler())

In [None]:
# Absolute skewness of every column, sorted ascending and shown as horizontal bars.
skewness = (
    pd.DataFrame({'skew_score': np.abs(skew(df))}, index = df.columns)
    .sort_values(by = 'skew_score')
)
plt.barh(skewness.index, skewness['skew_score'])
plt.title('skewness score')

In [None]:
# Distribution (left) and box plot (right) for every column whose absolute
# skewness exceeds 2.
high_skewness = skewness[skewness.skew_score > 2].index
if len(high_skewness):  # plt.subplots cannot build a grid with zero rows
    # squeeze = False keeps `ax` two-dimensional even when only one column qualifies.
    fig, ax = plt.subplots(len(high_skewness), 2, figsize = (16, 10), squeeze = False)
    for ind, col in enumerate(high_skewness):
        # distplot is deprecated in recent seaborn but still available; it is
        # kept so the notebook stays compatible with older releases.
        sns.distplot(df[col], ax = ax[ind, 0])
        # keyword `x=` keeps the box horizontal and works on seaborn 0.12+.
        sns.boxplot(x = df[col], ax = ax[ind, 1])
    # Lay out after drawing so labels and titles are taken into account.
    fig.tight_layout()

* Many features have extreme outliers. Because these outliers carry important information for this dataset, we keep them.
* The final F1-score is 98%.