In [None]:
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import norm
from matplotlib import gridspec
import scipy.stats as stats
from scipy.stats import shapiro
!pip install feature_engine
import feature_engine.transformation as vt
from feature_engine.outliers import Winsorizer
from sklearn.model_selection import train_test_split
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
#import dataset
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

#### Take a general look at the data

In [None]:
df.head()

In [None]:
df.info()

#### Check for missing values

In [None]:
import missingno as msno
msno.matrix(df)

In [None]:
msno.bar(df)

In [None]:
# Drop the "Unnamed: 32" column since it contains no values.
# errors='ignore' keeps this cell re-runnable once the column is already gone
# (a plain `del` raises KeyError on the second run).
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

### Univariate Data Analysis
<br>

##### Descriptive statistics of the whole dataset
<br>

In [None]:
df.drop('id',axis = 1).describe().T

##### Descriptive statistics of the "Benign" tumor
<br>

In [None]:
df[df['diagnosis'] == 'B'].describe().T

##### Descriptive statistics of the "Malignant" tumor  
<br>

In [None]:
df[df['diagnosis'] == 'M'].describe().T

#### Column : diagnosis
<br>

In [None]:
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(6, 6))
# draw on the axes created above instead of relying on the implicit current axes
sns.countplot(x="diagnosis", data=df, ax=ax)
# annotations: write each bar's count just above it
# (bar heights are floats, so format them without the trailing ".0")
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., height + 1, f'{height:.0f}', ha="center")

#### <li>About 37.3 % (212) of the total patients (569) have a malignant tumor.</li>

#### Check the distribution of the variables
<br>

In [None]:
def plotHistBox(col,hexColor = '#00aeff'):
    """Plot a histogram (with KDE curve) and a boxplot of ``df[col]`` side by side.

    Parameters
    ----------
    col : str
        Name of the column of the notebook-level ``df`` to plot.
    hexColor : str, optional
        Colour passed to both seaborn plots.
    """
    # Create both panels in a single call; width_ratios makes the boxplot panel
    # wider. Creating axes with plt.subplots and then adding GridSpec subplots
    # on top of them would leave two sets of axes in the same figure.
    fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(15, 4),
                                   gridspec_kw={'width_ratios': [1, 1.7]})
    #set panel titles
    ax0.set_title('Histplot with KDE estimates')
    ax1.set_title('Boxplot')
    sns.histplot(df[col], kde=True, color=hexColor, ax=ax0)
    sns.boxplot(x=df[col], color=hexColor, ax=ax1)
    fig.suptitle(f'Column : {col}', fontsize=16)
    

In [None]:
plotHistBox('radius_mean')

In [None]:
for col in df.columns[3:]:
    plotHistBox(col)

#### Check the skewness and kurtosis of the columns
<br>

In [None]:
# Restrict the statistics to df.columns[2:] (presumably the feature columns that
# follow 'id' and 'diagnosis' -- verify against df.head()). Selecting the columns
# explicitly keeps the string 'diagnosis' column out of skew()/kurt(), which
# recent pandas versions refuse to reduce, and avoids slicing the result by position.
featureFrame = df[df.columns[2:]]
kurtSkewDict = {
    "Skewness" : featureFrame.skew().values,
    "Kurtosis" : featureFrame.kurt().values,
}


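The Gaussian distribution has a skewness of 0 and a kurtosis of 3, i.e. an excess kurtosis of 0. Note that pandas `kurt()` reports the excess kurtosis (Fisher's definition), so a normally distributed column scores about 0 in the table below, not 3.<br>
Let's check the skewness and kurtosis of the variables.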

In [None]:
#create a dataFrame for the Skewness and Kurtosis
kurtSkewFrame = pd.DataFrame(data=kurtSkewDict,index = df.columns[2:])
kurtSkewFrame


### Bivariate Analysis
<br>

#### Check the distribution of the variables with respect to target 
<br>

In [None]:
def plotHistBoxBi(col):
    """Plot histograms and boxplots of ``df[col]`` split by ``diagnosis``.

    Parameters
    ----------
    col : str
        Name of the column of the notebook-level ``df`` to plot.
    """
    # Create both panels in a single call; width_ratios makes the boxplot panel
    # wider. Creating axes with plt.subplots and then adding GridSpec subplots
    # on top of them would leave two sets of axes in the same figure.
    fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(15, 4),
                                   gridspec_kw={'width_ratios': [1, 1.5]})
    #set panel titles
    ax0.set_title('Histplots')
    ax1.set_title('Boxplots')
    sns.histplot(data=df, x=col, hue='diagnosis', ax=ax0)
    sns.boxplot(data=df, x=col, y='diagnosis', hue='diagnosis', ax=ax1)
    fig.suptitle(f'Column : {col}', fontsize=16)

In [None]:
plotHistBoxBi('radius_mean')

In [None]:
for col in df.columns[3:]:
    plotHistBoxBi(col)

#### Check the correlation
<br>

In [None]:
#create a class to return the top correlated features and also plot them
class correlationInfo():
    """Top correlations of one column of the notebook-level ``df``.

    On construction the correlation of ``col`` with every column in
    ``df.columns[2:]`` is computed and the nine largest values (excluding
    ``col`` itself) are kept in ``self.corrFrame``.
    """

    def __init__(self,col):
        """Store ``col`` and compute its nine most positively correlated columns."""
        self.col = col
        corr_with_col = df[df.columns[2:]].corr()[self.col]
        # remove the column's own correlation by label rather than by position,
        # so a tie at 1.0 can never drop the wrong entry
        self.corrFrame = (corr_with_col
                          .drop(labels=self.col)
                          .sort_values(ascending = False)
                          .head(9))

    def corrVal(self):
        """Print a heading and return the stored correlations as a one-column DataFrame."""
        corrFrame = self.corrFrame.to_frame()
        corrFrame.columns = ['Correlation_values']
        # report this instance's column; a bare `col` would pick up whatever
        # global loop variable happens to exist in the notebook
        print(f'Top nine features most correlatd to {self.col}')
        return corrFrame

    def correlationPlot(self):
        """Scatter-plot ``self.col`` against each stored column in a 3x3 grid."""
        #grab the top nine most correlated attributes with the col
        corrCol = self.corrFrame.index
        fig,axes = plt.subplots(nrows = 3,ncols = 3,figsize = (15,12))
        # axes.flat walks the grid row by row, one panel per correlated feature
        for cor_col, panel in zip(corrCol, axes.flat):
            sns.scatterplot(data=df, x=self.col, y=cor_col, hue="diagnosis", size="diagnosis",ax = panel)
        fig.suptitle(f'Top 9 most correated features with {self.col}',size = 16)
    
        

In [None]:
correlationInfo('radius_mean').corrVal()

In [None]:
correlationInfo('radius_mean').correlationPlot()

In [None]:
correlationInfo('symmetry_worst').corrVal()

In [None]:
correlationInfo('symmetry_worst').correlationPlot()

##### You can plot and check as many features as you want!
<br>

### Multivariate Data Analysis
<br>

In [None]:
#change the label of the target variable: Malignant ('M') -> 1, Benign ('B') -> 0
# map() produces a numeric column in one step; chaining replace() on an object
# column depends on pandas silently downcasting the result.
# Note: any value other than 'M'/'B' would become NaN here.
df['diagnosis_e'] = df['diagnosis'].map({'M': 1, 'B': 0})

In [None]:
print('The top nine features most correlated with Malignant tumor is ')
# Correlate the numeric columns only: the string 'diagnosis' column cannot be
# correlated (recent pandas raises on it) and 'id' is an identifier, not a feature.
target_corr = df.drop(columns=['id', 'diagnosis']).corr()['diagnosis_e']
# remove the target's own correlation by label rather than by position
target_corr.drop(labels='diagnosis_e').sort_values(ascending = False).head(9)

In [None]:
print("Pair plot of the top features with most correlation with the target (part-1)")
# Rank the features by correlation with the encoded target. The string
# 'diagnosis' column and the 'id' identifier are excluded before corr(), and the
# target's own correlation is removed by label.
top_corr = (df.drop(columns=['id', 'diagnosis'])
              .corr()['diagnosis_e']
              .drop(labels='diagnosis_e')
              .sort_values(ascending = False))
# four highest-ranked features
sns.pairplot(df,vars = top_corr.head(4).index,
             hue="diagnosis",diag_kind="hist",height=3,markers=['v', '^'])

In [None]:
print("Pair plot of the top features with most correlation with the target (part-2)")
# Rank the features by correlation with the encoded target. The string
# 'diagnosis' column and the 'id' identifier are excluded before corr(), and the
# target's own correlation is removed by label.
top_corr = (df.drop(columns=['id', 'diagnosis'])
              .corr()['diagnosis_e']
              .drop(labels='diagnosis_e')
              .sort_values(ascending = False))
# features ranked 5th to 9th
sns.pairplot(df,vars = top_corr.iloc[4:9].index,
             hue="diagnosis",diag_kind="hist",markers=['v', '^'])

#### Outlier Handling
<br>
* We cap the points outside the 1.5 * IQR range.

In [None]:
# Separate into train and test sets
X_train, X_test, y_train, y_test =  train_test_split(
            df.drop(['id', 'diagnosis','diagnosis_e'], axis=1),
            df['diagnosis_e'], test_size=0.3, random_state=69)

In [None]:
var_cols = df.drop(['id','diagnosis','diagnosis_e'],axis = 1).columns.to_list()

In [None]:
windsoriser = Winsorizer(capping_method='iqr', # choose iqr for IQR rule boundaries or gaussian for mean and std
                          tail='both', # cap left, right or both tails 
                          fold=1.5,
                          variables = var_cols)

windsoriser.fit(X_train)

In [None]:
# Cap outliers in the *training* set only, using the winsoriser fitted on X_train.
# NOTE(review): X_test is never passed through the winsoriser, so the test set
# keeps its uncapped values -- the original note gave "it might lead to
# overfitting" as the reason; confirm this asymmetry is intended.
X_train = windsoriser.transform(X_train)

#### Scaling the data

In [None]:
from sklearn.preprocessing import RobustScaler
rs = RobustScaler()
X_train = rs.fit_transform(X_train)
X_test = rs.transform(X_test)

### Model Fitting
<br>

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from scipy.stats import loguniform
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
def plot_condution_metrics(y_test,predictions):
    """Draw the confusion matrix of ``predictions`` against ``y_test`` as a heatmap.

    The title reports the fraction of predictions that equal the true label.
    """
    # share of exact matches between true and predicted labels
    matches = [y_test == predictions]
    accuracy = np.mean(matches)
    # confusion matrix
    confusion = metrics.confusion_matrix(y_test, predictions)
    # annotated heatmap without a colour bar
    sns.heatmap(confusion, annot=True, fmt=".0f", linewidths=1, square = True, cbar = False)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('TEST ACCURACY SCORE: {0}'.format(accuracy), size = 15)

In [None]:
print("Training Shape",X_train.shape)
print("Testing Shape",X_test.shape)

#### Logistic Regression (Lasso)
<br>

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
parameters = {'penalty': ['l1'],"fit_intercept":[True,False],"C" :np.logspace(-4,4,16),'solver' : ['liblinear'],
              'max_iter' : [500,1000]
             }

In [None]:
gridsearch = GridSearchCV(LogisticRegression(), parameters)
gridsearch.fit(X_train, y_train)

In [None]:
gridsearch.best_params_

In [None]:
score  = cross_val_score(gridsearch, X_train, y_train, cv=9,scoring='accuracy')
print(f'The mean Cross-Valiation Score is {score.mean()}')
print(f'The Training Score is {gridsearch.score(X_train, y_train)}')

In [None]:
test_pred = gridsearch.predict(X_test)
# ROC AUC must be computed from a continuous score, not thresholded 0/1 labels;
# use the predicted probability of the positive class (label 1, second column).
test_proba = gridsearch.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, test_proba)
print(f"The score on the Test-dataset is {gridsearch.score(X_test, y_test)}")
print(f"The ROC_AUC score on the Test-dataset is {auc}")

In [None]:
plot_condution_metrics(y_test,test_pred)

In [None]:
model_performance = {}
auc_socre = {}
model_performance['Logistic Regression(Lasso)'] = gridsearch.score(X_test, y_test)
auc_socre['Logistic Regression(Lasso)'] = auc

#### Logistic Regression (Ridge)
<br>

In [None]:
parameters = {'penalty': ['l2'],"fit_intercept":[True,False],"C" : np.logspace(-4,4,16),
              'solver' : ['newton-cg','liblinear'],
              'max_iter' : [100,500]
             }

In [None]:
gridsearch = GridSearchCV(LogisticRegression(), parameters)
gridsearch.fit(X_train, y_train)

In [None]:
gridsearch.best_params_

In [None]:
score  = cross_val_score(gridsearch, X_train, y_train, cv=9,scoring='accuracy')
print(f'The mean Cross-Valiation Score is {score.mean()}')
print(f'The Training Score is {gridsearch.score(X_train, y_train)}')


In [None]:
test_pred = gridsearch.predict(X_test)
# ROC AUC must be computed from a continuous score, not thresholded 0/1 labels;
# use the predicted probability of the positive class (label 1, second column).
test_proba = gridsearch.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, test_proba)
print(f"The score on the Test-dataset is {gridsearch.score(X_test, y_test)}")
print(f"The ROC_AUC score on the Test-dataset is {auc}")

In [None]:
plot_condution_metrics(y_test,test_pred)

In [None]:
model_performance['Logistic Regression(Ridge)'] = gridsearch.score(X_test, y_test)
auc_socre['Logistic Regression(Ridge)'] = auc

#### Polynomial Logistic Regression
<br>

In [None]:
from sklearn.preprocessing import PolynomialFeatures
parameters = {'penalty': ['l1','l2'],"fit_intercept":[True,False],"C" :np.logspace(-4,4,16),'solver' : ['liblinear'],
              'max_iter' : [500,1000]
             }

In [None]:
poly_features = PolynomialFeatures(degree = 2)

train_poly = poly_features.fit_transform(X_train)
test_poly = poly_features.transform(X_test)

gridsearch = GridSearchCV(LogisticRegression(), parameters)
gridsearch.fit(train_poly, y_train)

In [None]:
gridsearch.best_params_

In [None]:
score  = cross_val_score(gridsearch, train_poly, y_train, cv=9,scoring='accuracy')
print(f'The mean Cross-Valiation Score is {score.mean()}')
print(f'The Training Score is {gridsearch.score(train_poly, y_train)}')

In [None]:
test_pred = gridsearch.predict(test_poly)
# ROC AUC must be computed from a continuous score, not thresholded 0/1 labels;
# use the predicted probability of the positive class (label 1, second column).
test_proba = gridsearch.predict_proba(test_poly)[:, 1]
auc = roc_auc_score(y_test, test_proba)
print(f"The score on the Test-dataset is {gridsearch.score(test_poly, y_test)}")
print(f"The ROC_AUC score on the Test-dataset is {auc}")

In [None]:
plot_condution_metrics(y_test,test_pred)

In [None]:
model_performance['Polynomial Logistic Regression'] = gridsearch.score(test_poly, y_test)
auc_socre['Polynomial Logistic Regression'] = auc

#### KNN Classification
<br>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
parameters = {
    'n_neighbors' :np.arange(1,50),
    "weights" : ['uniform','distance'],
    "p" : [1,2],
    
}

In [None]:
gridsearch = GridSearchCV(KNeighborsClassifier(), parameters)
gridsearch.fit(X_train, y_train)

In [None]:
gridsearch.best_params_

In [None]:
score  = cross_val_score(gridsearch, X_train, y_train, cv=9,scoring='accuracy')
print(f'The mean Cross-Valiation Score is {score.mean()}')
print(f'The Training Score is {gridsearch.score(X_train, y_train)}')


In [None]:
test_pred = gridsearch.predict(X_test)
# ROC AUC must be computed from a continuous score, not thresholded 0/1 labels;
# use the predicted probability of the positive class (label 1, second column).
test_proba = gridsearch.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, test_proba)
print(f"The score on the Test-dataset is {gridsearch.score(X_test, y_test)}")
print(f"The ROC_AUC score on the Test-dataset is {auc}")

In [None]:
plot_condution_metrics(y_test,test_pred)

In [None]:
model_performance['KNN Classifier'] = gridsearch.score(X_test, y_test)
auc_socre['KNN Classifier'] = auc

#### Naive Bayes
<br>

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()
test_pred = gnb.fit(X_train, y_train).predict(X_test)

In [None]:
score  = cross_val_score(gnb, X_train, y_train, cv=9,scoring='accuracy')
print(f'The mean Cross-Valiation Score is {score.mean()}')
print(f'The Training Score is {gnb.score(X_train, y_train)}')

In [None]:
test_pred = gnb.predict(X_test)
# ROC AUC must be computed from a continuous score, not thresholded 0/1 labels;
# use the predicted probability of the positive class (label 1, second column).
test_proba = gnb.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, test_proba)
print(f"The score on the Test-dataset is {gnb.score(X_test, y_test)}")
print(f"The ROC_AUC score on the Test-dataset is {auc}")

In [None]:
plot_condution_metrics(y_test,test_pred)

In [None]:
model_performance['Naive Bayes (gaussian)'] = gnb.score(X_test, y_test)
auc_socre['Naive Bayes (gaussian)'] = auc

#### Support Vector Machine
<br>

In [None]:
from sklearn.svm import SVC

In [None]:
# candidate kernel widths for the RBF kernel, plus scikit-learn's 'scale' heuristic
gamma = list(np.logspace(-4,4,16))
gamma.append('scale')
# One sub-grid per kernel: the linear kernel ignores gamma, so crossing it with
# every gamma value would only refit identical models many times over.
param_grid = [
    {
        "kernel" : ['rbf'],
        "C" : np.logspace(-4,4,16),
        "gamma" : gamma,
    },
    {
        "kernel" : ['linear'],
        "C" : np.logspace(-4,4,16),
    },
]

In [None]:
gridsearch = GridSearchCV(SVC(), param_grid)
gridsearch.fit(X_train, y_train)

In [None]:
gridsearch.best_params_

In [None]:
score  = cross_val_score(gridsearch, X_train, y_train, cv=9,scoring='accuracy')
print(f'The mean Cross-Valiation Score is {score.mean()}')
print(f'The Training Score is {gridsearch.score(X_train, y_train)}')


In [None]:
test_pred = gridsearch.predict(X_test)
# ROC AUC must be computed from a continuous score, not thresholded 0/1 labels.
# SVC is fitted without probability estimates here, so use the signed distance
# to the decision boundary as the score.
test_score = gridsearch.decision_function(X_test)
auc = roc_auc_score(y_test, test_score)
print(f"The score on the Test-dataset is {gridsearch.score(X_test, y_test)}")
print(f"The ROC_AUC score on the Test-dataset is {auc}")

In [None]:
plot_condution_metrics(y_test,test_pred)

In [None]:
model_performance['Support Vector Machine'] = gridsearch.score(X_test, y_test)
auc_socre['Support Vector Machine'] = auc

#### Decision Trees
<br>

In [None]:
from sklearn.tree import DecisionTreeClassifier,plot_tree
clf_dt = DecisionTreeClassifier(random_state=42)
#build a preliminary tree
clf_dt.fit(X_train, y_train)

In [None]:
fig,ax = plt.subplots(figsize = (25,12))
ax = plot_tree(
    clf_dt,
    filled = True,
    rounded = True,
    class_names = ['Benign',"Malignant"],
    feature_names = var_cols
    
)

In [None]:
score  = cross_val_score(clf_dt, X_train, y_train, cv=9,scoring='accuracy')
print(f'The mean Cross-Valiation Score is {score.mean()}')
print(f"The score on the Train-dataset is {clf_dt.score(X_train, y_train)}")

##### As we can clearly see our tree overfits the data.
<br>

###### Cost-complexity pruning

In [None]:
path = clf_dt.cost_complexity_pruning_path(X_train,y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]

cct_dts = []

for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=42,ccp_alpha = ccp_alpha)
    clf_dt.fit(X_train,y_train)
    cct_dts.append(clf_dt)

In [None]:
train_scores = [clf.score(X_train, y_train) for clf in cct_dts]
test_scores = [clf.score(X_test, y_test) for clf in cct_dts]

fig, ax = plt.subplots(figsize = (15,6))
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
plt.show()

Using cross-validation to find the optimal value of alpha
<br>

In [None]:
#looking at the figure and 'eye-balling' we see the alpha of 0.005 could be a bette value
# using K-fold CV
clf_dt = DecisionTreeClassifier(random_state=42,ccp_alpha = 0.005)
scores = cross_val_score(clf_dt, X_train, y_train, cv=9,scoring='accuracy')
#plot 
df_cv = pd.DataFrame(data = {'tree' : range(9),'accuracy':scores})
df_cv.plot(x = 'tree',y = 'accuracy',marker = 'o',linestyle = '--')

In [None]:
alpha_loop_values = []

for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=42,ccp_alpha = ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=9,scoring='accuracy')
    alpha_loop_values.append([ccp_alpha,np.mean(scores),np.std(scores)])
    
#storing in a pandas datframe
alpha_df = pd.DataFrame(alpha_loop_values,columns = ['alpha','mean_Score','std_score'])

#plot df
alpha_df.plot(x = 'alpha',y = 'mean_Score',marker = 'o',linestyle = '--')

In [None]:
print('alpha values with cv score > .90')
alpha_df[alpha_df['mean_Score'] > .9 ].sort_values(by = 'mean_Score',ascending = False)

In [None]:
# Take the alpha with the highest mean cross-validation accuracy from alpha_df
# instead of hard-coding a value read off one particular run.
# idxmax() returns the first (smallest-alpha) row when several alphas tie.
ideal_alpha = alpha_df.loc[alpha_df['mean_Score'].idxmax(), 'alpha']

In [None]:
clf_dt_prune = DecisionTreeClassifier(random_state=42,ccp_alpha = ideal_alpha)
clf_dt_prune.fit(X_train, y_train)

In [None]:
fig,ax = plt.subplots(figsize = (25,9))
# Draw on the axes created above. plot_tree returns its annotation artists,
# which are not needed, so `ax` is no longer overwritten with them; the trailing
# semicolon suppresses their repr in the cell output.
plot_tree(
    clf_dt_prune,
    filled = True,
    rounded = True,
    # same class labels as the plot of the unpruned tree
    class_names = ['Benign',"Malignant"],
    feature_names = var_cols,
    ax = ax
);

In [None]:
print(f"The score on the Test-dataset is {clf_dt_prune.score(X_test, y_test)}")
print(f"The score on the Train-dataset is {clf_dt_prune.score(X_train, y_train)}")

In [None]:
test_pred = clf_dt_prune.predict(X_test)
# ROC AUC must be computed from a continuous score, not thresholded 0/1 labels;
# use the predicted probability of the positive class (label 1, second column).
test_proba = clf_dt_prune.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, test_proba)
plot_condution_metrics(y_test,test_pred)

In [None]:
model_performance['Decision Tree'] = clf_dt_prune.score(X_test, y_test)
auc_socre['Decision Tree'] = auc

##### Visualizing the model
<br>

In [None]:
model_performance

In [None]:
auc_socre

In [None]:
acc_df = pd.DataFrame.from_dict(model_performance,orient = 'index',columns = ['Test Accuracy'])
auc_df = pd.DataFrame.from_dict(auc_socre,orient = 'index',columns = ['Test AUC'])
model_df = acc_df.join(auc_df)
model_df = model_df.sort_values(by ='Test AUC',ascending = False)
model_df

In [None]:
# Select the style before creating the figure so it applies to the whole chart
# (the style stays active for the figures created afterwards).
plt.style.use('fivethirtyeight')
fig,ax = plt.subplots(figsize = (9,4.5))
# draw on the axes created above instead of relying on the implicit current axes
sns.barplot(x="Test Accuracy", y=model_df.index, data=model_df,color = '#1ecfd9',ax = ax)
# Annotate every single bar with its value, placed just past the end of the bar
for p in ax.patches:
    ax.annotate("%.4f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
            xytext=(5, 0), textcoords='offset points', ha="left", va="center")
sns.despine()

In [None]:
fig,ax = plt.subplots(figsize = (9,4.5))
ax = sns.barplot(x="Test AUC", y=model_df.index, data=model_df,color = '#fc6423')
# Annotate every single Bar with its value, based on it's width           
for p in ax.patches:
    ax.annotate("%.4f" % p.get_width(), xy=(p.get_width(), p.get_y()+p.get_height()/2),
            xytext=(5, 0), textcoords='offset points', ha="left", va="center")
sns.despine() 