In [None]:
# Only needed on Google Colab: uncomment the two lines below to upload the data file
# from your local machine into the Colab environment so the notebook can read it.
#from google.colab import files
#files.upload()

## Loading all the relevant libraries

In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
pd.set_option('display.max_columns',40)

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.graph_objs as go
import plotly.figure_factory as ff


In [None]:
def configure_plotly_browser_state():
  """Emit the RequireJS configuration used to render Plotly figures in a cell.

  Displays an HTML snippet that loads ``require.js`` from the
  ``/static/components/requirejs`` path and maps the ``plotly`` module name to
  the Plotly CDN bundle. The notebook calls it at the top of each cell that
  shows a Plotly figure.

  Returns:
    None. The only effect is the HTML output added to the calling cell.
  """
  # Import the helpers explicitly from the public ``IPython.display`` module:
  # the function then no longer relies on ``display`` being pre-injected into
  # the notebook namespace, nor on the internal ``IPython.core.display`` path.
  from IPython.display import HTML, display

  # NOTE(review): the "plotly-latest" CDN alias may not track the installed
  # plotly version -- confirm before relying on newer trace types.
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

## Data Overview

In [None]:
# Read the breast-cancer CSV into a pandas DataFrame and preview the first rows.
data_path = '../input/breast-cancer-wisconsin-data/data.csv'
df_cancer = pd.read_csv(data_path)
df_cancer.head()

In [None]:
# Overview of the frame: column labels, overall shape, then each dimension.
# end="\n\n" leaves one blank line after every entry except the last.
n_rows, n_cols = df_cancer.shape
print(df_cancer.columns, end="\n\n")
print("Cancer dataset dimensions : {}".format(df_cancer.shape), end="\n\n")
print("Rows:", n_rows, end="\n\n")
print("Columns:", n_cols)

The last column (`Unnamed: 32`) does not contain meaningful data, so we will drop it.

In [None]:
# Remove the 'Unnamed: 32' column, which is not a real feature.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a plain
# drop would raise KeyError on the second execution.
df_cancer = df_cancer.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Show the current column labels, then preview the first five rows.
remaining_columns = df_cancer.columns
print(remaining_columns)
df_cancer.head(n=5)

It is always good practice to look at some **summary statistics (mean, median, percentiles)** of all the variables involved, and pandas has a `describe()` function for exactly this purpose.
We transpose the `describe()` output because we have about 30 columns to look at.
This can also be used to spot outliers without drawing any plot.

In [None]:
# Summary statistics, transposed so each variable is a row.
summary_stats = df_cancer.describe()
summary_stats.transpose()

### Checking for missing values in the data; if there are any, we will impute them

In [None]:
# True if at least one cell anywhere in the frame is null, otherwise False.
has_missing = df_cancer.isnull().any().any()
print(has_missing)

And there are no missing records in the given data.
Well done.
Let's move ahead.

## Exploratory Data Analysis(EDA)

Let's see what we have in our dependent variable(**diagnosis**). Here we are using graph objects of plotly library.

In [None]:
configure_plotly_browser_state()

# Derive the slice labels and colours from the value_counts() index instead of
# hard-coding their order, so every slice is always paired with its own count
# (the hard-coded lists were only right while benign happened to sort first).
# Both the raw ('B'/'M') and the encoded (0/1) forms of the column are handled,
# so the chart is correct regardless of which cells have already run.
class_names = {'B': 'benign', 'M': 'malignant', 0: 'benign', 1: 'malignant'}
class_colors = {'benign': 'green', 'malignant': 'red'}
diagnosis_counts = df_cancer['diagnosis'].value_counts()
pie_labels = [class_names.get(key, str(key)) for key in diagnosis_counts.index]
pie_colors = [class_colors.get(label, 'grey') for label in pie_labels]

trace = go.Pie(labels = pie_labels, values = diagnosis_counts.values,
               textfont=dict(size=10), opacity = 0.7,
               marker=dict(colors=pie_colors,
                           line=dict(color='#000000', width=1.0)))


layout= go.Layout(
        title={
        'text': "Distribution of dependent(diagnosis) variable",
        'y':0.8,
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'})

fig = go.Figure(data = [trace], layout=layout)
fig.show()

The number of benign records is much larger than the number of malignant records.

Mapping the categories of the dependent variable to 1 and 0. We will be predicting whether a record is malignant or not, so it makes sense to tag malignant as 1.

In [None]:
# Encode the target: malignant ('M') -> 1, benign ('B') -> 0.
# Values that are not 'M'/'B' (including already-encoded 0/1) pass through
# unchanged, so re-running this cell does not turn the whole column into NaN,
# which is what a bare .map(dict) does on its second execution.
diagnosis_codes = {'M': 1, 'B': 0}
df_cancer['diagnosis'] = df_cancer['diagnosis'].map(
    lambda label: diagnosis_codes.get(label, label)
)
df_cancer.head()

In [None]:
# Number of records per diagnosis value.
class_counts = df_cancer['diagnosis'].value_counts()
class_counts

In [None]:
# Split the frame by class: rows whose diagnosis equals 0 go to `ben`,
# every other row goes to `mal`.
benign_mask = df_cancer['diagnosis'] == 0
mal = df_cancer[~benign_mask]
print(mal.shape)
ben = df_cancer[benign_mask]
print(ben.shape)
def show_plots(column, bin_size):
    """Overlay the malignant and benign distributions of one column.

    Reads the module-level ``mal`` and ``ben`` frames and shows a Plotly
    distplot (histogram plus KDE curve) titled with the column name.

    Args:
        column: Name of the column to plot; looked up in both ``mal`` and ``ben``.
        bin_size: Histogram bin size, forwarded to ``ff.create_distplot``.
    """
    # Insertion order fixes the pairing: malignant -> red, benign -> green.
    series_by_class = {'Malignant': mal[column], 'Benign': ben[column]}
    palette = ['red', 'green']

    fig = ff.create_distplot(
        list(series_by_class.values()),
        list(series_by_class.keys()),
        colors=palette,
        show_hist=True,
        bin_size=bin_size,
        curve_type='kde',
    )
    fig.update_layout(title=column)
    fig.show()

In [None]:
configure_plotly_browser_state()
# (column, bin size) pairs for the *_mean size measurements.
for feature_name, bin_width in [
    ('radius_mean', .3),
    ('texture_mean', .3),
    ('perimeter_mean', 3),
    ('area_mean', 20),
]:
    show_plots(feature_name, bin_width)


In [None]:
configure_plotly_browser_state()
# (column, bin size) pairs for the *_se size measurements.
for feature_name, bin_width in [
    ('radius_se', 0.1),
    ('texture_se', .1),
    ('perimeter_se', .5),
    ('area_se', 5),
]:
    show_plots(feature_name, bin_width)


In [None]:
configure_plotly_browser_state()
# (column, bin size) pairs for the *_worst size measurements.
for feature_name, bin_width in [
    ('radius_worst', .5),
    ('texture_worst', .5),
    ('perimeter_worst', 5),
    ('area_worst', 15),
]:
    show_plots(feature_name, bin_width)


A heatmap is a good visualization for seeing the correlation among variables. There is no point in feeding highly correlated variables into an ML model: they provide no extra information, and each additional variable adds complexity to the model. We want our model to be as generic and simple as possible.

In [None]:
# Leave 'id' out of the correlation matrix: it is presumably just a record
# identifier (TODO confirm), so its correlations would only add a meaningless
# row and column to the heatmap. errors='ignore' keeps the cell working if the
# column is absent.
corr_matrix = df_cancer.drop(columns=['id'], errors='ignore').corr()
plt.figure(figsize=(25,12))
sns.heatmap(corr_matrix,annot=True)

In [None]:
# area_mean against smoothness_mean, coloured by the diagnosis column.
sns.scatterplot(
    data=df_cancer,
    x='area_mean',
    y='smoothness_mean',
    hue='diagnosis',
)

## Model Building

### Model1. Baseline Model
We will build a model using all the variables present in our data with a random forest classifier. This will be our baseline model, which we will try to beat by using feature selection and also by changing the classifier to XGBoost.

In [None]:
# The predictor names are ten base measurements, each present with a
# '_mean', '_se' and '_worst' suffix. The list is ordered suffix by suffix
# (all *_mean first, then *_se, then *_worst).
measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]
suffixes = ['mean', 'se', 'worst']
features = [
    measurement + '_' + suffix
    for suffix in suffixes
    for measurement in measurements
]
len(features)

#### Splitting the data into train and val to build the model on train and validate it on val data.

In [None]:
# Target column as a Series, selected feature columns as a NumPy array.
y = df_cancer['diagnosis']
X = df_cancer[features].values
for part in (X, y):
    print(part.shape)

In [None]:
# 70/30 train/validation split, stratified on y, with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=22,
)

# Print the feature and target shapes of each part, followed by a spacer.
for heading, feature_part, target_part in (
    ("Shape of train dataset:", X_train, y_train),
    ("Shape of val dataset:", X_val, y_val),
):
    print(heading)
    print(feature_part.shape)
    print(target_part.shape)
    print("\n")

#### Model Building

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 50 trees of depth 1 with a fixed seed.
baseline_params = dict(n_estimators=50, max_depth=1, random_state=0, verbose=0)
model1 = RandomForestClassifier(**baseline_params)
model1.fit(X_train, y_train)

In [None]:
# Baseline predictions for the validation rows.
y_pred1 = model1.predict(X=X_val)

#### Model Evaluation

In [None]:
# Confusion matrix of the baseline on the validation set, drawn as an annotated heatmap.
cnf1 = confusion_matrix(y_true=y_val, y_pred=y_pred1)
sns.heatmap(data=cnf1, annot=True, fmt='g', cmap='summer')

In [None]:
# Accuracy of the stored validation predictions, plus the estimator's own
# score on the training and validation parts.
acc1 = accuracy_score(y_val, y_pred1)
train_score1 = model1.score(X_train, y_train)
val_score1 = model1.score(X_val, y_val)

print("Accuracy: for baseline model is: %0.3f" % acc1)
print("RF train accuracy: %0.3f" % train_score1)
print("RF test accuracy: %0.3f" % val_score1)

In [None]:
# Per-class report for the baseline's validation predictions.
report1 = classification_report(y_val, y_pred1)
print(report1)

In [None]:
# Pair every feature name with the baseline forest's feature_importances_
# value and plot them, largest first.
coef1 = model1.feature_importances_
print(coef1.shape)
print(len(features))
coefs1 = pd.DataFrame({"Features": features}).assign(Coefficients=coef1)
feature_imp1 = coefs1.sort_values('Coefficients', ascending=False)
plt.figure(figsize=(15, 10))
sns.barplot(data=feature_imp1, x='Coefficients', y='Features')

### Model2. Random Forest using Parameter Tuning
We will use grid search to tune the parameters of the random forest.

In [None]:
# Search space for the random-forest grid search.
# 'auto' is intentionally not listed under max_features: for classifiers it was
# only an alias of 'sqrt' (so it repeated that setting's work), and it was
# removed in scikit-learn 1.3, where it makes the search fail.
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'max_depth': [1, 2, 3, 4],
    'min_samples_split': [2, 3, 5],
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# Seed the forest so the selected parameters and the validation score quoted
# below are the same on every run; an unseeded estimator can give a different
# search result each time. The seed matches the baseline model's random_state.
model2 = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid,
    refit=True,
    verbose=0,
    n_jobs=-1,
)
model2.fit(X_train, y_train)

In [None]:
# Show the parameter combination the search selected, then predict the
# validation rows with the fitted search object.
best_params2 = model2.best_params_
print(best_params2)
y_pred2 = model2.predict(X=X_val)

In [None]:
# Confusion matrix of the tuned forest on the validation set, as an annotated heatmap.
cnf2 = confusion_matrix(y_true=y_val, y_pred=y_pred2)
sns.heatmap(data=cnf2, annot=True, cmap='Blues', fmt='g')

In [None]:
# Accuracy of the stored validation predictions, plus the search object's own
# score on the training and validation parts.
acc2 = accuracy_score(y_val, y_pred2)
train_score2 = model2.score(X_train, y_train)
val_score2 = model2.score(X_val, y_val)

print("Accuracy with GridSearch: %0.3f" % acc2)
print("RF train accuracy: %0.3f" % train_score2)
print("RF test accuracy: %0.3f" % val_score2)

In [None]:
# Per-class report for the tuned forest's validation predictions.
report2 = classification_report(y_val, y_pred2)
print(report2)

In [None]:
# Pair every feature name with the best estimator's feature_importances_
# value and plot them, largest first.
best_forest = model2.best_estimator_
coef2 = best_forest.feature_importances_
print(coef2.shape)
print(len(features))
coefs2 = pd.DataFrame({"Features": features}).assign(Coefficients=coef2)
feature_imp2 = coefs2.sort_values('Coefficients', ascending=False)
plt.figure(figsize=(15, 10))
sns.barplot(data=feature_imp2, x='Coefficients', y='Features')

***`We were able to increase our performance from 94% on val data to 98.8% on val data just by introducing parameter tuning using Grid Search CV.`***

### Model3.  Using Feature Selection
Adding the feature selection before feeding all the variables to any model.

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

# Two-step pipeline: an ExtraTrees model picks the features to keep, then a
# random forest is trained on the kept features only.
# Both ensembles are seeded so the set of selected features and the resulting
# scores are reproducible; unseeded, they can change from run to run.
model3 = Pipeline([
  ('feature_selection', SelectFromModel(ExtraTreesClassifier(n_estimators=50, random_state=0))),
  ('classification', RandomForestClassifier(random_state=0))
])
model3.fit(X_train, y_train)

In [None]:
# Validation predictions and their confusion matrix, drawn as an annotated heatmap.
y_pred3 = model3.predict(X=X_val)
cnf3 = confusion_matrix(y_true=y_val, y_pred=y_pred3)
sns.heatmap(data=cnf3, annot=True, fmt='g', cmap='summer')

# Accuracy of the stored predictions, plus the pipeline's own train/validation scores.
acc3 = accuracy_score(y_val, y_pred3)
train_score3 = model3.score(X_train, y_train)
val_score3 = model3.score(X_val, y_val)
print("Accuracy on Model3 is: %0.3f" % acc3)
print("RF train accuracy: %0.3f" % train_score3)
print("RF test accuracy: %0.3f" % val_score3)

In [None]:
# Per-class report for the pipeline's validation predictions.
report3 = classification_report(y_val, y_pred3)
print(report3)

In [None]:
#Feature Importance
f1 = model3.steps[0][1].get_support()
new_f = [features[i] for i,val in enumerate(f1) if val==True]
print(new_f)
coef3 = model3.steps[1][1].feature_importances_
print(coef3.shape)
print(len(new_f))
coefs3 = pd.DataFrame({"Features":new_f,"Coefficients":coef3})
feature_imp3 = coefs3.sort_values(by='Coefficients',ascending=False)
plt.figure(figsize=(15,10))
sns.barplot(y='Features',x='Coefficients',data=feature_imp3)

***`We were able to increase our performance from 94% on val data to 98.2% on val data just by using the feature selection of the ExtraTrees classifier. Here we are using 11 features to train our model, without any parameter tuning, and we are able to match the Model2 performance, where we used all 30 variables and parameter tuning.`***

### Model4. Xgboost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# XGBoost classifier with its default settings, trained on all features.
xgb_defaults = {}
model4 = XGBClassifier(**xgb_defaults)
model4.fit(X=X_train, y=y_train)

In [None]:
# Validation predictions and their confusion matrix, drawn as an annotated heatmap.
y_pred4 = model4.predict(X_val)
cnf4 = confusion_matrix(y_val,y_pred4)
sns.heatmap(cnf4,annot=True,cmap='summer',fmt='g')
acc4 = accuracy_score(y_val,y_pred4)
# Label the output with this model's own name so the numbers cannot be
# mistaken for the Model3 / random-forest results printed earlier.
print("Accuracy on Model4 is: %0.3f"%acc4)
print("XGBoost train accuracy: %0.3f" % model4.score(X_train, y_train))
print("XGBoost test accuracy: %0.3f" % model4.score(X_val, y_val))

In [None]:
# Per-class report for the XGBoost validation predictions.
report4 = classification_report(y_val, y_pred4)
print(report4)

In [None]:
# Pair every feature name with the XGBoost model's feature_importances_
# value and plot them, largest first.
coef4 = model4.feature_importances_
print(coef4.shape)
print(len(features))
coefs4 = pd.DataFrame({"Features": features}).assign(Coefficients=coef4)
feature_imp4 = coefs4.sort_values('Coefficients', ascending=False)
plt.figure(figsize=(15, 10))
sns.barplot(data=feature_imp4, x='Coefficients', y='Features')

***`We were able to increase our performance from 94% on val data (baseline model) to 98.8% on val data just by using XGBoost without any parameter tuning, and we are able to match the Model2 performance, where we used all 30 variables and parameter tuning.
With only around 10 lines of code you can achieve 98.8% accuracy on val data using XGBoost.`***

### Model5. Light GBM

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with its default settings, trained on all features.
lgbm_defaults = {}
model5 = LGBMClassifier(**lgbm_defaults)
model5.fit(X=X_train, y=y_train)

In [None]:
# Validation predictions and their confusion matrix, drawn as an annotated heatmap.
y_pred5 = model5.predict(X_val)
cnf5 = confusion_matrix(y_val,y_pred5)
sns.heatmap(cnf5,annot=True,cmap='summer',fmt='g')
acc5 = accuracy_score(y_val,y_pred5)
print("Accuracy on Model5 is: %0.3f"%acc5)
# Spell the model name the same way in both labels (was "Ligtgbm").
print("LightGBM train accuracy: %0.3f" % model5.score(X_train, y_train))
print("LightGBM test accuracy: %0.3f" % model5.score(X_val, y_val))

In [None]:
# Per-class report for the LightGBM validation predictions.
report5 = classification_report(y_val, y_pred5)
print(report5)

In [None]:
# Pair every feature name with the LightGBM model's feature_importances_
# value and plot them, largest first.
coef5 = model5.feature_importances_
print(coef5.shape)
print(len(features))
coefs5 = pd.DataFrame({"Features": features}).assign(Coefficients=coef5)
feature_imp5 = coefs5.sort_values('Coefficients', ascending=False)
plt.figure(figsize=(15, 10))
sns.barplot(data=feature_imp5, x='Coefficients', y='Features')

***`We were able to increase our performance from 94% on val data (baseline model) to 98.8% on val data just by using LightGBM without any parameter tuning, and we are able to match the Model2 performance, where we used all 30 variables and parameter tuning.
With only around 10 lines of code you can achieve 98.8% accuracy on val data using LightGBM.
The model performance of XGBoost and LightGBM is similar.`***