In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Backward stepwise
* Backward stepwise selection (or backward elimination) is a variable selection method which:
* Begins with a model that contains all variables under consideration (called the Full Model)
* Then starts removing the least significant variables one after the other
* Until a pre-specified stopping rule is reached or until no variable is left in the model
* From Full model we eliminate the least important features.

# Determine the least significant variable to remove at each step
The least significant variable is a variable that:
* Has the highest p-value in the model, or
* Its elimination from the model causes the lowest drop in $R^2$, or
* Its elimination from the model causes the lowest increase in RSS (Residual Sum of Squares) compared to other predictors.
# Choose a stopping rule
The stopping rule is satisfied when all remaining variables in the model have a p-value smaller than some pre-specified threshold.
When we reach this state, backward elimination will terminate and return the current step’s model.
# Where backward stepwise is better
* Starting with the full model has the advantage of considering the effects of all variables simultaneously.
* This is especially important in case of collinearity (when variables in a model are correlated with each other), because backward stepwise may be forced to keep them all in the model, unlike forward selection, where none of them might be entered.
* Unless the number of candidate variables > sample size (or number of events), use a backward stepwise approach.

> https://quantifyinghealth.com/stepwise-selection/

# Import Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pandas import plotting

#plotly 
import plotly.offline as py
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
from plotly import tools
init_notebook_mode(connected=True)
import plotly.figure_factory as ff
import plotly.express as px

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.metrics import confusion_matrix,classification_report,precision_score
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.formula.api as smf

sns.set(style="whitegrid")

In [None]:
# Load the breast-cancer-wisconsin CSV from the Kaggle input directory.
df=pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
# Preview the first rows to check how the file was parsed.
df.head()

In [None]:
# Remove the 'Unnamed: 32' column (presumably an empty artefact of the CSV export —
# verify). errors='ignore' keeps the cell re-runnable: a second execution is a
# no-op instead of raising KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# List the remaining column names (and, implicitly, their order).
df.columns

In [None]:
# Encode the target as integers: 'M' -> 1 and 'B' -> 0.
diagnosis={'M':1, 'B':0}
# Only map while the column still holds the raw labels, so re-running the cell
# does not fail with a KeyError on the already-encoded 0/1 values.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    unexpected = set(df['diagnosis'].unique()) - set(diagnosis)
    if unexpected:
        raise ValueError("Unexpected diagnosis labels: {}".format(sorted(map(str, unexpected))))
    df['diagnosis'] = df['diagnosis'].map(diagnosis)

In [None]:
from scipy import stats


In [None]:
# Pearson correlation coefficient and p-value between radius_mean and the encoded diagnosis.
r, p = stats.pearsonr(df['radius_mean'], df['diagnosis'])
print(r, p, sep='\n')

In [None]:
# Pearson r and p-value of every numeric column against the encoded diagnosis.
# The rows are collected first and the frame is built in one go, instead of
# growing a DataFrame one .loc assignment at a time.
cor_rows = {}
for name in df.columns:
    if pd.api.types.is_numeric_dtype(df[name]):
        r, p = stats.pearsonr(df['diagnosis'], df[name])
        cor_rows[name] = (r, p)
cor_df = pd.DataFrame.from_dict(cor_rows, orient='index', columns=['r', 'p-value'])
cor_df

In [None]:
# Largest p-values first: the columns least associated with diagnosis appear at the top.
cor_df.sort_values(by=['p-value'], ascending=False)

In [None]:
# Features removed by hand (presumably the weakest ones in the p-value ranking — verify).
col=['symmetry_se', 'texture_se']
# The target must not be part of the feature matrix: leaving 'diagnosis' in X hands
# every model the answer. 'id' is an identifier, not a measurement, so it goes too.
X=df.drop(columns=col + ['diagnosis']).drop(columns=['id'], errors='ignore')
y=df['diagnosis']
# 70/30 split with a fixed seed; the test part is named x_test (lower-case) in this section.
X_train, x_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=10)

# Logistic Regression

In [None]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# max_iter is raised well above the library default: on unscaled features the
# solver tends to stop with a ConvergenceWarning before the coefficients settle.
lr=LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train)

In [None]:
# Hard class predictions of the logistic-regression model on the training split.
y_pred=lr.predict(X_train)

In [None]:
# Threshold metrics use the hard class predictions. Average precision, log loss and
# ROC-AUC need the predicted probability of class 1; fed with 0/1 labels they
# collapse to a single operating point and are not meaningful.
train_prob = lr.predict_proba(X_train)[:, 1]
print("Accuracy Score:-", metrics.accuracy_score(y_train, y_pred))
print("F1 Score:-", metrics.f1_score(y_train, y_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_train, train_prob))
print("Log Loss:-", metrics.log_loss(y_train, train_prob))
print("Precision Score:-", metrics.precision_score(y_train, y_pred))
print("Recall Score:-", metrics.recall_score(y_train, y_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_train, train_prob))

In [None]:
# Hard class predictions of the logistic-regression model on the held-out test split.
y_test_pred=lr.predict(x_test)

In [None]:
# Test-set metrics for logistic regression. Probability-based metrics (average
# precision, log loss, ROC-AUC) are computed from P(class 1), not from 0/1 labels.
test_prob = lr.predict_proba(x_test)[:, 1]
lr_acc=metrics.accuracy_score(y_test, y_test_pred)
print("Accuracy Score:-",lr_acc)
print("F1 Score:-", metrics.f1_score(y_test, y_test_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_test, test_prob))
print("Log Loss:-", metrics.log_loss(y_test, test_prob))
print("Precision Score:-", metrics.precision_score(y_test, y_test_pred))
print("Recall Score:-", metrics.recall_score(y_test, y_test_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_test, test_prob))

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
cfm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(cfm, display_labels=lr.classes_)
disp.plot()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the logistic-regression test predictions.
print(classification_report(y_test, y_test_pred))

In [None]:
from sklearn.metrics import roc_curve

# Predicted probability of class 1 for every test sample.
y_test_pred_prob = lr.predict_proba(x_test)[:, 1]
# ROC-AUC computed from those probabilities (displayed as the cell output).
metrics.roc_auc_score(y_test, y_test_pred_prob)


In [None]:
# ROC curve of the logistic-regression model on the test split.
fpr, tpr,thresholds=roc_curve(y_test,y_test_pred_prob)
plt.figure(figsize=(10,10))
plt.plot([0,1],[0,1],'k--', label='No Skill')  # diagonal = random classifier
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel("fpr (False Positive rate)")
plt.ylabel("tpr-(True Positive rate)")
plt.title("ROC_AUC")
plt.legend()  # without a legend the curve labels are never displayed
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve
# Baseline precision of a classifier with no skill = share of positives in the
# evaluated (test) data. len(y==1) counts every row, so the old value was always 1.0.
no_skill=(y_test==1).mean()
y_test_prob=lr.predict_proba(x_test)[:,1]
plt.figure(figsize=(10,8))
plt.plot([0,1],[no_skill, no_skill], label="No Skill")
precision, recall,_ =precision_recall_curve(y_test, y_test_prob)
plt.plot(recall, precision, marker='',label="Logistic Regression")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Recall-Precision Curve")
plt.legend()
plt.show()

# K-NN

In [None]:
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# Test-set misclassification rate of KNN for k = 1..10.
# NOTE(review): k is being chosen on the test split, so the test score reported
# later for the chosen k is optimistic; cross-validating on X_train would avoid that.
k_values = range(1, 11)
error_rate=[]

for i in k_values:
    knn=KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred=knn.predict(x_test)
    error_rate.append(np.mean(pred!=y_test))

plt.figure(figsize=(15,10))
plt.plot(k_values, error_rate,marker='o', markersize=9)
plt.xlabel("k (n_neighbors)")
plt.ylabel("Test error rate")
plt.title("KNN: error rate vs k")
plt.show()

# Conclusion:-
As we can see, the optimum K for KNN is K=3.

In [None]:
# Refit KNN with the k chosen from the error-rate plot (k=3).
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [None]:
# Hard class predictions of the KNN model on the training split.
y_pred=knn.predict(X_train)


In [None]:
# Training metrics for KNN. Average precision, log loss and ROC-AUC are computed
# from the predicted probability of class 1 rather than from 0/1 labels.
train_prob = knn.predict_proba(X_train)[:, 1]
print("Accuracy Score:-", metrics.accuracy_score(y_train, y_pred))
print("F1 Score:-", metrics.f1_score(y_train, y_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_train, train_prob))
print("Log Loss:-", metrics.log_loss(y_train, train_prob))
print("Precision Score:-", metrics.precision_score(y_train, y_pred))
print("Recall Score:-", metrics.recall_score(y_train, y_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_train, train_prob))

# Conclusion: 
* As we can see, in my previous notebook the accuracy was 76%, while in this case the accuracy is 87.43%.
* Removing the two features helps increase the accuracy of the KNN model's predictions on this dataset.
* Log loss has decreased: it was 8.482 and is now 4.33.
* ROC-AUC has improved: it is now 0.846, which is much better than the result in the previous notebook.

In [None]:
# Hard class predictions of the KNN model on the held-out test split.
y_test_pred=knn.predict(x_test)

In [None]:
# Test-set metrics for KNN. Probability-based metrics use P(class 1), not 0/1 labels.
test_prob = knn.predict_proba(x_test)[:, 1]
knn_acc=metrics.accuracy_score(y_test, y_test_pred)
print("Accuracy Score:-",knn_acc)
print("F1 Score:-", metrics.f1_score(y_test, y_test_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_test, test_prob))
print("Log Loss:-", metrics.log_loss(y_test, test_prob))
print("Precision Score:-", metrics.precision_score(y_test, y_test_pred))
print("Recall Score:-", metrics.recall_score(y_test, y_test_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_test, test_prob))

In [None]:
# Confusion matrix of the KNN predictions on the test split.
cfm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(cfm, display_labels=knn.classes_)
disp.plot()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the KNN test predictions.
print(classification_report(y_test, y_test_pred))

In [None]:
from sklearn.metrics import roc_curve

# Predicted probability of class 1 for every test sample, according to KNN.
y_test_pred_prob = knn.predict_proba(x_test)[:, 1]
# ROC-AUC computed from those probabilities (displayed as the cell output).
metrics.roc_auc_score(y_test, y_test_pred_prob)

In [None]:
# ROC curve of the KNN model on the test split.
fpr, tpr,thresholds=roc_curve(y_test,y_test_pred_prob)
plt.figure(figsize=(10,10))
plt.plot([0,1],[0,1],'k--', label='No Skill')  # diagonal = random classifier
plt.plot(fpr, tpr, label='KNN')  # this section evaluates KNN, not logistic regression
plt.xlabel("fpr (False Positive rate)")
plt.ylabel("tpr-(True Positive rate)")
# The original ended on a bare `plt.title`, which only referenced the function
# and never set a title or rendered the figure explicitly.
plt.title("ROC_AUC")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve
# Baseline precision of a classifier with no skill = share of positives in the
# evaluated (test) data. len(y==1) counts every row, so the old value was always 1.0.
no_skill=(y_test==1).mean()
y_test_prob=knn.predict_proba(x_test)[:,1]
plt.figure(figsize=(10,8))
plt.plot([0,1],[no_skill, no_skill], label="No Skill")
precision, recall,_ =precision_recall_curve(y_test, y_test_prob)
plt.plot(recall, precision, marker='',label="Knn")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Recall-Precision Curve")
plt.legend()
plt.show()

# SVC -Support Vector Classifier

In [None]:
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix; from here on X is the transformed array.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence training. Fitting on the training split only
# (e.g. inside a Pipeline) would avoid that — confirm whether this is intended.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the standardised features with a fixed seed. The test part is
# now named X_test (capitalised), unlike the x_test used in the earlier sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Baseline SVC with default hyperparameters, scored on the test split.
svc = SVC().fit(X_train, y_train)
y_pred = svc.predict(X_test)
print('Accuracy Score:', metrics.accuracy_score(y_test, y_pred), sep='\n')

# Linear kernel

In [None]:
# SVC with a linear kernel, scored on the test split.
svc = SVC(kernel='linear').fit(X_train, y_train)
y_pred = svc.predict(X_test)
print('Accuracy Score:', metrics.accuracy_score(y_test, y_pred), sep='\n')

# RBF kernel

In [None]:
# SVC with an RBF kernel, scored on the test split.
svc = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = svc.predict(X_test)
print('Accuracy Score:', metrics.accuracy_score(y_test, y_pred), sep='\n')

In [None]:
# Test accuracy of an otherwise-default SVC for each kernel type.
kernel = ['linear', 'rbf', 'poly', 'sigmoid']
for kernel_name in kernel:
    svc = SVC(kernel=kernel_name).fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    print("{} :- {} ".format(kernel_name, test_accuracy))

* As we can see, the accuracy score for the linear kernel is very good. Since we are getting such high accuracy,
* it may be due to overfitting with the linear kernel.
* But it is clear that the "rbf" and "sigmoid" kernels also work well and produce 99% accuracy.


# Performing K-fold cross validation with different kernels

# Conclusion:-
* It is quite clear that it is overfitting
* So we are not going to use this kernel
* Look at the other kernels

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of an RBF-kernel SVC: per-fold scores, then their mean.
svc = SVC(kernel="rbf")
scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
print(scores, "-------------------", scores.mean(), sep="\n")

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a polynomial-kernel SVC: per-fold scores, then their mean.
svc = SVC(kernel="poly")
scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
print(scores, "-------------------", scores.mean(), sep="\n")

In [None]:
# Mean 10-fold CV accuracy of an RBF-kernel SVC for C = 1..25.
C_range=list(range(1,26))
acc_score=[]
for c in C_range:
    svc = SVC(kernel='rbf', C=c)
    scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)
# Plot against C_range itself so the x-axis cannot drift from the values evaluated.
plt.plot(C_range, acc_score)
plt.xlabel("C")
plt.ylabel("Mean 10-fold CV accuracy")
plt.title("SVC (rbf kernel): accuracy vs C")
plt.show()

In [None]:
# Mean 10-fold CV accuracy of a polynomial-kernel SVC for C = 1..25.
C_range=list(range(1,26))
acc_score=[]
for c in C_range:
    svc = SVC(kernel='poly', C=c)
    scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)
# Plot against C_range itself so the x-axis cannot drift from the values evaluated.
plt.plot(C_range, acc_score)
plt.xlabel("C")
plt.ylabel("Mean 10-fold CV accuracy")
plt.title("SVC (poly kernel): accuracy vs C")
plt.show()

In [None]:
# Mean 10-fold CV accuracy of a sigmoid-kernel SVC for C = 1..25.
C_range=list(range(1,26))
acc_score=[]
for c in C_range:
    svc = SVC(kernel='sigmoid', C=c)
    scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)
# Plot against C_range itself so the x-axis cannot drift from the values evaluated.
plt.plot(C_range, acc_score)
plt.xlabel("C")
plt.ylabel("Mean 10-fold CV accuracy")
plt.title("SVC (sigmoid kernel): accuracy vs C")
plt.show()

In [None]:
# Mean 10-fold CV accuracy of a polynomial-kernel SVC over a log-spaced grid of C.
C_range=[0.001, 0.01, 0.1, 1.0, 10]
acc_score=[]
for c in C_range:
    svc = SVC(kernel='poly', C=c)
    scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)
plt.plot(C_range, acc_score, marker='o')
plt.xscale('log')  # C spans four orders of magnitude; a linear axis squeezes the small values together
plt.xlabel("C")
plt.ylabel("Mean 10-fold CV accuracy")
plt.title("SVC (poly kernel): accuracy vs C")
plt.show()

In [None]:
# Mean 10-fold CV accuracy of an RBF-kernel SVC over a log-spaced grid of C.
C_range=[0.001, 0.01, 0.1, 1.0, 10]
acc_score=[]
for c in C_range:
    svc = SVC(kernel='rbf', C=c)
    scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)
plt.plot(C_range, acc_score, marker='o')
plt.xscale('log')  # C spans four orders of magnitude; a linear axis squeezes the small values together
plt.xlabel("C")
plt.ylabel("Mean 10-fold CV accuracy")
plt.title("SVC (rbf kernel): accuracy vs C")
plt.show()

In [None]:
# Mean 10-fold CV accuracy of a sigmoid-kernel SVC over a log-spaced grid of C.
C_range=[0.001, 0.01, 0.1, 1.0, 10]
acc_score=[]
for c in C_range:
    svc = SVC(kernel='sigmoid', C=c)
    scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
    acc_score.append(scores.mean())
print(acc_score)
plt.plot(C_range, acc_score, marker='o')
plt.xscale('log')  # C spans four orders of magnitude; a linear axis squeezes the small values together
plt.xlabel("C")
plt.ylabel("Mean 10-fold CV accuracy")
plt.title("SVC (sigmoid kernel): accuracy vs C")
plt.show()

# Conclusion:
* As we can see, as the value of C increases, the accuracy increases irrespective of the kernel.
* C is a hyperparameter of the model.
* As C increases, overfitting occurs.
* As C decreases, underfitting occurs.

In [None]:
# Final SVC: polynomial kernel with C=1.0 (the kernel is set explicitly; it is not the library default).
svc=SVC(kernel='poly', C=1.0)
svc.fit(X_train,y_train)
# Hard class predictions on the training split.
y_pred=svc.predict(X_train)

In [None]:
# Training metrics for the SVC. The model was created without probability=True,
# so the ranking metrics (average precision, ROC-AUC) use the continuous
# decision_function scores instead of 0/1 labels.
train_scores = svc.decision_function(X_train)
print("Accuracy Score:-", metrics.accuracy_score(y_train, y_pred))
print("F1 Score:-", metrics.f1_score(y_train, y_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_train, train_scores))
# NOTE(review): log loss needs probabilities; on hard labels it is only a rescaled
# error rate. Left unchanged — fit SVC(probability=True) to report a real value.
print("Log Loss:-", metrics.log_loss(y_train, y_pred))
print("Precision Score:-", metrics.precision_score(y_train, y_pred))
print("Recall Score:-", metrics.recall_score(y_train, y_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_train, train_scores))

In [None]:
# Hard class predictions of the SVC on the held-out test split.
y_test_pred=svc.predict(X_test)

In [None]:
# Test-set metrics for the SVC. Ranking metrics (average precision, ROC-AUC) use
# the continuous decision_function scores instead of 0/1 labels.
test_scores = svc.decision_function(X_test)
svc_acc=metrics.accuracy_score(y_test, y_test_pred)
print("Accuracy Score:-",svc_acc)
print("F1 Score:-", metrics.f1_score(y_test, y_test_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_test, test_scores))
# NOTE(review): log loss needs probabilities; on hard labels it is only a rescaled
# error rate. Left unchanged — fit SVC(probability=True) to report a real value.
print("Log Loss:-", metrics.log_loss(y_test, y_test_pred))
print("Precision Score:-", metrics.precision_score(y_test, y_test_pred))
print("Recall Score:-", metrics.recall_score(y_test, y_test_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_test, test_scores))

In [None]:
# Confusion matrix of the SVC predictions on the test split.
cfm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(cfm, display_labels=svc.classes_)
disp.plot()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the SVC test predictions.
print(classification_report(y_test, y_test_pred))

In [None]:
from sklearn.metrics import roc_curve

# Score the test samples with the SVC itself. The original called
# lr.predict_proba here, i.e. it evaluated the logistic-regression model from an
# earlier section on the rescaled data. The name is kept for the plotting cell
# that follows, although these are decision scores rather than probabilities.
y_test_pred_prob=svc.decision_function(X_test)
metrics.roc_auc_score(y_test, y_test_pred_prob)

In [None]:
# ROC curve of the SVC on the test split. The scores are computed here from the
# SVC itself so the curve and its label always describe the same model.
svc_scores = svc.decision_function(X_test)
fpr, tpr,thresholds=roc_curve(y_test,svc_scores)
plt.figure(figsize=(10,10))
plt.plot([0,1],[0,1],'k--', label='No Skill')  # diagonal = random classifier
plt.plot(fpr, tpr, label='SVC')
plt.xlabel("fpr (False Positive rate)")
plt.ylabel("tpr-(True Positive rate)")
plt.title("ROC_AUC")
plt.legend()
plt.show()

# Decision Tree Classifiers

In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Rebuild the unscaled feature matrix for the tree models. The target must not be
# part of X (leaving 'diagnosis' in hands the model the answer), and 'id' is an
# identifier rather than a measurement.
X=df.drop(columns=col + ['diagnosis']).drop(columns=['id'], errors='ignore')
y=df['diagnosis']
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=10)

In [None]:
# Depth-limited (max_depth=3) decision tree using the gini criterion, with a fixed random_state.
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)


# fit the model
clf_gini.fit(X_train, y_train)

In [None]:
# Hard class predictions of the gini tree on the training split.
y_pred_gini = clf_gini.predict(X_train)

In [None]:
# Training metrics for the gini tree. Average precision, log loss and ROC-AUC are
# computed from the predicted probability of class 1 rather than from 0/1 labels.
train_prob = clf_gini.predict_proba(X_train)[:, 1]
print("Accuracy Score:-", metrics.accuracy_score(y_train, y_pred_gini))
print("F1 Score:-", metrics.f1_score(y_train, y_pred_gini))
print("Average Precision Score:-", metrics.average_precision_score(y_train, train_prob))
print("Log Loss:-", metrics.log_loss(y_train, train_prob))
print("Precision Score:-", metrics.precision_score(y_train, y_pred_gini))
print("Recall Score:-", metrics.recall_score(y_train, y_pred_gini))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_train, train_prob))

In [None]:
# Hard class predictions of the gini tree on the held-out test split.
y_test_pred= clf_gini.predict(X_test)

In [None]:
# Test-set metrics for the gini tree. Probability-based metrics use P(class 1), not 0/1 labels.
test_prob = clf_gini.predict_proba(X_test)[:, 1]
dc_acc=metrics.accuracy_score(y_test, y_test_pred)
print("Accuracy Score:-",dc_acc)
print("F1 Score:-", metrics.f1_score(y_test, y_test_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_test, test_prob))
print("Log Loss:-", metrics.log_loss(y_test, test_prob))
print("Precision Score:-", metrics.precision_score(y_test, y_test_pred))
print("Recall Score:-", metrics.recall_score(y_test, y_test_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_test, test_prob))

In [None]:
# Confusion matrix of the gini-tree predictions on the test split.
cfm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(cfm, display_labels=clf_gini.classes_)
disp.plot()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the gini-tree test predictions.
print(classification_report(y_test, y_test_pred))

In [None]:
from sklearn.metrics import roc_curve

# Predicted probability of class 1 for every test sample, according to the gini tree.
y_test_pred_prob = clf_gini.predict_proba(X_test)[:, 1]
# ROC-AUC computed from those probabilities (displayed as the cell output).
metrics.roc_auc_score(y_test, y_test_pred_prob)

In [None]:
# ROC curve of the gini decision tree on the test split. The probabilities are
# computed here so the curve and its label always describe the same model.
gini_prob = clf_gini.predict_proba(X_test)[:, 1]
fpr, tpr,thresholds=roc_curve(y_test,gini_prob)
plt.figure(figsize=(10,10))
plt.plot([0,1],[0,1],'k--', label='No Skill')  # diagonal = random classifier
plt.plot(fpr, tpr, label='Decision Tree (gini)')
plt.xlabel("fpr (False Positive rate)")
plt.ylabel("tpr-(True Positive rate)")
plt.title("ROC_AUC")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve
# Baseline precision of a classifier with no skill = share of positives in the
# evaluated (test) data. len(y==1) counts every row, so the old value was always 1.0.
no_skill=(y_test==1).mean()
# Use the decision tree's own probabilities; the original plotted the
# logistic-regression model (lr) under a "Decision Tree" label.
y_test_prob=clf_gini.predict_proba(X_test)[:,1]
plt.figure(figsize=(10,8))
plt.plot([0,1],[no_skill, no_skill], label="No Skill")
precision, recall,_ =precision_recall_curve(y_test, y_test_prob)
plt.plot(recall, precision, marker='',label="Decision Tree")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Recall-Precision Curve")
plt.legend()
plt.show()

In [None]:
# print the scores on training and test set

# Mean accuracy of the gini tree on each split, printed to four decimals.
train_score = clf_gini.score(X_train, y_train)
test_score = clf_gini.score(X_test, y_test)

print('Training set score: {:.4f}'.format(train_score))

print('Test set score: {:.4f}'.format(test_score))

# Visualize decision-trees

In [None]:
from sklearn import tree

plt.figure(figsize=(12,8))

# The tree was fitted in an earlier cell; plot it as-is instead of refitting it
# inside the plotting call. Feature and class names make the nodes readable
# (class 0 = 'B', class 1 = 'M', following the encoding applied to diagnosis).
feature_names = list(X_train.columns) if hasattr(X_train, 'columns') else None
tree.plot_tree(clf_gini, feature_names=feature_names, class_names=['B', 'M'], filled=True)
plt.show()

# Decision Tree Classifier with criterion entropy

In [None]:
# Depth-limited (max_depth=3) decision tree using the entropy criterion, with a fixed random_state.
clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)


# fit the model
clf_en.fit(X_train, y_train)

In [None]:
# Test-set misclassification rate of the entropy tree for max_depth = 1..10.
# The loop variable now drives max_depth; with the depth fixed at 3, every
# iteration refitted the identical tree and the plotted line was flat.
depths = range(1, 11)
error_rate=[]
for i in depths:
    clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=i, random_state=0)

    # fit the model
    clf_en.fit(X_train, y_train)
    pred=clf_en.predict(X_test)
    error_rate.append(np.mean(pred!=y_test))

plt.figure(figsize=(15,10))
plt.plot(depths, error_rate,marker='o', markersize=9)
plt.xlabel("max_depth")
plt.ylabel("Test error rate")
plt.title("Decision tree (entropy): error rate vs max_depth")
plt.show()
    

In [None]:
# Re-create the max_depth=3 entropy tree, because the sweep loop in the previous
# cell rebinds clf_en on every iteration.
clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)


# fit the model
clf_en.fit(X_train, y_train)

In [None]:
# Hard class predictions of the entropy tree on the training split.
y_pred=clf_en.predict(X_train)

In [None]:
# Training metrics for the entropy tree. Average precision, log loss and ROC-AUC are
# computed from the predicted probability of class 1 rather than from 0/1 labels.
train_prob = clf_en.predict_proba(X_train)[:, 1]
print("Accuracy Score:-", metrics.accuracy_score(y_train, y_pred))
print("F1 Score:-", metrics.f1_score(y_train, y_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_train, train_prob))
print("Log Loss:-", metrics.log_loss(y_train, train_prob))
print("Precision Score:-", metrics.precision_score(y_train, y_pred))
print("Recall Score:-", metrics.recall_score(y_train, y_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_train, train_prob))

In [None]:
# Hard class predictions of the entropy tree on the held-out test split.
y_test_pred=clf_en.predict(X_test)


In [None]:
# Test-set metrics for the entropy tree. Probability-based metrics use P(class 1), not 0/1 labels.
test_prob = clf_en.predict_proba(X_test)[:, 1]
dc_en_acc=metrics.accuracy_score(y_test, y_test_pred)
print("Accuracy Score:-",dc_en_acc)
print("F1 Score:-", metrics.f1_score(y_test, y_test_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_test, test_prob))
print("Log Loss:-", metrics.log_loss(y_test, test_prob))
print("Precision Score:-", metrics.precision_score(y_test, y_test_pred))
print("Recall Score:-", metrics.recall_score(y_test, y_test_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_test, test_prob))

In [None]:
# Confusion matrix of the entropy-tree predictions on the test split.
cfm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(cfm, display_labels=clf_en.classes_)
disp.plot()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the entropy-tree test predictions.
print(classification_report(y_test, y_test_pred))

In [None]:
from sklearn.metrics import roc_curve

# Probabilities must come from the entropy tree evaluated in this section; the
# original reused clf_gini and so reported the gini tree's AUC again.
y_test_pred_prob=clf_en.predict_proba(X_test)[:,1]
metrics.roc_auc_score(y_test, y_test_pred_prob)

In [None]:
# ROC curve of the entropy decision tree on the test split. The probabilities are
# computed here from clf_en so the curve matches its label.
entropy_prob = clf_en.predict_proba(X_test)[:, 1]
fpr, tpr,thresholds=roc_curve(y_test,entropy_prob)
plt.figure(figsize=(10,10))
plt.plot([0,1],[0,1],'k--', label='No Skill')  # diagonal = random classifier
plt.plot(fpr, tpr, label='Decision Tree with Entropy')
plt.xlabel("fpr (False Positive rate)")
plt.ylabel("tpr-(True Positive rate)")
plt.title("ROC_AUC")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve
# Baseline precision of a classifier with no skill = share of positives in the
# evaluated (test) data. len(y==1) counts every row, so the old value was always 1.0.
no_skill=(y_test==1).mean()
# Use the entropy tree's own probabilities; the original plotted the
# logistic-regression model (lr) under a "Decision Tree with Entropy" label.
y_test_prob=clf_en.predict_proba(X_test)[:,1]
plt.figure(figsize=(10,8))
plt.plot([0,1],[no_skill, no_skill], label="No Skill")
precision, recall,_ =precision_recall_curve(y_test, y_test_prob)
plt.plot(recall, precision, marker='',label="Decision Tree with Entropy")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Recall-Precision Curve")
plt.legend()
plt.show()

# Random Forest Classifiers

In [None]:
# split data into training and testing sets

from sklearn.model_selection import train_test_split

# New 67/33 split with a different seed (42); this rebinds X_train/X_test/y_train/y_test
# used by the earlier sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [None]:
# import Random Forest classifier

from sklearn.ensemble import RandomForestClassifier



# instantiate the classifier with library defaults and a fixed random_state

rfc = RandomForestClassifier(random_state=0)



# fit the model on the training split

rfc.fit(X_train, y_train)

In [None]:
# Hard class predictions of the random forest on the training split.
y_pred = rfc.predict(X_train)


In [None]:
# Training metrics for the random forest. Average precision, log loss and ROC-AUC are
# computed from the predicted probability of class 1 rather than from 0/1 labels.
train_prob = rfc.predict_proba(X_train)[:, 1]
print("Accuracy Score:-", metrics.accuracy_score(y_train, y_pred))
print("F1 Score:-", metrics.f1_score(y_train, y_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_train, train_prob))
print("Log Loss:-", metrics.log_loss(y_train, train_prob))
print("Precision Score:-", metrics.precision_score(y_train, y_pred))
print("Recall Score:-", metrics.recall_score(y_train, y_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_train, train_prob))

# Conclusion 
* These models are overfitting
* We need to remove more features

# What can we do?
We have two options:
* Remove features that have a correlation higher than 0.9
* Use backward stepwise selection

Next, we compare the correlation between features and remove one of two features that have a correlation higher than 0.9

In [None]:
# Pairwise correlation matrix of the columns of df (pandas' default method).
corr=df.corr()

In [None]:
# Greedy collinearity filter: walk the upper triangle of the correlation matrix
# and, for each pair correlated at 0.9 or more in absolute value, drop the later column.
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    # A column that has already been dropped must not cause further removals,
    # and the target is not a feature, so it never takes part in the comparison.
    if not columns[i] or corr.columns[i] == 'diagnosis':
        continue
    for j in range(i+1, corr.shape[0]):
        if corr.columns[j] == 'diagnosis':
            continue
        # abs(): a strong negative correlation is just as collinear as a positive one.
        if abs(corr.iloc[i,j]) >= 0.9:
            columns[j] = False
# Index with corr.columns (the frame the mask was built from) so the cell can be
# re-run after df has already been narrowed.
selected_columns = corr.columns[columns]
df = df[selected_columns]

* Now the dataset has only those columns with a correlation below 0.9.
* Next, we select columns based on their p-values. We remove the diagnosis column from the predictors because it is the column we are trying to predict.

In [None]:
# importing the statsmodels
import statsmodels.api as sm

# Pick predictors and target by NAME rather than by position. Positional slicing
# (iloc[:, 0] as Y, iloc[:, 1:] as X) silently uses whatever column happens to be
# first; this notebook never moves 'diagnosis' to position 0 (the raw file appears
# to start with an 'id' column — verify), so the target ended up among the
# predictors. Deriving the names from df.columns also keeps the cell re-runnable.
TARGET = 'diagnosis'
feature_columns = np.array([c for c in df.columns if c not in (TARGET, 'id')])


# defining the backward elimination

def backwardElimination(x, Y, sl, columns):
    """Drop predictors one at a time until every OLS p-value is <= ``sl``.

    Parameters
    ----------
    x : 2-D array-like, shape (n_samples, n_features)
        Predictor matrix. No intercept column is added here.
    Y : 1-D array-like
        Response values.
    sl : float
        Significance level; a predictor is kept once its p-value is <= ``sl``.
    columns : array-like
        Names of the columns of ``x``, in the same order.

    Returns
    -------
    (x, columns) : tuple of numpy arrays
        The reduced predictor matrix and the names of the surviving columns.
    """
    x = np.asarray(x)
    columns = np.asarray(columns)
    # Refit after every removal: p-values from the previous fit no longer describe
    # the reduced model, so at most one column is dropped per fit.
    while x.shape[1] > 0:
        pvalues = np.asarray(sm.OLS(Y, x).fit().pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= sl:
            break  # stopping rule reached: every remaining predictor is significant
        x = np.delete(x, worst, axis=1)
        columns = np.delete(columns, worst)
    return x, columns

SL = 0.05
data_modeled, selected_columns = backwardElimination(
    df[list(feature_columns)].values, df[TARGET].values, SL, feature_columns)