### Breast Cancer Detection from Tissue Cell Diagnostics

In [86]:
# Suppressing Warnings
# NOTE(review): the 'ignore' filter silences every warning category for the
# rest of the session, including deprecation notices raised by libraries.
import warnings
warnings.filterwarnings('ignore')

In [87]:
# Importing Pandas and NumPy
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [89]:
# Load the breast cancer diagnostics CSV and preview the first five records,
# transposed so that each feature is displayed on its own row
BCdata = pd.read_csv('../input/breastcancerdata.csv')
BCdata.head().T

Unnamed: 0,0,1,2,3,4
id,842302,842517,84300903,84348301,84358402
diagnosis,M,M,M,M,M
radius_mean,17.99,20.57,19.69,11.42,20.29
texture_mean,10.38,17.77,21.25,20.38,14.34
perimeter_mean,122.8,132.9,130,77.58,135.1
area_mean,1001,1326,1203,386.1,1297
smoothness_mean,0.1184,0.08474,0.1096,0.1425,0.1003
compactness_mean,0.2776,0.07864,0.1599,0.2839,0.1328
concavity_mean,0.3001,0.0869,0.1974,0.2414,0.198
concave points_mean,0.1471,0.07017,0.1279,0.1052,0.1043


In [None]:
# Dimensions of the dataframe as (rows, columns)
print(BCdata.shape)
# Column dtypes and non-null counts. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called directly: wrapping it
# in print() would append a stray "None" line to the output.
BCdata.info()

In [None]:
# Count of missing values in each column
BCdata.isna().sum()

In [None]:
# Count of missing values in each row
BCdata.isna().sum(axis='columns')

In [None]:
# Number of rows that exactly repeat an earlier row
print(BCdata.duplicated().sum())
# Drop the redundant copies while keeping the first occurrence of each record.
# (keep=False would discard *every* copy, losing the observation entirely.)
BCdata = BCdata.drop_duplicates(keep='first')
# Sanity check: no duplicated rows should remain
print(BCdata.duplicated().sum())

In [None]:
# Dropping the null-valued column "Unnamed: 32".
# errors='ignore' makes the cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
BCdata = BCdata.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Summary statistics for the numeric features, including upper-tail
# percentiles (90/95/99) to help spot outliers
outlier_percentiles = [.25, .5, .75, .90, .95, .99]
BCdata.describe(percentiles=outlier_percentiles).T

In [None]:
# Correlation matrix of the numeric features.
# 'diagnosis' still holds the string labels ('M'/'B') at this point, so the
# frame is restricted to numeric columns explicitly: recent pandas releases
# raise on non-numeric columns in corr() instead of silently dropping them.
cor = BCdata.select_dtypes(include='number').corr()
cor

In [None]:
# Annotated heatmap of the correlation matrix computed above
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(cor, cmap="YlGnBu", annot=True, ax=ax)
plt.show()

In [None]:
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Pairwise scatter plots for the numeric columns of the dataframe.
# sns.pairplot is a figure-level function that builds its own figure, so the
# plt.figure(...) call that used to precede it only produced an extra,
# empty figure and has been removed.
# NOTE(review): the grid holds one panel per pair of numeric columns, which
# can be slow to render on a frame this wide.
sns.pairplot(BCdata)
plt.show()

In [None]:
# Columns holding binary M/B labels that are converted to 1/0 below
varlist =  ['diagnosis']

# Defining the map function
def binary_map(x):
    """Encode diagnosis labels as integers: 'M' -> 1 and 'B' -> 0.

    ``x`` must expose a pandas-style ``.map``; values that are neither 'M'
    nor 'B' have no entry in the mapping and come back as NaN.
    """
    label_codes = {'M': 1, 'B': 0}
    return x.map(label_codes)

# Applying the mapping to every column listed in varlist
# NOTE(review): re-running this cell feeds the already-encoded 1/0 values
# through the M/B mapping again — confirm the cell is only executed once.
BCdata[varlist] = BCdata[varlist].apply(binary_map)

In [None]:
from sklearn.model_selection import train_test_split
# Response variable y: the encoded diagnosis column
y = BCdata['diagnosis']
# Feature matrix X: every remaining column of BCdata
# NOTE(review): X still contains the 'id' column, which is scaled together
# with the measurements further down — confirm it is meant to be a predictor.
X = BCdata.drop(columns=['diagnosis'])
print(y.head())

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Names of the feature columns in X
X.columns

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Columns to standardise (declared once instead of repeating the list on
# both sides of the assignment)
train_scale_cols = [
    'id', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
    'fractal_dimension_se', 'radius_worst', 'texture_worst',
    'perimeter_worst', 'area_worst', 'smoothness_worst',
    'compactness_worst', 'concavity_worst', 'concave points_worst',
    'symmetry_worst', 'fractal_dimension_worst',
]

# Fit the scaler on the training split and overwrite those columns with
# their standardised values
X_train[train_scale_cols] = scaler.fit_transform(X_train[train_scale_cols])

# Verify the scaled data in the X_train dataframe
X_train.describe()

In [None]:
### Before building the logistic regression model, check what percentage of
### the original data is diagnosed as malignant (encoded as 1)
diagnosis = BCdata['diagnosis']
DiagnosisRate = round((sum(diagnosis) / len(diagnosis.index)) * 100, 2)
DiagnosisRate

In [None]:
import statsmodels.api as sm
# Baseline logistic regression: a binomial GLM on all training features,
# with an intercept column added to the design matrix
logm1 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
logm1.fit().summary()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

logreg = LogisticRegression()
# Recursive feature elimination keeping the 10 top-ranked features.
# n_features_to_select is passed by keyword: scikit-learn made it
# keyword-only, so the positional form RFE(logreg, 10) fails on current
# releases. The keyword form works on older releases as well.
rfe = RFE(logreg, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Pair every feature with its RFE selection flag and its elimination rank
[
    (feature, is_selected, rank)
    for feature, is_selected, rank in zip(X_train.columns, rfe.support_, rfe.ranking_)
]

In [None]:
# Names of the features retained by RFE (boolean mask rfe.support_)
col = X_train.columns[rfe.support_]

In [None]:
# Names of the features that RFE did not select (inverted mask)
X_train.columns[~rfe.support_]

In [None]:
# Refit the binomial GLM on the RFE-selected features plus an intercept
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(
    endog=y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res = logm2.fit()
res.summary()

In [None]:
#### Check for the VIF values of the feature variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor for every currently selected feature,
# listed from the most to the least collinear
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [
        variance_inflation_factor(selected.values, pos)
        for pos in range(selected.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
## Removing 'radius_worst': features whose VIF exceeds 5 indicate high
## multicollinearity and are removed one at a time before refitting.
# Index.drop takes (labels, errors); the stray positional 1 that used to
# follow the label was being passed as `errors`, not as an axis.
col = col.drop('radius_worst')
col

In [None]:
#,'perimeter_mean','perimeter_worst','area_mean','area_worst','radius_se','perimeter_se'

In [None]:
# Refit the binomial GLM on the remaining selected features plus an intercept
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(
    endog=y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res = logm3.fit()
res.summary()

In [None]:
## VIF again, for the reduced feature set (most collinear first)
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [
        variance_inflation_factor(selected.values, pos)
        for pos in range(selected.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
## Removing 'perimeter_worst': features whose VIF exceeds 5 indicate high
## multicollinearity and are removed one at a time before refitting.
# Index.drop takes (labels, errors); the stray positional 1 that used to
# follow the label was being passed as `errors`, not as an axis.
col = col.drop('perimeter_worst')
col

In [None]:
# Refit the binomial GLM on the remaining selected features plus an intercept
X_train_sm = sm.add_constant(X_train[col])
logm4 = sm.GLM(
    endog=y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res = logm4.fit()
res.summary()

In [None]:
## VIF again, for the reduced feature set (most collinear first)
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [
        variance_inflation_factor(selected.values, pos)
        for pos in range(selected.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
## Removing 'area_se': features whose VIF exceeds 5 indicate high
## multicollinearity and are removed one at a time before refitting.
# Index.drop takes (labels, errors); the stray positional 1 that used to
# follow the label was being passed as `errors`, not as an axis.
col = col.drop('area_se')
col

In [None]:
# Refit the binomial GLM on the remaining selected features plus an intercept
X_train_sm = sm.add_constant(X_train[col])
logm5 = sm.GLM(
    endog=y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res = logm5.fit()
res.summary()

In [None]:
## VIF again, for the reduced feature set (most collinear first)
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [
        variance_inflation_factor(selected.values, pos)
        for pos in range(selected.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
## Removing 'concave points_worst': features whose VIF exceeds 5 indicate
## high multicollinearity and are removed one at a time before refitting.
# Index.drop takes (labels, errors); the stray positional 1 that used to
# follow the label was being passed as `errors`, not as an axis.
col = col.drop('concave points_worst')
col

In [None]:
# Refit the binomial GLM on the remaining selected features plus an
# intercept; `res` from this cell is the model used for all predictions below
X_train_sm = sm.add_constant(X_train[col])
logm6 = sm.GLM(
    endog=y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res = logm6.fit()
res.summary()

In [None]:
## VIF again, for the final feature set (most collinear first)
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [
        variance_inflation_factor(selected.values, pos)
        for pos in range(selected.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Predicted values of the most recently fitted model (`res`) on the same
# design matrix it was trained on; the first ten are shown
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
# Flatten the predictions into a 1-D array of raw values
# NOTE(review): this rebinds y_train_pred to the reshaped `.values` array, so
# running the cell a second time would call `.values` on that array —
# confirm the cell is only executed once.
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
# Actual label, predicted probability and the original row label of y_train
# (stored as PatientID), side by side
y_train_pred_final = pd.DataFrame(
    {'Diagnosis': y_train.values, 'Diagnosis_Probability': y_train_pred}
).assign(PatientID=y_train.index)
y_train_pred_final.head()

In [None]:
# Label a row as 1 when its predicted probability is strictly above 0.8,
# otherwise 0
y_train_pred_final['predicted'] = (
    y_train_pred_final['Diagnosis_Probability'].gt(0.8).astype('int64')
)

# Preview the result
y_train_pred_final.head()

In [None]:
from sklearn import metrics
# Confusion matrix at the 0.8 cutoff (rows: actual class, columns: predicted)
confusion = metrics.confusion_matrix(
    y_true=y_train_pred_final['Diagnosis'],
    y_pred=y_train_pred_final['predicted'],
)
print(confusion)

In [None]:
# Overall accuracy of the 0.8-cutoff predictions on the training data
print(
    metrics.accuracy_score(
        y_true=y_train_pred_final['Diagnosis'],
        y_pred=y_train_pred_final['predicted'],
    )
)

In [None]:
# Unpack the 2x2 confusion matrix row by row:
# first row  = actual 0 -> (true negatives, false positives)
# second row = actual 1 -> (false negatives, true positives)
(TN, FP), (FN, TP) = confusion

In [None]:
# Sensitivity (true positive rate): TP / (TP + FN)
print("Sensitivity is:")
TP / float(TP+FN)

In [None]:
# Specificity (true negative rate): TN / (TN + FP)
print("Specificity is:")
TN / float(TN+FP)

In [None]:
# False positive rate: FP / (TN + FP), i.e. the share of actual negatives
# (diagnosis 0) that were predicted as positive (diagnosis 1)
print("False Positive Rate is:")
print(FP/ float(TN+FP))

In [None]:
# Positive predictive value (precision): TP / (TP + FP)
print("Positive Predictive value is:")
print (TP / float(TP+FP))

In [None]:
# Negative predictive value: TN / (TN + FN)
print("Negative Predictive value is:")
print (TN / float(TN+ FN))

In [None]:
def draw_roc(actual, probs):
    """Plot the ROC curve for ``actual`` labels against ``probs`` scores.

    The curve is computed with ``metrics.roc_curve`` (keeping every
    threshold) and the area under it with ``metrics.roc_auc_score``; the
    area is shown in the legend. The figure is displayed with ``plt.show()``
    and nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    plt.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# False positive rates, true positive rates and thresholds of the ROC curve
# for the training predictions (no intermediate thresholds dropped)
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Diagnosis, y_train_pred_final.Diagnosis_Probability, drop_intermediate = False )

In [None]:
# Plot the ROC curve for the training predictions
draw_roc(y_train_pred_final.Diagnosis, y_train_pred_final.Diagnosis_Probability)

In [None]:
# One 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9;
# each column is keyed by the cutoff value itself
numbers = [float(step) / 10 for step in range(10)]
train_probs = y_train_pred_final['Diagnosis_Probability']
for i in numbers:
    y_train_pred_final[i] = train_probs.gt(i).astype('int64')
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity at each probability cutoff
cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])
from sklearn.metrics import confusion_matrix

# Confusion-matrix layout used below:
#   [0,0] true negatives    [0,1] false positives
#   [1,0] false negatives   [1,1] true positives

num = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final['Diagnosis'], y_train_pred_final[i])
    true_neg, false_pos, false_neg, true_pos = cm1.ravel()
    total1 = cm1.sum()
    accuracy = (true_neg + true_pos) / total1

    speci = true_neg / (true_neg + false_pos)
    sensi = true_pos / (false_neg + true_pos)
    # One row per cutoff, indexed by the cutoff value
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Accuracy, sensitivity and specificity plotted against the probability cutoff
cutoff_df.plot(kind='line', x='prob', y=['accuracy', 'sensi', 'speci'])
plt.show()

In [None]:
# Final labels: 1 when the predicted probability is strictly above the
# chosen 0.35 cutoff, otherwise 0
y_train_pred_final['final_predicted'] = (
    y_train_pred_final['Diagnosis_Probability'].gt(0.35).astype('int64')
)
y_train_pred_final.head()

In [None]:
# Overall accuracy of the 0.35-cutoff predictions on the training data
metrics.accuracy_score(
    y_true=y_train_pred_final['Diagnosis'],
    y_pred=y_train_pred_final['final_predicted'],
)

In [None]:
# Confusion matrix at the 0.35 cutoff (rows: actual class, columns: predicted)
confusion2 = metrics.confusion_matrix(
    y_true=y_train_pred_final['Diagnosis'],
    y_pred=y_train_pred_final['final_predicted'],
)
confusion2

In [None]:
# Unpack the 2x2 confusion matrix of the 0.35 cutoff row by row:
# first row  = actual 0 -> (true negatives, false positives)
# second row = actual 1 -> (false negatives, true positives)
(TN, FP), (FN, TP) = confusion2

In [None]:
# Sensitivity (true positive rate) at the 0.35 cutoff: TP / (TP + FN)
TP / float(TP+FN)

In [None]:
# Specificity (true negative rate): TN / (TN + FP), i.e. the share of actual
# negatives (diagnosis 0) that the model also predicts as negative
TN / float(TN+FP)

In [None]:
# False positive rate: FP / (TN + FP), i.e. the share of actual negatives
# (diagnosis 0) that were predicted as positive (diagnosis 1)
print(FP/ float(TN+FP))

In [None]:
# Positive predictive value (precision): TP / (TP + FP)
print (TP / float(TP+FP))

In [None]:
# Negative predictive value: TN / (TN + FN)
print (TN / float(TN+ FN))

In [None]:
#Precision = TP / (FP + TP)
# NOTE(review): this reads `confusion` (built from the 0.8-cutoff 'predicted'
# column), not `confusion2` (0.35 cutoff) — confirm which one is intended.
confusion[1,1]/(confusion[0,1]+confusion[1,1])

In [None]:
#Recall = TP / (FN + TP)
# NOTE(review): this reads `confusion` (built from the 0.8-cutoff 'predicted'
# column), not `confusion2` (0.35 cutoff) — confirm which one is intended.
confusion[1,1]/(confusion[1,0]+confusion[1,1])

In [None]:
## Using sklearn to calculate the precision computed manually above
# Scored on the 'predicted' column, i.e. the 0.8-cutoff labels
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_pred_final.Diagnosis, y_train_pred_final.predicted)

In [None]:
# Recall of the 'predicted' column (0.8-cutoff labels) via sklearn
recall_score(y_train_pred_final.Diagnosis, y_train_pred_final.predicted)

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision and recall of the training predictions at every candidate
# probability threshold. The bare tuple expression that used to sit here
# evaluated two columns and discarded the result, so it was dropped.
# NOTE: this rebinds `thresholds`, which earlier held the ROC thresholds.
p, r, thresholds = precision_recall_curve(y_train_pred_final.Diagnosis, y_train_pred_final.Diagnosis_Probability)

In [None]:
# Precision (green) and recall (red) as functions of the decision threshold.
# The last element of p and r is dropped so both line up with `thresholds`.
# Labels, title and a legend are added so the two curves can be told apart
# when the figure is read on its own.
plt.plot(thresholds, p[:-1], "g-", label="precision")
plt.plot(thresholds, r[:-1], "r-", label="recall")
plt.xlabel("Probability threshold")
plt.ylabel("Score")
plt.title("Precision and recall vs. threshold")
plt.legend(loc="best")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the test features with the scaler that was already fitted on
# the training split (`scaler`). Creating a fresh StandardScaler and calling
# fit_transform on X_test would scale the test set with its own mean and
# variance, so the model would be scored on features that are on a different
# scale from the one it was trained on.
test_scale_cols = [
    'id', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
    'fractal_dimension_se', 'radius_worst', 'texture_worst',
    'perimeter_worst', 'area_worst', 'smoothness_worst',
    'compactness_worst', 'concavity_worst', 'concave points_worst',
    'symmetry_worst', 'fractal_dimension_worst',
]
X_test[test_scale_cols] = scaler.transform(X_test[test_scale_cols])

# Keep only the features used by the final model
X_test = X_test[col]
X_test.head()

In [None]:
# Columns remaining in X_test after restricting it to the selected features
X_test.columns

In [None]:
# Add the intercept column expected by the fitted model
X_test_sm = sm.add_constant(X_test)
# Making predictions on the test set with the most recently fitted model
y_test_pred = res.predict(X_test_sm)
y_test_pred[:10]

In [None]:
# Wrapping the test-set predictions in a DataFrame
# (the resulting column is later renamed from 0 to 'Diagnosis_Probability')
y_pred_1 = pd.DataFrame(y_test_pred)
# Let's see the head
y_pred_1.head()

In [None]:
# Converting y_test (the actual test labels) to a dataframe
y_test_df = pd.DataFrame(y_test)

In [None]:
# Copying the row index into a PatientID column so it survives the index
# reset performed before concatenation
y_test_df['PatientID'] = y_test_df.index

In [None]:
# Discard the existing row labels of both frames so they share a fresh
# 0..n-1 index and can be placed side by side
y_pred_1 = y_pred_1.reset_index(drop=True)
y_test_df = y_test_df.reset_index(drop=True)

In [None]:
# Place the actual labels and the predicted values side by side
y_pred_final = pd.concat([y_test_df, y_pred_1], axis='columns')

In [None]:
# Preview the combined frame of actual labels and predictions
y_pred_final.head()

In [None]:
# Renaming the prediction column (labelled 0) to a descriptive name
y_pred_final= y_pred_final.rename(columns={ 0 : 'Diagnosis_Probability'})

In [None]:
# Rearranging the columns.
# DataFrame.reindex_axis has been removed from pandas; selecting the three
# existing columns in the desired order yields the same frame.
y_pred_final = y_pred_final[['PatientID', 'diagnosis', 'Diagnosis_Probability']]

In [None]:
# Display y_pred_final (the full frame, not just its head)
y_pred_final