In [100]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")


In [101]:
#load data 
data=pd.read_csv("https://raw.githubusercontent.com/amankharwal/Website-data/master/BRCA.csv")
data.head()


Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [102]:
data.isnull().sum()

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64

In [103]:
data.shape

(341, 16)

In [104]:
#drop null value
data=data.dropna()

In [105]:
data.shape

(317, 16)

Data exploration

In [106]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 333
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          317 non-null    object 
 1   Age                 317 non-null    float64
 2   Gender              317 non-null    object 
 3   Protein1            317 non-null    float64
 4   Protein2            317 non-null    float64
 5   Protein3            317 non-null    float64
 6   Protein4            317 non-null    float64
 7   Tumour_Stage        317 non-null    object 
 8   Histology           317 non-null    object 
 9   ER status           317 non-null    object 
 10  PR status           317 non-null    object 
 11  HER2 status         317 non-null    object 
 12  Surgery_type        317 non-null    object 
 13  Date_of_Surgery     317 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      317 non-null    object 
dtypes: float

In [107]:
data['Gender'].value_counts()

FEMALE    313
MALE        4
Name: Gender, dtype: int64

In [108]:
data['Tumour_Stage'].value_counts()

II     180
III     77
I       60
Name: Tumour_Stage, dtype: int64

In [109]:
data['Histology'].value_counts()

Infiltrating Ductal Carcinoma     224
Infiltrating Lobular Carcinoma     81
Mucinous Carcinoma                 12
Name: Histology, dtype: int64

In [110]:
#type of surgeries done on patient
data['Surgery_type'].value_counts()

Other                          97
Modified Radical Mastectomy    89
Lumpectomy                     66
Simple Mastectomy              65
Name: Surgery_type, dtype: int64

In [118]:
data['Patient_Status'].value_counts()

Alive    255
Dead      62
Name: Patient_Status, dtype: int64

In [111]:
data=data.drop (['Patient_ID','Date_of_Surgery','Date_of_Last_Visit'],axis=1)

In [144]:
data["Tumour_Stage"] = data["Tumour_Stage"].map({"I": 1, "II": 2, "III": 3})
data["Histology"] = data["Histology"].map({"Infiltrating Ductal Carcinoma": 1,
"Infiltrating Lobular Carcinoma": 2,"Mucinous Carcinoma": 3})
data["ER status"] = data["ER status"].map({"Positive": 1})
data["PR status"] = data["PR status"].map({"Positive": 1})
data["HER2 status"] = data["HER2 status"].map({"Positive": 1, "Negative": 2})
data["Gender"] = data["Gender"].map({"MALE": 0, "FEMALE": 1})
data["Surgery_type"] = data["Surgery_type"].map({"Other": 1, "Modified Radical Mastectomy": 2,
"Lumpectomy": 3, "Simple Mastectomy": 4})
print(data.head())


    Age  Gender  Protein1  Protein2  Protein3  Protein4  Tumour_Stage  \
0  36.0     NaN  0.080353   0.42638   0.54715  0.273680           NaN   
1  43.0     NaN -0.420320   0.57807   0.61447 -0.031505           NaN   
2  69.0     NaN  0.213980   1.31140  -0.32747 -0.234260           NaN   
3  56.0     NaN  0.345090  -0.21147  -0.19304  0.124270           NaN   
4  56.0     NaN  0.221550   1.90680   0.52045 -0.311990           NaN   

   Histology  ER status  PR status  HER2 status  Surgery_type Patient_Status  
0        NaN        NaN        NaN          NaN           NaN          Alive  
1        NaN        NaN        NaN          NaN           NaN           Dead  
2        NaN        NaN        NaN          NaN           NaN          Alive  
3        NaN        NaN        NaN          NaN           NaN          Alive  
4        NaN        NaN        NaN          NaN           NaN           Dead  


In [None]:
X=data.drop('Patient_Status',axis=1)

In [146]:
X

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type
0,36.0,1,0.080353,0.42638,0.54715,0.273680,3,1,1,1,2,2
1,43.0,1,-0.420320,0.57807,0.61447,-0.031505,2,3,1,1,2,3
2,69.0,1,0.213980,1.31140,-0.32747,-0.234260,3,1,1,1,2,1
3,56.0,1,0.345090,-0.21147,-0.19304,0.124270,2,1,1,1,2,2
4,56.0,1,0.221550,1.90680,0.52045,-0.311990,2,1,1,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
329,36.0,1,0.231800,0.61804,-0.55779,-0.517350,3,1,1,1,1,4
330,44.0,0,0.732720,1.11170,-0.26952,-0.354920,2,2,1,1,2,1
331,61.0,1,-0.719470,2.54850,-0.15024,0.339680,2,1,1,1,2,3
332,79.0,1,0.479400,2.05590,-0.53136,-0.188480,1,1,1,1,1,3


In [147]:
y=data['Patient_Status']

In [148]:
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()
y=label.fit_transform(y)

In [149]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [150]:
#target variable is imbalanced , oversample the dataset
from imblearn.over_sampling import SMOTE
from collections import Counter
# summarize class distribution
print("Before oversampling: ",Counter(y_train))

# define oversampling strategy
SMOTE = SMOTE()

# fit and apply the transform
X_train_SMOTE, y_train_SMOTE = SMOTE.fit_resample(X_train, y_train)

# summarize class distribution
print("After oversampling: ",Counter(y_train_SMOTE))

Before oversampling:  Counter({0: 202, 1: 51})
After oversampling:  Counter({0: 202, 1: 202})


In [151]:
#modelling
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
model=SVC()
clf_SMOTE = model.fit(X_train_SMOTE, y_train_SMOTE)
pred_SMOTE = clf_SMOTE.predict(X_test)

print("ROC AUC score for oversampled SMOTE data: ", roc_auc_score(y_test, pred_SMOTE))

ROC AUC score for oversampled SMOTE data:  0.4142367066895369


In [139]:
X_train_SMOTE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 21 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Age                                       404 non-null    float64
 1   Protein1                                  404 non-null    float64
 2   Protein2                                  404 non-null    float64
 3   Protein3                                  404 non-null    float64
 4   Protein4                                  404 non-null    float64
 5   Gender_FEMALE                             404 non-null    uint8  
 6   Gender_MALE                               404 non-null    uint8  
 7   Tumour_Stage_I                            404 non-null    uint8  
 8   Tumour_Stage_II                           404 non-null    uint8  
 9   Tumour_Stage_III                          404 non-null    uint8  
 10  Histology_Infiltrating Ductal Carcinom

In [152]:
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test,pred_SMOTE)
conf_mat

array([[15, 38],
       [ 5,  6]], dtype=int64)

In [153]:
true_positive = conf_mat[0][0]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
true_negative = conf_mat[1][1]


In [154]:
Accuracy = (true_positive + true_negative) / (true_positive +false_positive+false_negative + true_negative)
Accuracy


0.328125