In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Read the BRCA breast-cancer table from the Kaggle input directory and
# preview its first five rows.
brca_csv_path = '/kaggle/input/breastcancerdataset/BRCA.csv'
df = pd.read_csv(brca_csv_path)
df.head(n=5)

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [3]:
# Display the loaded frame's dimensions as a (rows, columns) tuple.
df.shape

(341, 16)

In [4]:
# Print each column's non-null count and dtype; useful for spotting missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341 entries, 0 to 340
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          334 non-null    object 
 1   Age                 334 non-null    float64
 2   Gender              334 non-null    object 
 3   Protein1            334 non-null    float64
 4   Protein2            334 non-null    float64
 5   Protein3            334 non-null    float64
 6   Protein4            334 non-null    float64
 7   Tumour_Stage        334 non-null    object 
 8   Histology           334 non-null    object 
 9   ER status           334 non-null    object 
 10  PR status           334 non-null    object 
 11  HER2 status         334 non-null    object 
 12  Surgery_type        334 non-null    object 
 13  Date_of_Surgery     334 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      321 non-null    object 
dtypes: float64(5), object(11)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Protein1,Protein2,Protein3,Protein4
count,334.0,334.0,334.0,334.0,334.0
mean,58.886228,-0.029991,0.946896,-0.090204,0.009819
std,12.961212,0.563588,0.911637,0.585175,0.629055
min,29.0,-2.3409,-0.97873,-1.6274,-2.0255
25%,49.0,-0.358888,0.362173,-0.513748,-0.37709
50%,58.0,0.006129,0.992805,-0.17318,0.041768
75%,68.0,0.343598,1.6279,0.278353,0.42563
max,90.0,1.5936,3.4022,2.1934,1.6299


In [6]:
# Tidy the column labels: strip any 'catg_' substring, then turn underscores
# into spaces. This relabels `df` itself, so later cells must use the spaced names.
df.columns = df.columns.str.replace('catg_', '').str.replace('_', ' ')
df.head(5)

Unnamed: 0,Patient ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour Stage,Histology,ER status,PR status,HER2 status,Surgery type,Date of Surgery,Date of Last Visit,Patient Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [7]:
# Build a 0/1 "transaction" table for frequent-pattern mining: one row per
# distinct Protein4 value, one indicator column per remaining numeric column.
#
# numeric_only=True is passed explicitly. Older pandas silently dropped the
# text columns (Gender, Histology, ...) from a groupby sum; pandas >= 2.0 keeps
# them, and the `> 0` test below would then be applied to strings.
static_data = (
    df.drop('Age', axis=1)
      .groupby('Protein4')
      .sum(numeric_only=True)
      .reset_index()
)

# Flag each summed value as 1 when strictly positive, else 0. The first column
# is the Protein4 group key and is deliberately left untouched.
indicator_cols = static_data.columns[1:]
static_data[indicator_cols] = (static_data[indicator_cols] > 0).astype(int)

static_data.head(5)

Unnamed: 0,Protein4,Protein1,Protein2,Protein3
0,-2.0255,1,1,1
1,-1.8993,0,1,0
2,-1.7684,1,1,0
3,-1.7127,1,1,1
4,-1.6411,0,1,1


In [8]:
# Mine frequent itemsets over the indicator columns (the first column is the
# Protein4 group key, so it is skipped). mlxtend expects boolean input and warns
# on other dtypes, hence the cast; the 0/1 values map to False/True unchanged.
patterns = fpgrowth(
    static_data.iloc[:, 1:].astype(bool),
    min_support=0.00002,
    use_colnames=True,
)

# Ten itemsets with the highest support, with `support` rescaled from a fraction
# of rows to an absolute row count. .assign() builds a new frame, so `patterns`
# keeps its fractional supports and no write happens on a slice (the original
# attribute-style assignment triggered SettingWithCopyWarning).
frequent_sets = (
    patterns.sort_values('support', ascending=False)
            .head(10)
            .assign(support=lambda top: top['support'] * len(static_data))
)
frequent_sets



Unnamed: 0,support,itemsets
0,273.0,(Protein2)
1,170.0,(Protein1)
3,145.0,"(Protein1, Protein2)"
2,132.0,(Protein3)
4,85.0,"(Protein3, Protein2)"
5,66.0,"(Protein1, Protein3)"
6,47.0,"(Protein1, Protein3, Protein2)"


In [9]:
# Derive association rules from the mined itemsets, filtering on the confidence
# metric with a minimum threshold of 0.1, then display the resulting table.
rules = association_rules(
    patterns,
    metric='confidence',
    min_threshold=0.1,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Protein1),(Protein2),0.510511,0.81982,0.435435,0.852941,1.040401,0.016909,1.225225
1,(Protein2),(Protein1),0.81982,0.510511,0.435435,0.531136,1.040401,0.016909,1.043989
2,(Protein3),(Protein2),0.396396,0.81982,0.255255,0.643939,0.785465,-0.069718,0.506038
3,(Protein2),(Protein3),0.81982,0.396396,0.255255,0.311355,0.785465,-0.069718,0.876509
4,(Protein1),(Protein3),0.510511,0.396396,0.198198,0.388235,0.979412,-0.004166,0.98666
5,(Protein3),(Protein1),0.396396,0.510511,0.198198,0.5,0.979412,-0.004166,0.978979
6,"(Protein1, Protein3)",(Protein2),0.198198,0.81982,0.141141,0.712121,0.868631,-0.021346,0.625889
7,"(Protein1, Protein2)",(Protein3),0.435435,0.396396,0.141141,0.324138,0.817712,-0.031464,0.893087
8,"(Protein3, Protein2)",(Protein1),0.255255,0.510511,0.141141,0.552941,1.083114,0.010831,1.094911
9,(Protein1),"(Protein3, Protein2)",0.510511,0.255255,0.141141,0.276471,1.083114,0.010831,1.029322
