In [1]:
# Suppressing Warnings
# NOTE(review): called without a category or module filter, this silences every
# warning for the whole session (deprecation notices included) - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# libraries
import numpy as np
import pandas as pd

# For Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#### Reading Cancer data

In [2]:
# Load the risk-factor table. No header row is read from the file (header=None),
# so the 16 column names are supplied explicitly.
# Fields are separated by runs of whitespace: sep=r"\s+" is the documented
# equivalent of delim_whitespace=True, which pandas deprecated in 2.2.
cancer_data = pd.read_csv(
    "risk_rand.txt",
    header=None,
    sep=r"\s+",
    names=["menopaus", "agegrp", "density", "race", "hispanic", "bmi", "agefirst",
           "nrelbc", "brstproc", "lastmamm", "surgmeno", "hrt", "invasive",
           "cancer", "training", "count"],
)
cancer_data.head()

Unnamed: 0,menopaus,agegrp,density,race,hispanic,bmi,agefirst,nrelbc,brstproc,lastmamm,surgmeno,hrt,invasive,cancer,training,count
0,0,1,1,1,0,1,0,0,0,0,9,9,0,0,1,3
1,0,1,1,1,0,1,0,0,0,9,9,9,0,0,0,2
2,0,1,1,1,0,1,0,0,0,9,9,9,0,0,1,3
3,0,1,1,1,0,1,1,0,0,0,9,9,0,0,1,1
4,0,1,1,1,0,1,1,0,1,9,9,9,0,0,1,1


### <span style="color:blue">1. Data Understanding</span>

In [3]:
# Inspecting number of rows and columns of the dataset, shown as a (rows, columns) tuple
cancer_data.shape

(181903, 16)

In [4]:
# Inspecting the number of entries, the non-null count and the dtype of every column
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181903 entries, 0 to 181902
Data columns (total 16 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   menopaus  181903 non-null  int64
 1   agegrp    181903 non-null  int64
 2   density   181903 non-null  int64
 3   race      181903 non-null  int64
 4   hispanic  181903 non-null  int64
 5   bmi       181903 non-null  int64
 6   agefirst  181903 non-null  int64
 7   nrelbc    181903 non-null  int64
 8   brstproc  181903 non-null  int64
 9   lastmamm  181903 non-null  int64
 10  surgmeno  181903 non-null  int64
 11  hrt       181903 non-null  int64
 12  invasive  181903 non-null  int64
 13  cancer    181903 non-null  int64
 14  training  181903 non-null  int64
 15  count     181903 non-null  int64
dtypes: int64(16)
memory usage: 22.2 MB


In [5]:
# Inspecting count of unknown (9) values in dataset:
# this cell only collects the column labels; the per-column value counts are
# printed by the loop in the following cell.
cols=cancer_data.columns
cols

Index(['menopaus', 'agegrp', 'density', 'race', 'hispanic', 'bmi', 'agefirst',
       'nrelbc', 'brstproc', 'lastmamm', 'surgmeno', 'hrt', 'invasive',
       'cancer', 'training', 'count'],
      dtype='object')

In [6]:
# Print one frequency table per column so the share of unknown (9) codes is visible
for col in cols:
    print(cancer_data.loc[:, col].value_counts())

1    140843
0     29542
9     11518
Name: menopaus, dtype: int64
4     30494
3     27018
5     23724
6     20310
7     18537
8     16962
2     14773
9     14037
10     9850
1      6198
Name: agegrp, dtype: int64
2    53027
9    46296
3    46126
1    20333
4    16121
Name: density, dtype: int64
1    103990
9     37367
2     13863
3     13236
5      7103
4      6344
Name: race, dtype: int64
0    103070
9     53376
1     25457
Name: hispanic, dtype: int64
9    73445
1    41009
2    33393
3    20510
4    13546
Name: bmi, dtype: int64
0    72136
9    63669
2    26191
1    19907
Name: agefirst, dtype: int64
0    113235
1     39338
9     24923
2      4407
Name: nrelbc, dtype: int64
0    110986
1     48946
9     21971
Name: brstproc, dtype: int64
0    102485
9     69861
1      9557
Name: lastmamm, dtype: int64
9    83545
0    58780
1    39578
Name: surgmeno, dtype: int64
0    65036
9    64489
1    52378
Name: hrt, dtype: int64
0    176991
1      4912
Name: invasive, dtype: int64
0    175629
1 

Many columns contain the value 9, which denotes an unknown entry.

### <span style="color:blue">2. Data Cleaning</span>

Since we are interested in both types of cancer (invasive and ductal carcinoma in situ), we keep the "cancer" column and drop the "invasive" column.

In [7]:
# Remove the "invasive" column; the "cancer" column is kept as the outcome.
# The frame is reassigned instead of mutated with inplace=True, and
# errors="ignore" makes a second execution of this cell a no-op rather than
# a KeyError on the already-removed column.
cancer_data = cancer_data.drop(columns=["invasive"], errors="ignore")
cancer_data.head(5)

Unnamed: 0,menopaus,agegrp,density,race,hispanic,bmi,agefirst,nrelbc,brstproc,lastmamm,surgmeno,hrt,cancer,training,count
0,0,1,1,1,0,1,0,0,0,0,9,9,0,1,3
1,0,1,1,1,0,1,0,0,0,9,9,9,0,0,2
2,0,1,1,1,0,1,0,0,0,9,9,9,0,1,3
3,0,1,1,1,0,1,1,0,0,0,9,9,0,1,1
4,0,1,1,1,0,1,1,0,1,9,9,9,0,1,1


The "count" column gives the number of patients who share the same combination of values. The patient IDs were removed to keep the patients anonymous, so we need to replicate each row as many times as its "count" value indicates.

In [8]:
# Expand the aggregated table: every row is repeated as many times as its
# count says. "count" is renamed to "ct" first (presumably because
# DataFrame.count is a method, which would shadow attribute-style access).
# NOTE(review): the shape recorded in the next cell still equals the
# un-expanded row count - confirm this cell was executed before the outputs
# were saved.
df = cancer_data.rename(columns={'count': 'ct'})

df = df.loc[np.repeat(df.index.values, df['ct'])]

# The repeated labels are duplicated, so replace them with 0 .. len(df)-1
df = df.set_index(np.arange(0, len(df)))

df.head(10)

In [9]:
# (rows, columns) of the frame after the row-replication step
df.shape

(181903, 15)

In [10]:
# Rows with menopaus code 0 (presumably pre-menopausal - this subset is later
# saved as "data_premeno")
predf = df.loc[df["menopaus"] == 0]
predf.shape

(29542, 15)

In [11]:
# Rows with menopaus code 1 (presumably post-menopausal - this subset is later
# saved as "data_postmeno")
postdf = df.loc[df["menopaus"] == 1]
postdf.shape

(140843, 15)

In [12]:
# menopaus is constant inside this subset, so it is removed
predf = predf.drop(columns=["menopaus"])

In [13]:
# menopaus is constant inside this subset, so it is removed
postdf = postdf.drop(columns=["menopaus"])

In [14]:
# Per row, count how many of these ten risk-factor columns hold the unknown
# code (9). Rows with 7 or more unknowns are dropped in the next step.
predf["total_unknown"] = (
    predf.loc[:, ['density', 'race', 'hispanic', 'bmi', 'agefirst', 'hrt',
                  'nrelbc', 'brstproc', 'lastmamm', 'surgmeno']] == 9
).sum(axis=1)

In [15]:
# Keep only the rows that have fewer than 7 unknown values
predf = predf.loc[predf["total_unknown"] < 7]

In [16]:
# (rows, columns) left after removing rows with 7 or more unknown values
predf.shape

(27820, 15)

In [17]:
# Per row, count how many of these ten risk-factor columns hold the unknown
# code (9) ...
postdf["total_unknown"] = (
    postdf.loc[:, ['density', 'race', 'hispanic', 'bmi', 'agefirst', 'hrt',
                   'nrelbc', 'brstproc', 'lastmamm', 'surgmeno']] == 9
).sum(axis=1)

# ... and keep only the rows that have fewer than 7 of them
postdf = postdf.loc[postdf["total_unknown"] < 7]

postdf.shape

(137243, 15)

Since all the columns have categorical values, they need to be one-hot encoded

In [18]:
# Predictor columns only: everything except the "training" flag, the "cancer"
# target, the "ct" replication count and the helper "total_unknown" counter.
# Their labels are what gets one-hot encoded next.
prefinal = predf.drop(columns=["training", "cancer", "ct", "total_unknown"])
cols = prefinal.columns
cols

Index(['agegrp', 'density', 'race', 'hispanic', 'bmi', 'agefirst', 'nrelbc',
       'brstproc', 'lastmamm', 'surgmeno', 'hrt'],
      dtype='object')

In [19]:
# One-hot encode every predictor. No level is dropped at this point
# (drop_first=False); the "unknown" (9) indicators are removed in the next cell.
# dtype is pinned to uint8 - the get_dummies default before pandas 2.0 - so the
# encoded frame keeps the same 0/1 integer representation on newer pandas,
# where the default became bool.
dummy1 = pd.get_dummies(prefinal, columns=cols, drop_first=False, dtype=np.uint8)
dummy1.head()

Unnamed: 0,agegrp_1,agegrp_2,agegrp_3,agegrp_4,density_1,density_2,density_3,density_4,density_9,race_1,...,nrelbc_2,nrelbc_9,brstproc_0,brstproc_1,brstproc_9,lastmamm_0,lastmamm_1,lastmamm_9,surgmeno_9,hrt_9
0,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,1,1
1,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,1
2,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,1
3,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,1,1
4,1,0,0,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,1,1,1


In [20]:
# Remove the "unknown" (code 9) indicator of each risk factor, plus agegrp_1
# (presumably so that the first age group acts as the reference level).
# agegrp_9 is deliberately not in the list: the value counts show agegrp codes
# 1-10, so 9 looks like a regular age group there - TODO confirm.
dummy1 = dummy1.drop(
    columns=[
        'density_9', 'race_9', 'hispanic_9', 'bmi_9', 'agefirst_9',
        'nrelbc_9', 'brstproc_9', 'lastmamm_9',
        'hrt_9', 'surgmeno_9',
        'agegrp_1',
    ]
)

dummy1.shape

(27820, 28)

In [21]:
# Put the "training" flag and the "cancer" target in front of the encoded
# predictors; the two frames are joined column-wise on their shared index
final1 = pd.concat(
    [predf[["training", "cancer"]], dummy1],
    axis=1,
)
final1.head()

Unnamed: 0,training,cancer,agegrp_2,agegrp_3,agegrp_4,density_1,density_2,density_3,density_4,race_1,...,agefirst_0,agefirst_1,agefirst_2,nrelbc_0,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1
0,1,0,0,0,0,1,0,0,0,1,...,1,0,0,1,0,0,1,0,1,0
1,0,0,0,0,0,1,0,0,0,1,...,1,0,0,1,0,0,1,0,0,0
2,1,0,0,0,0,1,0,0,0,1,...,1,0,0,1,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,1,0,1,0
4,1,0,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,1,0,0


In [22]:
# Column list, non-null counts and dtypes of the assembled frame
final1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27820 entries, 0 to 29498
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   training    27820 non-null  int64
 1   cancer      27820 non-null  int64
 2   agegrp_2    27820 non-null  uint8
 3   agegrp_3    27820 non-null  uint8
 4   agegrp_4    27820 non-null  uint8
 5   density_1   27820 non-null  uint8
 6   density_2   27820 non-null  uint8
 7   density_3   27820 non-null  uint8
 8   density_4   27820 non-null  uint8
 9   race_1      27820 non-null  uint8
 10  race_2      27820 non-null  uint8
 11  race_3      27820 non-null  uint8
 12  race_4      27820 non-null  uint8
 13  race_5      27820 non-null  uint8
 14  hispanic_0  27820 non-null  uint8
 15  hispanic_1  27820 non-null  uint8
 16  bmi_1       27820 non-null  uint8
 17  bmi_2       27820 non-null  uint8
 18  bmi_3       27820 non-null  uint8
 19  bmi_4       27820 non-null  uint8
 20  agefirst_0  27820 non-null  

In [23]:
### Splitting into train and test on the basis of the training column

# training == 1 marks the rows used for fitting; the target is the "cancer" column
train = final1.loc[final1["training"] == 1]
y_train = train.loc[:, "cancer"]
y_train.head(2)

0    0
2    0
Name: cancer, dtype: int64

In [24]:
# Training features: everything except the target and the split flag
X_train = train.drop(columns=["cancer", "training"])
X_train.head(2)

Unnamed: 0,agegrp_2,agegrp_3,agegrp_4,density_1,density_2,density_3,density_4,race_1,race_2,race_3,...,agefirst_0,agefirst_1,agefirst_2,nrelbc_0,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1
0,0,0,0,1,0,0,0,1,0,0,...,1,0,0,1,0,0,1,0,1,0
2,0,0,0,1,0,0,0,1,0,0,...,1,0,0,1,0,0,1,0,0,0


In [25]:
# (rows, columns) of the training feature matrix
X_train.shape

(17723, 28)

In [26]:
# Class balance of the training target (number of rows per cancer value)
y_train.value_counts()

0    17016
1      707
Name: cancer, dtype: int64

In [27]:
# training == 0 marks the held-out rows
test = final1.loc[final1["training"] == 0]
test.shape

(10097, 30)

In [28]:
# Drop only the split flag here; the "cancer" target is still part of X_test
# at this point and is split off in the following cell
X_test = test.drop(columns=["training"])
X_test.head(2)

Unnamed: 0,cancer,agegrp_2,agegrp_3,agegrp_4,density_1,density_2,density_3,density_4,race_1,race_2,...,agefirst_0,agefirst_1,agefirst_2,nrelbc_0,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1
1,0,0,0,0,1,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,0
9,0,0,0,0,1,0,0,0,1,0,...,1,0,0,1,0,0,1,0,1,0


In [29]:
# Split the target off the test frame: pop() returns the "cancer" column and
# removes it from X_test in place, so this cell cannot be run a second time
# without first re-creating X_test.
y_test = X_test.pop("cancer")
y_test.head(2)

1    0
9    0
Name: cancer, dtype: int64

In [30]:
# Class balance of the test target (number of rows per cancer value)
y_test.value_counts()

0    9829
1     268
Name: cancer, dtype: int64

In [31]:
# Percentage of positive (cancer == 1) rows in the training target
int((y_train == 1).sum()) * 100 / len(y_train)

3.9891666196467868

In [32]:
import pickle

# Persist the split for the menopaus == 0 subset. The four objects are written
# back-to-back into a single file, so a reader has to call pickle.load four
# times in this same order: X_train, y_train, X_test, y_test.
# The context manager closes the handle even if one of the dumps raises.
with open("data_premeno", "wb") as file:
    pickle.dump(X_train, file)
    pickle.dump(y_train, file)
    pickle.dump(X_test, file)
    pickle.dump(y_test, file)

In [33]:
# Since all the columns have categorical values, they need to be one-hot encoded

# Predictor columns only: everything except the "training" flag, the "cancer"
# target, the "ct" replication count and the helper "total_unknown" counter
postfinal = postdf.drop(columns=["training", "cancer", "ct", "total_unknown"])
cols = postfinal.columns
cols

Index(['agegrp', 'density', 'race', 'hispanic', 'bmi', 'agefirst', 'nrelbc',
       'brstproc', 'lastmamm', 'surgmeno', 'hrt'],
      dtype='object')

In [34]:
# One-hot encode every predictor. No level is dropped at this point
# (drop_first=False); the "unknown" (9) indicators are removed in the next cell.
# dtype is pinned to uint8 - the get_dummies default before pandas 2.0 - so the
# encoded frame keeps the same 0/1 integer representation on newer pandas,
# where the default became bool.
dummy1 = pd.get_dummies(postfinal, columns=cols, drop_first=False, dtype=np.uint8)
dummy1.head()

Unnamed: 0,agegrp_3,agegrp_4,agegrp_5,agegrp_6,agegrp_7,agegrp_8,agegrp_9,agegrp_10,density_1,density_2,...,brstproc_9,lastmamm_0,lastmamm_1,lastmamm_9,surgmeno_0,surgmeno_1,surgmeno_9,hrt_0,hrt_1,hrt_9
29542,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
29543,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
29544,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,1,0,0
29545,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0
29546,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0


In [35]:
# Remove the "unknown" (code 9) indicator of each risk factor, plus agegrp_3
# (presumably the reference level: it is the lowest age group among the
# encoded columns of this subset).
# agegrp_9 is deliberately kept - it appears to be a regular age group, not an
# unknown marker - TODO confirm.
dummy1 = dummy1.drop(
    columns=[
        'density_9', 'race_9', 'hispanic_9', 'bmi_9', 'agefirst_9',
        'nrelbc_9', 'brstproc_9', 'lastmamm_9',
        'hrt_9', 'surgmeno_9',
        'agegrp_3',
    ]
)

dummy1.shape

(137243, 36)

In [36]:
# Put the "training" flag and the "cancer" target in front of the encoded
# predictors; the two frames are joined column-wise on their shared index
final2 = pd.concat(
    [postdf[["training", "cancer"]], dummy1],
    axis=1,
)
final2.head()

Unnamed: 0,training,cancer,agegrp_4,agegrp_5,agegrp_6,agegrp_7,agegrp_8,agegrp_9,agegrp_10,density_1,...,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1,surgmeno_0,surgmeno_1,hrt_0,hrt_1
29542,1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,1,0,1,0,1,0
29543,1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,1,0,1,0,0,1
29544,1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,1,1,0
29545,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,1,0,1
29546,1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,1,0,1


In [37]:
# Column list, non-null counts and dtypes of the assembled frame
final2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137243 entries, 29542 to 170351
Data columns (total 38 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   training    137243 non-null  int64
 1   cancer      137243 non-null  int64
 2   agegrp_4    137243 non-null  uint8
 3   agegrp_5    137243 non-null  uint8
 4   agegrp_6    137243 non-null  uint8
 5   agegrp_7    137243 non-null  uint8
 6   agegrp_8    137243 non-null  uint8
 7   agegrp_9    137243 non-null  uint8
 8   agegrp_10   137243 non-null  uint8
 9   density_1   137243 non-null  uint8
 10  density_2   137243 non-null  uint8
 11  density_3   137243 non-null  uint8
 12  density_4   137243 non-null  uint8
 13  race_1      137243 non-null  uint8
 14  race_2      137243 non-null  uint8
 15  race_3      137243 non-null  uint8
 16  race_4      137243 non-null  uint8
 17  race_5      137243 non-null  uint8
 18  hispanic_0  137243 non-null  uint8
 19  hispanic_1  137243 non-null  uint8
 20  

In [38]:
### Splitting into train and test on the basis of the training column

# training == 1 marks the rows used for fitting; the target is the "cancer" column
train = final2.loc[final2["training"] == 1]
y_train = train.loc[:, "cancer"]
y_train.head(2)

29542    0
29543    0
Name: cancer, dtype: int64

In [39]:
# Training features: everything except the target and the split flag
X_train = train.drop(columns=["cancer", "training"])
X_train.head(2)

Unnamed: 0,agegrp_4,agegrp_5,agegrp_6,agegrp_7,agegrp_8,agegrp_9,agegrp_10,density_1,density_2,density_3,...,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1,surgmeno_0,surgmeno_1,hrt_0,hrt_1
29542,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,1,0,1,0,1,0
29543,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,1,0,1,0,0,1


In [40]:
# (rows, columns) of the training feature matrix
X_train.shape

(90713, 36)

In [41]:
# Class balance of the training target (number of rows per cancer value)
y_train.value_counts()

0    87280
1     3433
Name: cancer, dtype: int64

In [42]:
# training == 0 marks the held-out rows
test = final2.loc[final2["training"] == 0]
test.shape

(46530, 38)

In [43]:
# Drop only the split flag here; the "cancer" target is still part of X_test
# at this point and is split off in the following cell
X_test = test.drop(columns=["training"])
X_test.head(2)

Unnamed: 0,cancer,agegrp_4,agegrp_5,agegrp_6,agegrp_7,agegrp_8,agegrp_9,agegrp_10,density_1,density_2,...,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1,surgmeno_0,surgmeno_1,hrt_0,hrt_1
29545,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,1,0,1
29550,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0


In [44]:
# Split the target off the test frame: pop() returns the "cancer" column and
# removes it from X_test in place, so this cell cannot be run a second time
# without first re-creating X_test.
y_test = X_test.pop("cancer")
y_test.head(2)

29545    0
29550    0
Name: cancer, dtype: int64

In [45]:
# Class balance of the test target (number of rows per cancer value)
y_test.value_counts()

0    45283
1     1247
Name: cancer, dtype: int64

In [46]:
# Percentage of positive (cancer == 1) rows in the training target
int((y_train == 1).sum()) * 100 / len(y_train)

3.7844630868783966

In [47]:
import pickle

# Persist the split for the menopaus == 1 subset. The four objects are written
# back-to-back into a single file, so a reader has to call pickle.load four
# times in this same order: X_train, y_train, X_test, y_test.
# The context manager closes the handle even if one of the dumps raises.
with open("data_postmeno", "wb") as file:
    pickle.dump(X_train, file)
    pickle.dump(y_train, file)
    pickle.dump(X_test, file)
    pickle.dump(y_test, file)