In [17]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

# libraries
import numpy as np
import pandas as pd

# For Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#### Reading Cancer data

In [18]:
# Load the whitespace-delimited risk dataset. The file has no header row, so the
# column names are supplied explicitly.
# sep=r"\s+" replaces delim_whitespace=True, which is deprecated since pandas 2.2;
# the pandas documentation describes the two as equivalent.
cancer_data = pd.read_csv(
    "risk_rand.txt",
    header=None,
    sep=r"\s+",
    names=["menopaus", "agegrp", "density", "race", "hispanic", "bmi", "agefirst",
           "nrelbc", "brstproc", "lastmamm", "surgmeno", "hrt", "invasive",
           "cancer", "training", "count"],
)
cancer_data.head()

Unnamed: 0,menopaus,agegrp,density,race,hispanic,bmi,agefirst,nrelbc,brstproc,lastmamm,surgmeno,hrt,invasive,cancer,training,count
0,0,1,1,1,0,1,0,0,0,0,9,9,0,0,1,3
1,0,1,1,1,0,1,0,0,0,9,9,9,0,0,0,2
2,0,1,1,1,0,1,0,0,0,9,9,9,0,0,1,3
3,0,1,1,1,0,1,1,0,0,0,9,9,0,0,1,1
4,0,1,1,1,0,1,1,0,1,9,9,9,0,0,1,1


### <span style="color:blue">1. Data Understanding</span>

In [19]:
# Inspect the number of rows and columns of the raw dataset.
cancer_data.shape

(181903, 16)

In [20]:
# Inspect the entry count, per-column non-null counts and dtypes of the raw dataset.
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181903 entries, 0 to 181902
Data columns (total 16 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   menopaus  181903 non-null  int64
 1   agegrp    181903 non-null  int64
 2   density   181903 non-null  int64
 3   race      181903 non-null  int64
 4   hispanic  181903 non-null  int64
 5   bmi       181903 non-null  int64
 6   agefirst  181903 non-null  int64
 7   nrelbc    181903 non-null  int64
 8   brstproc  181903 non-null  int64
 9   lastmamm  181903 non-null  int64
 10  surgmeno  181903 non-null  int64
 11  hrt       181903 non-null  int64
 12  invasive  181903 non-null  int64
 13  cancer    181903 non-null  int64
 14  training  181903 non-null  int64
 15  count     181903 non-null  int64
dtypes: int64(16)
memory usage: 22.2 MB


In [21]:
# Collect the column labels; the next cell iterates over them to print each
# column's value counts (where the unknown code 9 can be inspected).
cols=cancer_data.columns
cols

Index(['menopaus', 'agegrp', 'density', 'race', 'hispanic', 'bmi', 'agefirst',
       'nrelbc', 'brstproc', 'lastmamm', 'surgmeno', 'hrt', 'invasive',
       'cancer', 'training', 'count'],
      dtype='object')

In [22]:
# Print the frequency of every distinct value, one column at a time.
for col, values in cancer_data[cols].items():
    print(values.value_counts())

1    140843
0     29542
9     11518
Name: menopaus, dtype: int64
4     30494
3     27018
5     23724
6     20310
7     18537
8     16962
2     14773
9     14037
10     9850
1      6198
Name: agegrp, dtype: int64
2    53027
9    46296
3    46126
1    20333
4    16121
Name: density, dtype: int64
1    103990
9     37367
2     13863
3     13236
5      7103
4      6344
Name: race, dtype: int64
0    103070
9     53376
1     25457
Name: hispanic, dtype: int64
9    73445
1    41009
2    33393
3    20510
4    13546
Name: bmi, dtype: int64
0    72136
9    63669
2    26191
1    19907
Name: agefirst, dtype: int64
0    113235
1     39338
9     24923
2      4407
Name: nrelbc, dtype: int64
0    110986
1     48946
9     21971
Name: brstproc, dtype: int64
0    102485
9     69861
1      9557
Name: lastmamm, dtype: int64
9    83545
0    58780
1    39578
Name: surgmeno, dtype: int64
0    65036
9    64489
1    52378
Name: hrt, dtype: int64
0    176991
1      4912
Name: invasive, dtype: int64
0    175629
1 

Many columns contain the value 9, which denotes "unknown".

### <span style="color:blue">2. Data Cleaning</span>

Since we are interested in both types of cancer (invasive and ductal carcinoma in situ), we keep the "cancer" column and drop the "invasive" column.

In [23]:
# Work on a copy of the raw data without the "invasive" column; "cancer" stays
# as the target.
df = cancer_data.drop(columns=["invasive"])
df.head()

Unnamed: 0,menopaus,agegrp,density,race,hispanic,bmi,agefirst,nrelbc,brstproc,lastmamm,surgmeno,hrt,cancer,training,count
0,0,1,1,1,0,1,0,0,0,0,9,9,0,1,3
1,0,1,1,1,0,1,0,0,0,9,9,9,0,0,2
2,0,1,1,1,0,1,0,0,0,9,9,9,0,1,3
3,0,1,1,1,0,1,1,0,0,0,9,9,0,1,1
4,0,1,1,1,0,1,1,0,1,9,9,9,0,1,1


In [24]:
# Check the dimensions after dropping the "invasive" column.
df.shape

(181903, 15)

In [25]:
# Count, per row, how many of the listed columns hold the unknown code (9).
# The count is stored in "total_unknown" and used afterwards to drop rows
# with 7 or more unknown values.
df["total_unknown"] = (
    df[['density', 'race', 'hispanic', 'bmi', 'agefirst', 'hrt',
        'nrelbc', 'brstproc', 'lastmamm', 'surgmeno', 'menopaus']]
    .eq(9)
    .sum(axis=1)
)

In [26]:
# Keep only the rows with fewer than 7 unknown values.
final = df.loc[df["total_unknown"] < 7]

In [27]:
# Dimensions after removing the rows with 7 or more unknown values.
final.shape

(173330, 16)

Since all the columns have categorical values, they need to be one-hot encoded

In [28]:
# Keep only the predictor columns: the split flag, the target, the count and
# the helper unknown-counter are excluded from the one-hot encoding.
df = final.drop(columns=["training", "cancer", "count", "total_unknown"])
cols = df.columns
cols

Index(['menopaus', 'agegrp', 'density', 'race', 'hispanic', 'bmi', 'agefirst',
       'nrelbc', 'brstproc', 'lastmamm', 'surgmeno', 'hrt'],
      dtype='object')

In [29]:
# One-hot encode every predictor column. drop_first stays False because the
# "unknown" (9) indicator columns are removed explicitly afterwards.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; since pandas 2.0 get_dummies defaults to bool instead.
dummy1 = pd.get_dummies(df, columns=cols, drop_first=False, dtype="uint8")
dummy1.head()

Unnamed: 0,menopaus_0,menopaus_1,menopaus_9,agegrp_1,agegrp_2,agegrp_3,agegrp_4,agegrp_5,agegrp_6,agegrp_7,...,brstproc_9,lastmamm_0,lastmamm_1,lastmamm_9,surgmeno_0,surgmeno_1,surgmeno_9,hrt_0,hrt_1,hrt_9
0,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
3,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1


In [30]:
# Remove the "unknown" (code 9) indicator column of each listed feature.
# agegrp_1 is dropped as well (presumably as the reference level for agegrp -
# TODO confirm); agegrp_9 is kept because agegrp is not in the list below.
dummy1 = dummy1.drop(
    columns=[
        f"{feature}_9"
        for feature in ('density', 'race', 'hispanic', 'bmi', 'agefirst', 'nrelbc',
                        'brstproc', 'lastmamm', 'hrt', 'surgmeno', 'menopaus')
    ] + ['agegrp_1']
)

dummy1.shape

(173330, 40)

In [31]:
# Rebuild the master frame: the split flag and the target, followed by the
# one-hot encoded predictors (aligned on the shared row index).
final = pd.concat(
    [final.loc[:, ['training', 'cancer']], dummy1],
    axis="columns",
)
final.head()

Unnamed: 0,training,cancer,menopaus_0,menopaus_1,agegrp_2,agegrp_3,agegrp_4,agegrp_5,agegrp_6,agegrp_7,...,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1,surgmeno_0,surgmeno_1,hrt_0,hrt_1
0,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [32]:
# Inspect the columns and dtypes of the encoded master frame.
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173330 entries, 0 to 181795
Data columns (total 42 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   training    173330 non-null  int64
 1   cancer      173330 non-null  int64
 2   menopaus_0  173330 non-null  uint8
 3   menopaus_1  173330 non-null  uint8
 4   agegrp_2    173330 non-null  uint8
 5   agegrp_3    173330 non-null  uint8
 6   agegrp_4    173330 non-null  uint8
 7   agegrp_5    173330 non-null  uint8
 8   agegrp_6    173330 non-null  uint8
 9   agegrp_7    173330 non-null  uint8
 10  agegrp_8    173330 non-null  uint8
 11  agegrp_9    173330 non-null  uint8
 12  agegrp_10   173330 non-null  uint8
 13  density_1   173330 non-null  uint8
 14  density_2   173330 non-null  uint8
 15  density_3   173330 non-null  uint8
 16  density_4   173330 non-null  uint8
 17  race_1      173330 non-null  uint8
 18  race_2      173330 non-null  uint8
 19  race_3      173330 non-null  uint8
 20  race

In [33]:
# Split into train and test using the "training" flag column.
# Rows flagged 1 form the training set; "cancer" is the target.
train = final.loc[final["training"].eq(1)]
y_train = train.loc[:, "cancer"]
y_train.head(2)

0    0
2    0
Name: cancer, dtype: int64

In [34]:
# Training features: everything except the target and the split flag.
X_train = train.drop(columns=["cancer", "training"])
X_train.head(2)

Unnamed: 0,menopaus_0,menopaus_1,agegrp_2,agegrp_3,agegrp_4,agegrp_5,agegrp_6,agegrp_7,agegrp_8,agegrp_9,...,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1,surgmeno_0,surgmeno_1,hrt_0,hrt_1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [35]:
# Dimensions of the training feature matrix.
X_train.shape

(113829, 40)

In [36]:
# Class distribution of the training target.
y_train.value_counts()

0    109536
1      4293
Name: cancer, dtype: int64

In [37]:
# Rows flagged training == 0 form the held-out test set.
test = final.loc[final["training"].eq(0)]
test.shape

(59501, 42)

In [38]:
# Drop the split flag from the test rows (the "cancer" column is still present here).
X_test = test.drop(columns=["training"])
X_test.head(2)

Unnamed: 0,cancer,menopaus_0,menopaus_1,agegrp_2,agegrp_3,agegrp_4,agegrp_5,agegrp_6,agegrp_7,agegrp_8,...,nrelbc_1,nrelbc_2,brstproc_0,brstproc_1,lastmamm_0,lastmamm_1,surgmeno_0,surgmeno_1,hrt_0,hrt_1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [39]:
# Separate the test target from the test features.
# Both are derived from `test` rather than popping "cancer" out of X_test in
# place, so this cell can be re-run without raising KeyError and mirrors how
# X_train / y_train are built from `train`.
y_test = test["cancer"]
X_test = test.drop(columns=["training", "cancer"])
y_test.head(2)

1    0
9    0
Name: cancer, dtype: int64

In [40]:
# Class distribution of the test target.
y_test.value_counts()

0    57930
1     1571
Name: cancer, dtype: int64

In [41]:
# Percentage of training rows whose target equals 1.
int((y_train == 1).sum()) * 100 / len(y_train)

3.771446643649685

In [42]:
import pickle

# Persist the four splits into one pickle stream, in this order:
# X_train, y_train, X_test, y_test. A reader must call pickle.load() four
# times on the same file to get them back in that order.
# The context manager closes the file handle even if one of the dumps raises.
with open("data_risk2_dummy", "wb") as file:
    pickle.dump(X_train, file)
    pickle.dump(y_train, file)
    pickle.dump(X_test, file)
    pickle.dump(y_test, file)