In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the raw cervical cancer dataset from a relative path into a DataFrame.
cancer_df = pd.read_csv("../data/cervical_cancer.csv")

In [3]:
# Preview the first rows to check the column names and the raw values.
cancer_df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [4]:
# Unanswered questions are stored as the literal string "?". Swap them for NaN
# so the affected object columns can be converted to a numeric dtype afterwards.
cancer_nan = cancer_df.replace(to_replace='?', value=np.nan)

In [5]:
# Convert the object columns (those that held "?") to numeric dtypes.
# A blanket cancer_nan.astype('float64') would also turn the integer columns
# into floats, so each column is converted on its own and columns that are
# already integer keep their dtype.
# DataFrame.convert_objects was deprecated in pandas 0.21 and removed in 1.0;
# pd.to_numeric is its replacement. errors='coerce' turns any leftover
# non-numeric token into NaN, as convert_numeric=True did.
numeric_df = cancer_nan.apply(pd.to_numeric, errors='coerce')

In [6]:
# Confirm the dtype and non-null count of every column after the conversion.
numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
Age                                   858 non-null int64
Number of sexual partners             832 non-null float64
First sexual intercourse              851 non-null float64
Num of pregnancies                    802 non-null float64
Smokes                                845 non-null float64
Smokes (years)                        845 non-null float64
Smokes (packs/year)                   845 non-null float64
Hormonal Contraceptives               750 non-null float64
Hormonal Contraceptives (years)       750 non-null float64
IUD                                   741 non-null float64
IUD (years)                           741 non-null float64
STDs                                  753 non-null float64
STDs (number)                         753 non-null float64
STDs:condylomatosis                   753 non-null float64
STDs:cervical condylomatosis          753 non-null float64
STDs:vaginal

In [7]:
# Count the missing values per column to decide how each one should be imputed.
numeric_df.isnull().sum()

Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                  

In [8]:
# Fill the NaN values of these columns with the column mean, as done by the
# authors of the paper who collected this data.
mean_filled_columns = [
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
]
for column in mean_filled_columns:
    numeric_df[column] = numeric_df[column].fillna(numeric_df[column].mean())


In [9]:
# Decide how to fill the Smokes values: what is the most common answer?
# Smokes, Smokes (years) and Smokes (packs/year) all have exactly 13 missing
# values, which suggests that non-smokers simply left these columns blank.
numeric_df["Smokes"].value_counts()

0.0    722
1.0    123
Name: Smokes, dtype: int64

In [10]:
# Treat unanswered smoking questions as "non-smoker": 0 for the yes/no flag
# and 0 for both quantity columns.
for smoking_column in ('Smokes', 'Smokes (years)', 'Smokes (packs/year)'):
    numeric_df[smoking_column] = numeric_df[smoking_column].fillna(0)

In [11]:
# Check which Hormonal Contraceptives answer is most common before imputing.
numeric_df["Hormonal Contraceptives"].value_counts()

1.0    481
0.0    269
Name: Hormonal Contraceptives, dtype: int64

In [13]:
# Fill the yes/no flag with 1 (the majority answer in the value counts) and
# the duration column with its own mean.
hc_flag = 'Hormonal Contraceptives'
hc_years = 'Hormonal Contraceptives (years)'
numeric_df[hc_flag] = numeric_df[hc_flag].fillna(1)
numeric_df[hc_years] = numeric_df[hc_years].fillna(numeric_df[hc_years].mean())


In [14]:
# Check which IUD answer is most common before imputing.
numeric_df["IUD"].value_counts()

0.0    658
1.0     83
Name: IUD, dtype: int64

In [15]:
# Fill both the IUD flag and the IUD duration with 0 where unanswered.
for iud_column in ('IUD', 'IUD (years)'):
    numeric_df[iud_column] = numeric_df[iud_column].fillna(0)

In [16]:
# Check which STDs answer is most common before imputing.
numeric_df["STDs"].value_counts()

0.0    674
1.0     79
Name: STDs, dtype: int64

In [17]:
# Fill both the STDs flag and the STD count with 0 where unanswered.
for std_column in ('STDs', 'STDs (number)'):
    numeric_df[std_column] = numeric_df[std_column].fillna(0)

In [18]:
# Check which STDs:condylomatosis answer is most common before imputing.
numeric_df['STDs:condylomatosis'].value_counts()

0.0    709
1.0     44
Name: STDs:condylomatosis, dtype: int64

In [19]:
# Fill unanswered STDs:condylomatosis entries with 0.
condylomatosis_col = 'STDs:condylomatosis'
numeric_df[condylomatosis_col] = numeric_df[condylomatosis_col].fillna(value=0)

In [20]:
# Check which STDs:cervical condylomatosis answers occur before imputing.
numeric_df['STDs:cervical condylomatosis'].value_counts()

0.0    753
Name: STDs:cervical condylomatosis, dtype: int64

In [21]:
# Fill unanswered STDs:cervical condylomatosis entries with 0.
cervical_col = 'STDs:cervical condylomatosis'
numeric_df[cervical_col] = numeric_df[cervical_col].fillna(value=0)

In [22]:
# Fill the remaining per-disease STD columns with 0 where unanswered.
std_flag_columns = [
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV',
]
for std_flag in std_flag_columns:
    numeric_df[std_flag] = numeric_df[std_flag].fillna(0)


In [23]:
## I will fill in mean values for now for consistency, but i think we should just drop these columns all together because 
# it seems like no one really answered them --
# Fill both time-since-diagnosis columns with their own column mean.
diagnosis_time_columns = (
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
)
for time_column in diagnosis_time_columns:
    numeric_df[time_column] = numeric_df[time_column].fillna(numeric_df[time_column].mean())


In [24]:
# Verify that no null values are left in the dataframe: every count should be 0.

numeric_df.isnull().sum()

Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0
