## Constants

In [290]:
# File locations used throughout the notebook: the JSON file describing the
# selected features, the raw survey export, and the cleaned copy that is
# written out after data cleaning.
# For quick experiments, point DATASET_PATH at the small sample instead:
#     'datasets/1000sample2015.csv'
FEATURES_JSON_PATH = 'datasets/features-small.json'
DATASET_PATH = 'datasets/2015.csv'
CLEANED_DATASET_PATH = 'datasets/2015-cleaned.csv'

## Imports

In [176]:
import googleapiclient.discovery
import json
import numpy as np
import os
import pandas as pd
import pickle
import pprint as pp
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import Imputer
from fancyimpute import KNN as imputeKNN

## Data cleaning

In [287]:
def _invalid_codes_to_nan(frame, features):
    """Return a copy of *frame* with every invalid code replaced by NaN.

    A code is invalid for a column when its entry in
    ``features[column]["options"]`` has a falsy ``'valid'`` flag. Columns
    without invalid options are copied unchanged.
    """
    cleaned = frame.copy()
    for title in frame.columns:
        invalid_vals = [
            np.float32(option['value'])
            for option in features[title]["options"]
            if not option['valid']
        ]
        column = frame[title]
        # mask() writes a real NaN (not the string 'NaN') wherever the
        # condition holds, so the column stays numeric.
        cleaned[title] = column.mask(column.isin(invalid_vals))
    return cleaned


def _impute_from_observed(frame, random_state=None):
    """Return a copy of *frame* with NaNs filled by random observed values.

    Each missing cell receives an independent draw (with replacement) from
    the non-missing values of the same column.

    Raises:
        ValueError: if a column has missing cells but no observed value to
            draw from.
    """
    imputed = frame.copy()
    for title in frame.columns:
        column = frame[title]
        missing = column.isna()
        n_missing = int(missing.sum())
        if n_missing == 0:
            continue
        observed = column.dropna()
        if observed.empty:
            raise ValueError(
                "Cannot impute column %r: it has no valid values" % (title,)
            )
        draws = observed.sample(
            n=n_missing, replace=True, random_state=random_state
        )
        # .values drops the sampled index so the draws are assigned
        # positionally to the missing rows.
        imputed.loc[missing, title] = draws.values
    return imputed


def _apply_feature_types(frame, features):
    """Cast *frame* to int32, then give each column its declared dtype.

    ``features[column]['type']`` selects the target dtype:
    'discrete' -> category, 'boolean' -> bool (code 1 is True, code 2 is
    False), 'continuous' -> left as int32.

    Raises:
        ValueError: if a boolean column holds a code other than 1 or 2, or
            if a column declares an unknown type.
    """
    typed = frame.astype(np.int32)
    bool_map = {1: True, 2: False}
    for title in frame.columns:
        kind = features[title]['type']
        codes = typed[title]
        if kind == 'discrete':
            typed[title] = codes.astype('category')
        elif kind == 'boolean':
            # Unmapped codes would become NaN, which astype('bool') turns
            # into True; refuse them instead of mislabelling rows.
            unexpected = codes[~codes.isin(list(bool_map))].unique()
            if len(unexpected):
                raise ValueError(
                    "Boolean column %r has codes other than 1/2: %s"
                    % (title, sorted(unexpected.tolist()))
                )
            typed[title] = codes.map(bool_map).astype('bool')
        elif kind == 'continuous':
            # Already np.int32
            pass
        else:
            # Never fall through: that would reuse another column's data.
            raise ValueError(
                "Unknown feature type %r for column %r" % (kind, title)
            )
    return typed


with open(FEATURES_JSON_PATH, 'r') as file:
    features_json = json.load(file)

# usecols: filter to only the columns described in the features JSON.
# (Add nrows=50 to the call below for a quick trial run.)
with open(DATASET_PATH, 'r') as file:
    raw_data = pd.read_csv(file, dtype=np.float32, usecols=list(features_json))

# Fill invalid values with NaNs for imputation
raw_data = _invalid_codes_to_nan(raw_data, features_json)

# Impute NaNs with random sample from column
raw_data = _impute_from_observed(raw_data)

# Cast floats to ints and update datatypes
raw_data = _apply_feature_types(raw_data, features_json)

df = raw_data
df


Unnamed: 0,CVDINFR4,CHCSCNCR,CHCOCNCR,ADDEPEV2,DIABETE3,SEX,_RACEGR3,_AGE_G,HTM4,WTKG3,_INCOMG,_SMOKER3
0,False,False,False,True,3,2,1,5,178,12701,2,3
1,False,False,False,False,3,2,1,4,173,7484,1,1
2,False,False,True,False,3,2,1,6,180,7167,2,3
3,False,False,True,True,3,2,1,5,170,8165,5,4
4,False,False,False,False,3,2,1,5,163,6441,5,4
5,False,False,False,False,3,2,1,6,157,6577,4,4
6,False,False,False,False,3,2,4,6,168,6713,2,4
7,False,False,False,False,3,1,1,6,155,8119,2,3
8,False,False,False,False,3,2,1,6,160,3810,2,4
9,False,False,False,False,3,1,1,6,170,7303,5,3


In [291]:
# Save the cleaned frame. pandas opens the path itself: a text handle opened
# without newline='' makes the CSV writer emit blank rows on Windows.
df.to_csv(CLEANED_DATASET_PATH)

In [293]:
# Reload the cleaned dataset; the first CSV column is used as the row index.
with open(CLEANED_DATASET_PATH, mode='r') as cleaned_csv:
    df = pd.read_csv(cleaned_csv, index_col=0)
df

Unnamed: 0,CVDINFR4,CHCSCNCR,CHCOCNCR,ADDEPEV2,DIABETE3,SEX,_RACEGR3,_AGE_G,HTM4,WTKG3,_INCOMG,_SMOKER3
0,False,False,False,True,3,2,1,5,178,12701,2,3
1,False,False,False,False,3,2,1,4,173,7484,1,1
2,False,False,True,False,3,2,1,6,180,7167,2,3
3,False,False,True,True,3,2,1,5,170,8165,5,4
4,False,False,False,False,3,2,1,5,163,6441,5,4
5,False,False,False,False,3,2,1,6,157,6577,4,4
6,False,False,False,False,3,2,4,6,168,6713,2,4
7,False,False,False,False,3,1,1,6,155,8119,2,3
8,False,False,False,False,3,2,1,6,160,3810,2,4
9,False,False,False,False,3,1,1,6,170,7303,5,3


In [308]:
# Boolean row masks: _SMOKER3 codes 1-2, _SMOKER3 codes 1-3, and the
# CHCOCNCR flag column.
smoker_codes = df['_SMOKER3']
smoker_current = smoker_codes.isin([1, 2])
smoker_all = smoker_codes.isin([1, 2, 3])
cancer = df['CHCOCNCR']

def risk_ratio(x, y):
    """Return the risk ratio of outcome *y* for exposure *x*.

    The result is P(y | x) / P(y | not x), computed from the masks
    themselves rather than from a global DataFrame.

    Args:
        x: Boolean mask (e.g. a pandas Series) marking the exposed rows.
        y: Boolean mask, aligned with *x*, marking rows with the outcome.

    Returns:
        The risk among exposed rows divided by the risk among unexposed rows.

    Raises:
        ZeroDivisionError: if either group is empty, or if no unexposed row
            has the outcome (the ratio is undefined in both cases).
    """
    exposed = int(x.sum())
    unexposed = int((~x).sum())
    if exposed == 0 or unexposed == 0:
        raise ZeroDivisionError(
            "risk ratio is undefined: the exposed or unexposed group is empty"
        )

    x_true = int((x & y).sum()) / exposed
    x_false = int((~x & y).sum()) / unexposed
    if x_false == 0:
        raise ZeroDivisionError(
            "risk ratio is undefined: no outcomes in the unexposed group"
        )
    return x_true / x_false

# Print the cancer risk ratio for each exposure mask, smoker_current first
# and then smoker_all.
for exposure in (smoker_current, smoker_all):
    print(risk_ratio(exposure, cancer))

0.8598613649203296
1.3908006783885283
