## Constants

In [2]:
# Input file locations, given as relative paths.
# DATASET_PATH is the CSV read into `raw_data` during data cleaning.
DATASET_PATH = 'datasets/1000sample2015.csv' # Using the small dataset for now
# FEATURES_JSON_PATH is the per-column feature metadata (answer options and
# feature type) that drives column selection and cleaning.
FEATURES_JSON_PATH = 'datasets/features-small.json'

## Imports

In [176]:
import googleapiclient.discovery
import json
import numpy as np
import os
import pandas as pd
import pickle
import pprint as pp
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import Imputer
from fancyimpute import KNN as imputeKNN

## Data cleaning

In [262]:
# Feature metadata: one entry per dataset column we care about.
with open(FEATURES_JSON_PATH, 'r') as features_file:
    features_json = json.load(features_file)

# Read only the columns named in the metadata, every one of them as float32.
wanted_columns = list(features_json)
with open(DATASET_PATH, 'r') as dataset_file:
    raw_data = pd.read_csv(dataset_file, usecols=wanted_columns, dtype=np.float32)

# Fill invalid values with NaNs for imputation.
# Each feature's metadata carries a list of "options"; every option has a
# 'value' and a 'valid' flag. Values whose option is flagged as not valid are
# blanked out here so that the imputation step can fill them in.
raw_data_new = raw_data.copy()
for title in raw_data.columns:
    # Invalid codes for this column, converted to the column's float32 dtype
    # so the membership test compares like with like.
    invalid_vals = [
        np.float32(option['value'])
        for option in features_json[title]['options']
        if not option['valid']
    ]

    # Vectorized replacement with a real NaN. (Assigning the string 'NaN'
    # element by element relies on implicit string-to-float coercion, which
    # is not dependable across pandas versions.)
    column = raw_data[title]
    raw_data_new[title] = column.mask(column.isin(invalid_vals))
raw_data = raw_data_new

# Impute NaNs with random samples from the same column.
# Every missing cell is replaced by a value drawn (with replacement) from the
# observed, non-missing values of its own column.
#
# A dedicated, seeded generator makes the imputation -- and every statistic
# computed from the imputed data -- reproducible across kernel restarts.
# Change IMPUTATION_SEED to obtain a different (but still repeatable) draw.
IMPUTATION_SEED = 0
impute_rng = np.random.RandomState(IMPUTATION_SEED)

# Work on a copy so the frame being read is never modified mid-loop.
raw_data_new = raw_data.copy()
for title in raw_data.columns:
    column = raw_data[title]
    missing = column.isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        continue

    observed = column.dropna()
    if observed.empty:
        # Nothing to sample from; fail with a message naming the column.
        raise ValueError(
            "Cannot impute column %r: it has no valid values to sample from" % (title,)
        )

    # One draw per missing cell. `.values` drops the sampled index so the
    # draws are assigned positionally instead of being re-aligned by label.
    draws = observed.sample(n=n_missing, replace=True, random_state=impute_rng)
    raw_data_new.loc[missing, title] = draws.values
raw_data = raw_data_new

# Cast floats to ints: every column becomes np.int32.
raw_data_int = raw_data.astype(np.int32)

# Update datatypes according to the 'type' declared for each feature:
#   'discrete'   -> pandas categorical
#   'boolean'    -> bool, using the coding 1 = True, 2 = False
#   'continuous' -> left as np.int32
# The result is built in a separate copy so the frame being read is never
# modified while it is being iterated.
bool_map = {1: True, 2: False}
raw_data_new = raw_data_int.copy()
for title in raw_data_int.columns:
    ser = raw_data_int[title]
    feature_type = features_json[title]['type']

    if feature_type == 'discrete':
        ser_new = ser.astype('category')
    elif feature_type == 'boolean':
        # Codes outside bool_map would map to NaN, and NaN casts to True;
        # refuse to continue instead of silently inventing True values.
        unexpected = ser[~ser.isin(list(bool_map))].unique().tolist()
        if unexpected:
            raise ValueError(
                "Boolean column %r contains codes other than 1/2: %r" % (title, unexpected)
            )
        ser_new = ser.map(bool_map).astype('bool')
    elif feature_type == 'continuous':
        ser_new = ser  # Already np.int32
    else:
        # Without this branch an unknown type would reuse the previous
        # column's converted data (or fail with a NameError on the first column).
        raise ValueError(
            "Unknown feature type %r for column %r" % (feature_type, title)
        )

    raw_data_new[title] = ser_new
raw_data = raw_data_new

# Expose the cleaned frame under a short name for the analysis cells, and
# preview only the first rows instead of rendering the whole frame.
df = raw_data
df.head(10)


Unnamed: 0,CVDINFR4,CHCSCNCR,CHCOCNCR,ADDEPEV2,DIABETE3,SEX,_RACEGR3,_AGE_G,HTM4,WTKG3,_INCOMG,_SMOKER3
0,False,False,False,False,1,1,1,6,170,9934,5,4
1,False,False,False,False,3,2,1,6,157,6804,3,4
2,False,False,False,False,3,1,1,6,173,8528,4,3
3,False,False,False,False,3,1,1,2,168,9752,5,3
4,False,False,True,False,3,2,4,6,152,7711,2,4
5,False,True,False,False,3,2,1,6,170,9072,5,4
6,True,False,False,False,1,1,1,6,178,8074,3,3
7,True,False,False,False,1,2,1,6,168,7484,2,4
8,False,False,True,False,3,2,5,4,165,8528,5,3
9,False,False,False,False,3,2,1,6,155,8165,3,4


In [285]:
# Share of respondents with either cancer flag set, split by smoking group.
# NOTE(review): _SMOKER3 codes 1-3 are presumably the current/former smoker
# categories -- confirm against the dataset codebook.
smoking_codes = [1, 2, 3]
smoker = df['_SMOKER3'].isin(smoking_codes)
cancer = df['CHCSCNCR'] | df['CHCOCNCR']

# First line printed: rate within the smoker group; second: rate for the rest.
for group in (smoker, ~smoker):
    print(len(df[group & cancer]) / len(df[group]))

0.15384615384615385
0.1681260945709282
