# Dataiku Coding Part

In [None]:
import sys
import dataiku
import numpy as np
import pandas as pd
import sklearn as sk
import dataiku.core.pandasutils as pdu
from dataiku.doctor.preprocessing import PCA
from collections import defaultdict, Counter

In [None]:
# Raise pandas' display limits: output width, max rows and max columns shown.
for _option, _value in (
    ('display.width', 3000),
    ('display.max_rows', 200),
    ('display.max_columns', 200),
):
    pd.set_option(_option, _value)

#### Data preparation

In [None]:
preparation_steps = []
preparation_output_schema = {u'userModified': False, u'columns': [{u'type': u'bigint', u'name': u'Age'}, {u'type': u'string', u'name': u'Sex'}, {u'type': u'string', u'name': u'ChestPainType'}, {u'type': u'bigint', u'name': u'RestingBP'}, {u'type': u'bigint', u'name': u'Cholesterol'}, {u'type': u'bigint', u'name': u'FastingBS'}, {u'type': u'string', u'name': u'RestingECG'}, {u'type': u'bigint', u'name': u'MaxHR'}, {u'type': u'boolean', u'name': u'ExerciseAngina'}, {u'type': u'double', u'name': u'Oldpeak'}, {u'type': u'string', u'name': u'ST_Slope'}, {u'type': u'bigint', u'name': u'HeartDisease'}]}

ml_dataset_handle = dataiku.Dataset('clean_dataset')
ml_dataset_handle.set_preparation_steps(preparation_steps, preparation_output_schema)
%time ml_dataset = ml_dataset_handle.get_dataframe(limit = 100000)

print ('Base data has %i rows and %i columns' % (ml_dataset.shape[0], ml_dataset.shape[1]))

In [None]:
# Feature selection: keep the target column plus the model input columns.
# .copy() makes the selection an independent frame, so the column assignments
# done on ml_dataset in later cells do not trigger pandas' SettingWithCopyWarning.
ml_dataset = ml_dataset[[u'HeartDisease', u'ST_Slope', u'RestingBP', u'FastingBS', u'ChestPainType', u'Sex', u'Oldpeak', u'MaxHR', u'ExerciseAngina', u'Cholesterol', u'Age', u'RestingECG']].copy()

#### Data standardization

In [None]:
# astype('unicode') does not work as expected

def coerce_to_unicode(x):
    """Return ``x`` as a text (unicode) string.

    On Python 3 this is simply ``str(x)``. On Python 2, byte strings are
    decoded as UTF-8 and every other value goes through ``unicode(x)``.
    """
    if sys.version_info >= (3, 0):
        return str(x)
    # Python 2 only from here on (the ``unicode`` builtin does not exist on 3).
    if isinstance(x, str):
        return unicode(x, 'utf-8')
    return unicode(x)


# Column roles used by the standardization loops below.
categorical_features = [u'ST_Slope', u'ChestPainType', u'Sex', u'ExerciseAngina', u'RestingECG']
numerical_features = [u'RestingBP', u'FastingBS', u'Oldpeak', u'MaxHR', u'Cholesterol', u'Age']
text_features = []
from dataiku.doctor.utils import datetime_to_epoch

# Categorical columns, then text columns: convert every value to a unicode string.
for feature in categorical_features + text_features:
    ml_dataset[feature] = ml_dataset[feature].apply(coerce_to_unicode)

# Numerical columns: datetime64[ns] columns go through datetime_to_epoch,
# everything else is cast to double.
_datetime_dtype = np.dtype('M8[ns]')
for feature in numerical_features:
    _column_dtype = ml_dataset[feature].dtype
    _is_datetime = _column_dtype == _datetime_dtype or (
        hasattr(_column_dtype, 'base') and _column_dtype.base == _datetime_dtype
    )
    if _is_datetime:
        ml_dataset[feature] = datetime_to_epoch(ml_dataset[feature])
    else:
        ml_dataset[feature] = ml_dataset[feature].astype('double')

In [None]:
# Map the stringified HeartDisease values onto the integer classes 1 / 0;
# values that are not keys of target_map become missing.
target_map = {u'1': 1, u'0': 0}
_target_as_text = ml_dataset['HeartDisease'].map(str)
ml_dataset['__target__'] = _target_as_text.map(target_map)
del ml_dataset['HeartDisease']


# Remove rows for which the target is unknown.
ml_dataset = ml_dataset[ml_dataset['__target__'].notnull()]

#### Feature processing

In [None]:
# Split the prepared rows into a train and a test frame. prop=0.8 is passed as
# the split proportion (presumably the train share — confirm against pandasutils).
# NOTE(review): no seed is passed here, so the split may not be reproducible
# between runs — confirm.
train, test = pdu.split_train_valid(ml_dataset, prop=0.8)

In [None]:
# Missing-value handling configuration: features whose incomplete rows are
# dropped, and features whose gaps are filled (with the strategy to use).
drop_rows_when_missing = []
impute_when_missing = [
    {'impute_with': u'MEAN', 'feature': u'RestingBP'},
    {'impute_with': u'MEAN', 'feature': u'FastingBS'},
    {'impute_with': u'MEAN', 'feature': u'Oldpeak'},
    {'impute_with': u'MEAN', 'feature': u'MaxHR'},
    {'impute_with': u'MEAN', 'feature': u'Cholesterol'},
    {'impute_with': u'MEAN', 'feature': u'Age'},
]

# Features for which we drop rows with missing values
for feature in drop_rows_when_missing:
    train = train[train[feature].notnull()]
    test = test[test[feature].notnull()]
    print ('Dropped missing records in %s' % feature)

# Features for which we impute missing values.
# The fill value is always derived from the training split and then applied
# to both splits.
for feature in impute_when_missing:
    if feature['impute_with'] == 'MEAN':
        v = train[feature['feature']].mean()
    elif feature['impute_with'] == 'MEDIAN':
        v = train[feature['feature']].median()
    elif feature['impute_with'] == 'CREATE_CATEGORY':
        v = 'NULL_CATEGORY'
    elif feature['impute_with'] == 'MODE':
        v = train[feature['feature']].value_counts().index[0]
    elif feature['impute_with'] == 'CONSTANT':
        v = feature['value']
    else:
        # Without this branch an unknown method would silently reuse the
        # value computed for the previous feature.
        raise ValueError('Unknown imputation method %r for feature %s'
                         % (feature['impute_with'], feature['feature']))
    train[feature['feature']] = train[feature['feature']].fillna(v)
    test[feature['feature']] = test[feature['feature']].fillna(v)
    print ('Imputed missing values in feature %s with value %s' % (feature['feature'], coerce_to_unicode(v)))

In [None]:
# Upper bound on how many distinct values per categorical feature are kept
# for dummy encoding (passed to Counter.most_common in select_dummy_values).
LIMIT_DUMMIES = 100

# Categorical features that are dummy-encoded.
categorical_to_dummy_encode = [u'ST_Slope', u'ChestPainType', u'Sex', u'ExerciseAngina', u'RestingECG']

# Only keep the most frequent values (at most `limit`, LIMIT_DUMMIES by default)
def select_dummy_values(train, features, limit=None):
    """Pick, for each feature, the values that will get a dummy column.

    Args:
        train: mapping from feature name to a column of values (the training
            frame); only ``train[feature]`` is read.
        features: iterable of feature names to process.
        limit: maximum number of values kept per feature, most frequent
            first. ``None`` (the default) means ``LIMIT_DUMMIES``.

    Returns:
        dict mapping each feature name to the list of its most frequent
        values, in ``Counter.most_common`` order.
    """
    if limit is None:
        limit = LIMIT_DUMMIES
    # Iterate over the `features` argument (not a module-level list) so the
    # function honours what the caller asked for.
    return {
        feature: [value for (value, _) in Counter(train[feature]).most_common(limit)]
        for feature in features
    }

# Dummy values are chosen from the training split only; dummy_encode_dataframe
# reads this module-level mapping.
DUMMY_VALUES = select_dummy_values(train, categorical_to_dummy_encode)

def dummy_encode_dataframe(df):
    """Dummy-encode, in place, every feature listed in ``DUMMY_VALUES``.

    For each retained value a float column named ``<feature>_value_<value>``
    is added (1.0 where the row equals that value, 0.0 otherwise); the
    original feature column is then removed from ``df``.
    """
    for feature in DUMMY_VALUES:
        for dummy_value in DUMMY_VALUES[feature]:
            column_name = u'%s_value_%s' % (feature, coerce_to_unicode(dummy_value))
            matches = df[feature] == dummy_value
            df[column_name] = matches.astype(float)
        del df[feature]
        print ('Dummy-encoded feature %s' % feature)

# Encode both splits in place; both calls use the same DUMMY_VALUES mapping,
# so train and test receive the same set of dummy columns.
dummy_encode_dataframe(train)

dummy_encode_dataframe(test)

In [None]:
# Rescaling method per numerical feature ('MINMAX', anything else is treated
# as mean / standard-deviation scaling).
rescale_features = {
    u'RestingBP': u'AVGSTD',
    u'Age': u'AVGSTD',
    u'Oldpeak': u'AVGSTD',
    u'MaxHR': u'AVGSTD',
    u'Cholesterol': u'AVGSTD',
    u'FastingBS': u'AVGSTD',
}
for (feature_name, rescale_method) in rescale_features.items():
    # Shift and scale are always computed on the training split.
    if rescale_method == 'MINMAX':
        _min = train[feature_name].min()
        _max = train[feature_name].max()
        shift, scale = _min, _max - _min
    else:
        shift = train[feature_name].mean()
        scale = train[feature_name].std()

    if scale == 0.:
        # A zero scale cannot be divided by: drop the feature from both splits.
        del train[feature_name]
        del test[feature_name]
        print ('Feature %s was dropped because it has no variance' % feature_name)
        continue

    print ('Rescaled %s' % feature_name)
    train[feature_name] = (train[feature_name] - shift).astype(np.float64) / scale
    test[feature_name] = (test[feature_name] - shift).astype(np.float64) / scale

#### modeling process

In [None]:
# Model inputs: every column except '__target__', for each split.
train_X, test_X = (frame.drop('__target__', axis=1) for frame in (train, test))

# Targets as plain numpy arrays.
train_Y, test_Y = (np.array(frame['__target__']) for frame in (train, test))

##### Logistic regression model 1

In [None]:
from sklearn.linear_model import LogisticRegression
# An L1 penalty needs a solver that supports it. scikit-learn >= 0.22 defaults
# to 'lbfgs', which rejects penalty="l1"; 'liblinear' supports L1 and was the
# default solver in earlier releases, so naming it keeps the old behavior.
clf = LogisticRegression(penalty="l1", solver="liblinear", random_state=1337)

In [None]:
%time clf.fit(train_X, train_Y)
%time _predictions = clf.predict(test_X)
%time _probas = clf.predict_proba(test_X)
predictions = pd.Series(data=_predictions, index=test_X.index, name='predicted_value')
cols = [
    u'probability_of_value_%s' % label
    for (_, label) in sorted([(int(target_map[label]), label) for label in target_map])
]
probabilities = pd.DataFrame(data=_probas, index=test_X.index, columns=cols)

# Build scored dataset
results_test = test_X.join(predictions, how='left')
results_test = results_test.join(probabilities, how='left')
results_test = results_test.join(test['__target__'], how='left')
results_test = results_test.rename(columns= {'__target__': 'HeartDisease'})
#performance
from dataiku.doctor.utils.metrics import mroc_auc_score
test_Y_ser = pd.Series(test_Y)

##### Logistic regression model 2

In [None]:
# Import dataset 2 ('clean_dataset_2'); the previous preparation steps are meant
# to be re-run on it. Its declared schema has no ST_Slope column.
# NOTE(review): this cell only replaces `ml_dataset`. The feature selection,
# standardization, split, imputation, dummy-encoding and rescaling cells are
# not repeated below, so `train` / `test` still derive from 'clean_dataset'
# unless those cells are re-executed by hand — confirm the intent.
preparation_steps = []
preparation_output_schema = {u'userModified': False, u'columns': [{u'type': u'bigint', u'name': u'Age'}, {u'type': u'string', u'name': u'Sex'}, {u'type': u'string', u'name': u'ChestPainType'}, {u'type': u'bigint', u'name': u'RestingBP'}, {u'type': u'bigint', u'name': u'Cholesterol'}, {u'type': u'bigint', u'name': u'FastingBS'}, {u'type': u'string', u'name': u'RestingECG'}, {u'type': u'bigint', u'name': u'MaxHR'}, {u'type': u'boolean', u'name': u'ExerciseAngina'}, {u'type': u'double', u'name': u'Oldpeak'}, {u'type': u'bigint', u'name': u'HeartDisease'}]}

ml_dataset_handle = dataiku.Dataset('clean_dataset_2')
ml_dataset_handle.set_preparation_steps(preparation_steps, preparation_output_schema)
# Read the dataset with a 100000-row limit (timed by IPython).
%time ml_dataset = ml_dataset_handle.get_dataframe(limit = 100000)



In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty="l1",random_state=1337)
%time clf.fit(train_X, train_Y)
%time _predictions = clf.predict(test_X)
%time _probas = clf.predict_proba(test_X)
predictions = pd.Series(data=_predictions, index=test_X.index, name='predicted_value')
cols = [
    u'probability_of_value_%s' % label
    for (_, label) in sorted([(int(target_map[label]), label) for label in target_map])
]
probabilities = pd.DataFrame(data=_probas, index=test_X.index, columns=cols)

# Build scored dataset
results_test = test_X.join(predictions, how='left')
results_test = results_test.join(probabilities, how='left')
results_test = results_test.join(test['__target__'], how='left')
results_test = results_test.rename(columns= {'__target__': 'HeartDisease'})
#performance
from dataiku.doctor.utils.metrics import mroc_auc_score
test_Y_ser = pd.Series(test_Y)

##### random forest model 1

In [None]:
# Rebuild the model inputs (all columns but '__target__') and the target
# arrays from the current train / test frames.
train_X, test_X = (frame.drop('__target__', axis=1) for frame in (train, test))

train_Y, test_Y = (np.array(frame['__target__']) for frame in (train, test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=155,
    random_state=1337,
    max_depth=13,
    min_samples_leaf=4,
    verbose=2)
%time clf.fit(train_X, train_Y)

In [None]:
%time _predictions = clf.predict(test_X)
%time _probas = clf.predict_proba(test_X)
predictions = pd.Series(data=_predictions, index=test_X.index, name='predicted_value')
cols = [
    u'probability_of_value_%s' % label
    for (_, label) in sorted([(int(target_map[label]), label) for label in target_map])
]
probabilities = pd.DataFrame(data=_probas, index=test_X.index, columns=cols)

# Build scored dataset
results_test = test_X.join(predictions, how='left')
results_test = results_test.join(probabilities, how='left')
results_test = results_test.join(test['__target__'], how='left')
results_test = results_test.rename(columns= {'__target__': 'HeartDisease'})

In [None]:
# Pair every input column with the importance reported by the fitted forest.
features = train_X.columns
feature_importances_data = [
    {'feature': feature_name, 'importance': feature_importance}
    for feature_name, feature_importance in zip(features, clf.feature_importances_)
]

# Plot the results: the 10 rows with the highest importance, as horizontal bars.
(
    pd.DataFrame(feature_importances_data)
    .set_index('feature')
    .sort_values(by='importance')
    .tail(10)
    .plot(
        title='Top 10 most important variables',
        kind='barh',
        figsize=(10, 6),
        color='#348ABD',
        alpha=0.6,
        lw='1',
        edgecolor='#348ABD',
        grid=False,
    )
)
#performance
from dataiku.doctor.utils.metrics import mroc_auc_score
test_Y_ser = pd.Series(test_Y)
print ('AUC value:', mroc_auc_score(test_Y_ser, _probas))

##### random forest model 2

In [None]:
# Rebuild the model inputs (all columns but '__target__') and the target
# arrays from the current train / test frames.
train_X, test_X = (frame.drop('__target__', axis=1) for frame in (train, test))

train_Y, test_Y = (np.array(frame['__target__']) for frame in (train, test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=131,
    random_state=1337,
    max_depth=15,
    min_samples_leaf=1,
    verbose=2)

In [None]:
%time _predictions = clf.predict(test_X)
%time _probas = clf.predict_proba(test_X)
predictions = pd.Series(data=_predictions, index=test_X.index, name='predicted_value')
cols = [
    u'probability_of_value_%s' % label
    for (_, label) in sorted([(int(target_map[label]), label) for label in target_map])
]
probabilities = pd.DataFrame(data=_probas, index=test_X.index, columns=cols)

# Build scored dataset
results_test = test_X.join(predictions, how='left')
results_test = results_test.join(probabilities, how='left')
results_test = results_test.join(test['__target__'], how='left')
results_test = results_test.rename(columns= {'__target__': 'HeartDisease'})

In [None]:
# Pair every input column with the importance reported by the classifier.
features = train_X.columns
feature_importances_data = [
    {'feature': feature_name, 'importance': feature_importance}
    for feature_name, feature_importance in zip(features, clf.feature_importances_)
]

# Plot the results: the 10 rows with the highest importance, as horizontal bars.
(
    pd.DataFrame(feature_importances_data)
    .set_index('feature')
    .sort_values(by='importance')
    .tail(10)
    .plot(
        title='Top 10 most important variables',
        kind='barh',
        figsize=(10, 6),
        color='#348ABD',
        alpha=0.6,
        lw='1',
        edgecolor='#348ABD',
        grid=False,
    )
)
#performance
from dataiku.doctor.utils.metrics import mroc_auc_score
test_Y_ser = pd.Series(test_Y)
print ('AUC value:', mroc_auc_score(test_Y_ser, _probas))