In [2]:
import os
import pandas as pd
import numpy as np
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
import time
import itertools
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from functools import partial

In [73]:
data = pd.read_csv("/home/puneetj/CLA_Macro/MacroPy/data/german_data.csv")
del data["Unnamed: 0"]
data.head()
#data["am"].unique()

Unnamed: 0,ca_status,duration,credit_history,purpose,credit_amount,savings,present_employment_since,installment_rate_income,status_sex,other_debtors,...,age,other_installment,housing,existing_credits,job,liable_maintenance_people,telephone,foreign_worker,gb,tar
0,(;0DM),6,critical,radio/television,1169,unknown / no savings account,<7;),4,male:single,none,...,67,none,own,2,skilled/off.,1,yes,yes,good,1
1,<0DM;200DM),48,all paid,radio/television,5951,(;100DM),<1;4),2,female:div./sep./marr.,none,...,22,none,own,1,skilled/off.,1,none,yes,bad,0
2,No Acc.,12,critical,education,2096,(;100DM),<4;7),2,male:single,none,...,49,none,own,1,unsk. res.,2,none,yes,good,1
3,(;0DM),42,all paid,furniture/equipment,7882,(;100DM),<4;7),2,male:single,guarantor,...,45,none,for free,1,skilled/off.,2,none,yes,good,1
4,(;0DM),24,delay,car (new),4870,(;100DM),<1;4),3,male:single,none,...,53,none,for free,2,skilled/off.,2,none,yes,bad,0


In [71]:
class Logger(object):
    def __init__(self):
        pass

    def info(self, msg):
        print 'INFO: {0}'.format(msg)


class VariableSelector():
    #Currently only support's H2o and LightGBM for generating Feature Importance
    #either input the pandas dataframe using dataset or specify CSV location
    #the Specify addtional categories  are the columns which need to be treated as Category
    def __init__(self, Algorithm = "LightGBM",dataset = "", Input_Dir = "", Target ="",
                 Exclude_columns = "", Num_features = 20, preprocess = True,log = None,
                 spcfy_cat = []):
        self.Algorithm = Algorithm
        self.dataset = dataset
        self.data = self.load_data(dataset,Input_Dir)
        self.Input_Dir = Input_Dir
        self.log = self.setup_log(log)
        self.log.info('Run initiated.')
        self.Target =Target
        self.Exclude_columns = Exclude_columns
        self.spcfy_cat = spcfy_cat
        self.cols_to_use = self.get_cols_to_use(self.data,Target,Exclude_columns)
        self.Cat_columns = self.get_Cat_columns(self.data,Target,spcfy_cat,self.cols_to_use)
        self.variable_importance  = self.get_variable_importance(Algorithm,self.data,Target,Input_Dir,
                                                                 Exclude_columns,preprocess,self.cols_to_use,
                                                                 spcfy_cat,self.Cat_columns)
        
    def get_cols_to_use(self,data,Target,Exclude_columns):
        if isinstance(Exclude_columns, str):
            Exclude_columns = list(Exclude_columns)  
        return list(set(data.columns)-set([Target])- set(Exclude_columns))
    
    def get_Cat_columns(self,data,Target,spcfy_cat,cols_to_use):
        return list(set(data[cols_to_use].select_dtypes(include=['object']).columns)
                                         .union(set(spcfy_cat)))
    
    
    def get_variable_importance(self,Algorithm,dataset,Target,Input_Dir,
                                Exclude_columns,preprocess,cols_to_use,
                                spcfy_cat,Cat_columns):
        data = self.load_data(dataset,Input_Dir)
        self.log.info('Dataset Loaded')   
        data = self.preprocess_data(preprocess,self.Algorithm,data,spcfy_cat,self.Cat_columns) 
        self.data = data
        return self.build_model(data,Algorithm,Target,cols_to_use,Cat_columns)

        
    def build_model(self,data,Algorithm,Target,cols_to_use,Cat_columns):
        if Algorithm in ["H2o"]:
            model = H2ORandomForestEstimator(model_id="Random_forest_FI",
                                            ntrees=400,
                                            stopping_rounds=2,
                                            score_each_iteration=True,
                                            seed=1000000)
            model.train(cols_to_use,y= Target, training_frame=data)
            self.log.info('Model Build')  
            return model._model_json['output']['variable_importances'].as_data_frame()
        else:
            train_data = lgb.Dataset( data[cols_to_use],data[Target] ,feature_name = cols_to_use,
                                     categorical_feature=Cat_columns)
            train_data.set_categorical_feature(Cat_columns)
            self.log.info(train_data) 
            params = {
                            'task': 'train',
                            'boosting_type': 'gbdt',
                            'objective': 'regression',
                            #'metric': {'l2', 'auc'},
                            #'num_leaves': 31,
                            'learning_rate': 0.1,
                            'feature_fraction': 0.9,
                            'bagging_fraction': 0.8,
                            'bagging_freq': 5,
                            'verbose': 0,
                           # 'categorical_feature' : ['name:' + str(col) for col in Cat_columns]
                        }
            if len(data[Target].unique()) == 2:
                params["objective"] = 'binary'
            elif len(data[Target].unique()) < 10 :
                params["objective"] = 'multiclass'
            model = lgb.train(params,
                            train_data,
                            num_boost_round=200,
                           # valid_sets=lgb_eval,
                           # early_stopping_rounds=25
                             )
            self.log.info(params) 
            self.log.info('Model Build')  
            importances = model.feature_importance()
            #print (importances.shape)
            imp_list = []
            for row in zip(data[cols_to_use].columns, map(lambda x:round(x,4), importances)):
                imp_list.append(row)
            return (pd.DataFrame(imp_list, columns=['Column', 'Importance'])).sort_values(['Importance'], ascending = False)

        
        
        
    def setup_log(self, log):
        if log is None:
            log = Logger()
        return log
    
    def load_data(self,data,Input_Dir):
        if isinstance(data, pd.DataFrame):
            return data
        else:
            return pd.read_csv(Input_Dir)
    
    
        
    def preprocess_data(self,preprocess,Algorithm,data,spcfy_cat,Cat_columns):
        if preprocess == False:
            self.log.info("No Preprocessing Done")
        elif  Algorithm in ["H2o"]:
            h2o.init()
            h2o.remove_all() 
            self.log.info('Preprocessed Data h2o frame') 
            data = h2o.H2OFrame(data)
            for col in spcfy_cat:
                data[col] = data[col].asfactor()

            return data
              
        else:
            for c in Cat_columns:
                lbl = LabelEncoder()
                lbl.fit(list(data[c].values) )
                data[c] = lbl.transform(list(data[c].values))
                self.log.info('Preprocessed Data pd frame')
            return data
        
            
            
            
            
        

In [74]:
#data.dtypes
m = VariableSelector(Input_Dir = "/home/puneetj/CLA_Macro/MacroPy/data/german_data.csv",
                     Algorithm = "LightGBM",
                    # dataset = data,
                     Target ="tar",
                     preprocess = True,
                 #    Exclude_columns = ["status_sex"],
                  #  spcfy_cat = ["installment_rate_income"]
                    )
m.variable_importance
#h2o.init()
#h2o.shutdown(prompt=False)

INFO: Run initiated.
INFO: Dataset Loaded
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: Preprocessed Data pd frame
INFO: <lightgbm.basic.Dataset object at 0x7f1fb5119350>
INFO: {'categorical_column': [], 'task': 'train', 'verbose': 0, 'max_bin': 255, 'objective': 'binary', 'bagging_freq': 5, 'learning_rate': 0.1, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'boosting_type': 'gbdt'}
INFO: Model Build


Unnamed: 0,Column,Importance
4,duration,406.0
17,age,303.0
8,Unnamed: 0,294.0
19,ca_status,275.0
14,purpose,270.0
18,credit_amount,267.0
11,credit_history,225.0
12,present_employment_since,123.0
3,gb,119.0
21,property,105.0


In [135]:
param = {"model_id":"Random_forest_FI",
         "ntrees":400,
         "stopping_rounds":2,
         "score_each_iteration":True,
         "seed"1000000}

In [175]:
spcfy_cat

Unnamed: 0.1,Unnamed: 0,ca_status,duration,credit_history,purpose,credit_amount,savings,present_employment_since,installment_rate_income,status_sex,...,age,other_installment,housing,existing_credits,job,liable_maintenance_people,telephone,foreign_worker,gb,tar
0,1,(;0DM),6,critical,radio/television,1169,unknown / no savings account,<7;),4,male:single,...,67,none,own,2,skilled/off.,1,yes,yes,good,1
1,2,<0DM;200DM),48,all paid,radio/television,5951,(;100DM),<1;4),2,female:div./sep./marr.,...,22,none,own,1,skilled/off.,1,none,yes,bad,0
2,3,No Acc.,12,critical,education,2096,(;100DM),<4;7),2,male:single,...,49,none,own,1,unsk. res.,2,none,yes,good,1
3,4,(;0DM),42,all paid,furniture/equipment,7882,(;100DM),<4;7),2,male:single,...,45,none,for free,1,skilled/off.,2,none,yes,good,1
4,5,(;0DM),24,delay,car (new),4870,(;100DM),<1;4),3,male:single,...,53,none,for free,2,skilled/off.,2,none,yes,bad,0
5,6,No Acc.,36,all paid,education,9055,unknown / no savings account,<1;4),2,male:single,...,35,none,for free,1,unsk. res.,2,yes,yes,good,1
6,7,No Acc.,24,all paid,furniture/equipment,2835,<500;1000),<7;),3,male:single,...,53,none,own,1,skilled/off.,1,none,yes,good,1
7,8,<0DM;200DM),36,all paid,car (used),6948,(;100DM),<1;4),2,male:single,...,35,none,rent,1,"mng/self emp, hig qual.",1,yes,yes,good,1
8,9,No Acc.,12,all paid,radio/television,3059,<1000;),<4;7),2,male:div./sep.,...,61,none,own,1,unsk. res.,1,none,yes,good,1
9,10,<0DM;200DM),30,critical,car (new),5234,(;100DM),unemployed,4,male:marr/wid.,...,28,none,own,2,"mng/self emp, hig qual.",1,none,yes,bad,0


In [33]:
for c in list(data.select_dtypes(include=['object']).columns):
    if c != 'ID':
        lbl = LabelEncoder()
        lbl.fit(list(data[c].values) )
        data[c] = lbl.transform(list(data[c].values))

In [44]:
cols_to_use =list(set(data.columns)- set(["tar"]))
start = time.time()
model_chk_r2.fit(data[cols_to_use], data.tar)
print ('Time taken: ', time.time() - start)

('Time taken: ', 4.455770969390869)


In [45]:
model_chk_r2.feature_importances_

array([ 0.00469475,  0.00420092,  0.00872196,  0.05263122,  0.01032276,
        0.01173249,  0.00640955,  0.0338489 ,  0.00279182,  0.0039686 ,
        0.01499422,  0.0068513 ,  0.00113663,  0.00758774,  0.72868679,
        0.02868227,  0.00734507,  0.00752421,  0.02093757,  0.0232123 ,
        0.01371892])

In [49]:
# model_gbm1.feature_importances_
importances = model_chk_r2.feature_importances_
print (importances.shape)
imp_list = []
for row in zip(data[cols_to_use].columns, map(lambda x:round(x,4), importances)):
#     print (type(row))
    imp_list.append(row)

print (len(imp_list))

imp_df = pd.DataFrame(imp_list, columns=['Column', 'Importance'])
#imp_df.to_csv("/home/puneetj/CLA_Macro/MacroPy/data/feature_importance.csv", index=False)
imp_df.sort_values(["Importance"],ascending = False)

(21,)
21


Unnamed: 0,Column,Importance
14,gb,0.7287
3,ca_status,0.0526
7,credit_amount,0.0338
15,duration,0.0287
19,age,0.0232
18,credit_history,0.0209
10,savings,0.015
20,purpose,0.0137
5,present_employment_since,0.0117
4,property,0.0103


In [1]:
             #specify max number of bytes. uses all cores by default.
#h2o.remove_all()   

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,72 days 11 hours 27 mins
H2O cluster version:,3.10.4.6
H2O cluster version age:,3 months and 12 days !!!
H2O cluster name:,H2O_started_from_R_govindm_pgr082
H2O cluster total nodes:,1
H2O cluster free memory:,25.67 Gb
H2O cluster total cores:,24
H2O cluster allowed cores:,24
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [134]:
data2 = h2o.H2OFrame(data) 

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
data2.describe()

Rows:1000
Cols:22




Unnamed: 0,ca_status,duration,credit_history,purpose,credit_amount,savings,present_employment_since,installment_rate_income,status_sex,other_debtors,present_residence_since,property,age,other_installment,housing,existing_credits,job,liable_maintenance_people,telephone,foreign_worker,gb,tar
type,enum,int,enum,enum,int,enum,enum,int,enum,enum,int,enum,int,enum,enum,int,enum,int,enum,enum,enum,int
mins,,4.0,,,250.0,,,1.0,,,1.0,,19.0,,,1.0,,1.0,,,,0.0
mean,,20.903,,,3271.258,,,2.973,,,2.845,,35.546,,,1.407,,1.155,,,,0.7
maxs,,72.0,,,18424.0,,,4.0,,,4.0,,75.0,,,4.0,,2.0,,,,1.0
sigma,,12.0588144528,,,2822.73687596,,,1.11871467431,,,1.10371789566,,11.3754685743,,,0.577654468246,,0.362085771753,,,,0.45848687027
zeros,,0,,,0,,,0,,,0,,0,,,0,,0,,,,300
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,(;0DM),6.0,critical,radio/television,1169.0,unknown / no savings account,<7;),4.0,male:single,none,4.0,real estate,67.0,none,own,2.0,skilled/off.,1.0,yes,yes,good,1.0
1,<0DM;200DM),48.0,all paid,radio/television,5951.0,(;100DM),<1;4),2.0,female:div./sep./marr.,none,2.0,real estate,22.0,none,own,1.0,skilled/off.,1.0,none,yes,bad,0.0
2,No Acc.,12.0,critical,education,2096.0,(;100DM),<4;7),2.0,male:single,none,3.0,real estate,49.0,none,own,1.0,unsk. res.,2.0,none,yes,good,1.0


In [136]:
rf_v1 = H2ORandomForestEstimator(
    model_id="Random_forest_FI",
    ntrees=400,
    stopping_rounds=2,
    score_each_iteration=True,
    seed=1000000)

TypeError: __init__() takes exactly 1 argument (2 given)

In [19]:
cols_to_use =list(set(data.columns)- set(["tar"]))
start = time.time()
rf_v1.train(cols_to_use,y= "tar", training_frame=data2)
#model_chk_r2.fit(data[cols_to_use], data.tar)
print ('Time taken: ', time.time() - start)
#rf_v1.train(covtype_X, covtype_y, training_frame=train, validation_frame=valid)

drf Model Build progress: |███████████████████████████████████████████████| 100%
('Time taken: ', 0.4937901496887207)


In [20]:
rf_v1

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  Random_forest_FI


ModelMetricsRegression: drf
** Reported on train data. **

MSE: 0.00321748398688
RMSE: 0.0567228700515
MAE: 0.0138320860543
RMSLE: 0.0403112388629
Mean Residual Deviance: 0.00321748398688
Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2017-08-08 15:44:03,0.001 sec,0.0,,,
,2017-08-08 15:44:04,0.247 sec,1.0,0.1486588,0.0220994,0.0220994
,2017-08-08 15:44:04,0.276 sec,2.0,0.1370797,0.0212418,0.0187908
,2017-08-08 15:44:04,0.288 sec,3.0,0.1090126,0.0147903,0.0118837
,2017-08-08 15:44:04,0.299 sec,4.0,0.0861296,0.0119608,0.0074183
,2017-08-08 15:44:04,0.308 sec,5.0,0.0974126,0.0172921,0.0094892
,2017-08-08 15:44:04,0.312 sec,6.0,0.0815282,0.0139688,0.0066468
,2017-08-08 15:44:04,0.316 sec,7.0,0.0708558,0.0120313,0.0050205
,2017-08-08 15:44:04,0.324 sec,8.0,0.0656890,0.0109922,0.0043150


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
gb,1861.5407715,1.0,0.8291443
ca_status,188.5647430,0.1012950,0.0839882
duration,31.7272720,0.0170436,0.0141316
savings,31.1741180,0.0167464,0.0138852
credit_amount,22.4073524,0.0120370,0.0099804
---,---,---,---
existing_credits,2.3014801,0.0012363,0.0010251
housing,1.9251394,0.0010342,0.0008575
liable_maintenance_people,1.0450462,0.0005614,0.0004655



See the whole table with table.as_data_frame()




In [24]:
Variable_importance = rf_v1._model_json['output']['variable_importances'].as_data_frame()


Model Summary: 


0,1,2,3,4,5,6,7,8,9
,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,14.0,14.0,12707.0,1.0,11.0,6.0,2.0,39.0,17.285715


Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2017-08-08 15:44:03,0.001 sec,0.0,,,
,2017-08-08 15:44:04,0.247 sec,1.0,0.1486588,0.0220994,0.0220994
,2017-08-08 15:44:04,0.276 sec,2.0,0.1370797,0.0212418,0.0187908
,2017-08-08 15:44:04,0.288 sec,3.0,0.1090126,0.0147903,0.0118837
,2017-08-08 15:44:04,0.299 sec,4.0,0.0861296,0.0119608,0.0074183
,2017-08-08 15:44:04,0.308 sec,5.0,0.0974126,0.0172921,0.0094892
,2017-08-08 15:44:04,0.312 sec,6.0,0.0815282,0.0139688,0.0066468
,2017-08-08 15:44:04,0.316 sec,7.0,0.0708558,0.0120313,0.0050205
,2017-08-08 15:44:04,0.324 sec,8.0,0.0656890,0.0109922,0.0043150



ModelMetricsRegression: drf
** Reported on train data. **

MSE: 0.00321748398688
RMSE: 0.0567228700515
MAE: 0.0138320860543
RMSLE: 0.0403112388629
Mean Residual Deviance: 0.00321748398688
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
gb,1861.5407715,1.0,0.8291443
ca_status,188.5647430,0.1012950,0.0839882
duration,31.7272720,0.0170436,0.0141316
savings,31.1741180,0.0167464,0.0138852
credit_amount,22.4073524,0.0120370,0.0099804
---,---,---,---
existing_credits,2.3014801,0.0012363,0.0010251
housing,1.9251394,0.0010342,0.0008575
liable_maintenance_people,1.0450462,0.0005614,0.0004655



See the whole table with table.as_data_frame()


{u'__meta': {u'schema_name': u'DRFModelOutputV3',
  u'schema_type': u'DRFOutput',
  u'schema_version': 3},
 u'cross_validation_fold_assignment_frame_id': None,
 u'cross_validation_holdout_predictions_frame_id': None,
 u'cross_validation_metrics': None,
 u'cross_validation_metrics_summary': None,
 u'cross_validation_models': None,
 u'cross_validation_predictions': None,
 u'domains': [[u'(;0DM)', u'<0DM;200DM)', u'<200DM;)', u'No Acc.'],
  None,
  [u'all paid', u'critical', u'delay', u'no credits', u'paid off'],
  [u'business',
   u'car (new)',
   u'car (used)',
   u'domestic appliances',
   u'education',
   u'furniture/equipment',
   u'others',
   u'radio/television',
   u'repairs',
   u'retraining'],
  None,
  [u'(;100DM)',
   u'<1000;)',
   u'<100;500)',
   u'<500;1000)',
   u'unknown / no savings account'],
  [u'(;1)', u'<1;4)', u'<4;7)', u'<7;)', u'unemployed'],
  None,
  [u'female:div./sep./marr.',
   u'male:div./sep.',
   u'male:marr/wid.',
   u'male:single'],
  [u'co-applicant', 