# In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np
import yaml 

# In [2]:
class MLFlow:
    """Step-by-step helper for cleaning a DataFrame and fitting an XGBoost regressor.

    The methods mutate ``self.data`` in place and exchange configuration with
    the user through YAML files in the current working directory
    (``columns.yaml`` / ``columns_final.yaml`` / ``selected_features.yaml``).
    ``read_columns_yaml`` must run before any method that relies on
    ``categorical_columns``, ``numerical_columns`` or ``target_column``.
    """

    def __init__(self, df):
        """Store *df* as the working DataFrame (not copied)."""
        self.data = df

    @staticmethod
    def _show(obj):
        """Render *obj* with IPython's ``display`` when available, else ``print`` it."""
        try:
            from IPython.display import display
        except ImportError:
            print(obj)
        else:
            display(obj)

    @staticmethod
    def _load_selected(path):
        """Return the keys of the YAML mapping at *path* whose value is ``'y'``."""
        with open(path) as handle:
            # safe_load: the file is plain key/value data and yaml.load()
            # without an explicit Loader is rejected by PyYAML >= 6.
            mapping = yaml.safe_load(handle) or {}
        return [key for key, flag in mapping.items() if flag == 'y']

    def remove_garbage_values(self, make_negatives_null_columns):
        """Turn the literal string ``'nan'`` and negative values into NaN.

        ``make_negatives_null_columns`` lists the columns in which every
        value below zero is replaced by NaN.
        """
        self.data = self.data.replace(['nan'], np.nan)
        for col in make_negatives_null_columns:
            # NaN < 0 is False, so values that are already missing stay missing.
            self.data[col] = self.data[col].mask(self.data[col] < 0)

    def generate_columns_yaml(self):
        """Write a per-column summary to ``columns.yaml`` and ``fill_rate.xlsx``.

        Each column gets up to five random sample values, its fill rate
        (whole percent of non-null rows), its dtype (as both ``orig_dtype``
        and ``recast_dtype``) and a default ``tag`` of ``'n'``.
        """
        total_rows = len(self.data)
        # sample() raises when asked for more rows than exist.
        n_samples = min(5, total_rows)
        cols_dict = {}
        for col in self.data.columns.tolist():
            series = self.data[col]
            five_values = str(series.sample(n_samples).tolist())
            dtype = str(series.dtype)
            # Integer arithmetic avoids float truncation (29/100*100 -> 28).
            percent = int(series.count()) * 100 // total_rows if total_rows else 0
            fill_rate = str(percent) + ' %'
            cols_dict[col] = {'5_sample_values': five_values, 'fill_rate': fill_rate,
                              'orig_dtype': dtype, 'recast_dtype': dtype, 'tag': 'n'}
        # Write once, after every column has been summarised.
        with open('columns.yaml', 'w') as handle:
            yaml.dump(cols_dict, handle, default_flow_style=False)
        pd.DataFrame(cols_dict).transpose().to_excel('fill_rate.xlsx')

    def read_columns_yaml(self):
        """Apply the configuration in ``columns_final.yaml`` to ``self.data``.

        Columns are classified by their ``tag``: ``'n'`` numerical, ``'c'``
        categorical, ``'t'`` target (the last one tagged wins); any other tag
        drops the column. Columns whose ``recast_dtype`` differs from
        ``orig_dtype`` are cast on a best-effort basis. The selected feature
        names are written to ``selected_features.yaml`` and ``self.data`` is
        reduced to the selected features plus the target.
        """
        with open('columns_final.yaml') as handle:
            columns_final = yaml.safe_load(handle)
        print(pd.DataFrame(columns_final).loc['tag', :].value_counts())

        self.categorical_columns = []
        self.numerical_columns = []
        self.target_column = ''
        for column, spec in columns_final.items():
            tag = spec['tag']
            if tag == 'n':
                self.numerical_columns.append(column)
            elif tag == 'c':
                self.categorical_columns.append(column)
            elif tag == 't':
                self.target_column = column

        for column, spec in columns_final.items():
            if spec['orig_dtype'] == spec['recast_dtype']:
                continue
            try:
                self.data[column] = self.data[column].astype(spec['recast_dtype'])
            except (TypeError, ValueError, OverflowError) as err:
                # Best effort: report the failure and keep the original dtype.
                print('ERROR converting ', column, err)
            else:
                print(column, ' converted from', spec['orig_dtype'], 'to', spec['recast_dtype'])

        selected_cols = self.categorical_columns + self.numerical_columns
        selected_cols_dict = {col: 'y' for col in selected_cols}
        with open('selected_features.yaml', 'w') as handle:
            yaml.dump(selected_cols_dict, handle, default_flow_style=False)

        selected_cols.append(self.target_column)
        self.data = self.data.loc[:, selected_cols]
        self.data = self.data.replace(['nan'], np.nan)
        print('Target Column:', self.target_column)
        print('Categorical Columns:', self.categorical_columns)
        print('Numerical Columns:', self.numerical_columns)

    def impute_missing(self):
        """Drop rows with a missing target, then fill the remaining gaps.

        Numerical columns are filled with their median (computed after the
        rows are dropped); categorical columns with the literal ``'<Missing'``.
        """
        print('Data Size before dropping missing Target Variable', self.data.shape[0])
        self.data = self.data.loc[self.data[self.target_column].notnull(), :]
        print('Data Size after dropping missing Target Variable', self.data.shape[0])

        impute_values = {}
        for col in self.numerical_columns:
            try:
                impute_values[col] = self.data[col].median()
            except (TypeError, ValueError) as err:
                # A non-numeric column tagged as numerical has no median;
                # leave it unfilled but say so instead of hiding it.
                print('Skipping median imputation for', col, ':', err)
        for col in self.categorical_columns:
            # NOTE(review): '<Missing' looks like a truncated '<Missing>';
            # kept as-is because downstream data may already contain it.
            impute_values[col] = '<Missing'
        self.data = self.data.fillna(impute_values)

    def cap_outliers(self, outlier_cols):
        """Placeholder: outlier capping is not implemented; does nothing."""
        pass

    def print_categorical(self):
        """Print the value counts (NaN included) of every categorical column."""
        for col in self.categorical_columns:
            print(self.data[col].value_counts(dropna=False))

    def club_rare(self, min_count=10):
        """Replace categorical levels seen fewer than *min_count* times with ``'RARE'``."""
        for col in self.categorical_columns:
            counts = self.data[col].value_counts()
            rare_levels = counts[counts < min_count].index.tolist()
            self.data.loc[self.data[col].isin(rare_levels), col] = 'RARE'

    def de_one_hot_encode(self, s):
        """Map a dummy-variable name back to the categorical column it came from.

        ``pd.get_dummies`` names dummies ``<column>_<level>``, so *s* is
        attributed to a categorical column when it equals the column name or
        starts with ``<column>_``. When several columns match, the longest
        name wins. Names that match no categorical column are returned
        unchanged.
        """
        matches = [col for col in self.categorical_columns
                   if s == col or s.startswith(col + '_')]
        if matches:
            return max(matches, key=len)
        return s

    def derive_features(self):
        """Replace ``self.data`` with ``derived_features.derive_features(self.data)``."""
        import derived_features
        self.data = derived_features.derive_features(self.data)

    def xgb_reg_single(self, params, selected_features_file, segment, filter_col, thresh,
                       selected_derived_features_file):
        """Cross-validate and fit an XGBoost regressor, reporting diagnostics.

        Prints 5-fold train/test R2 and RMSE, shows a decile table of actual
        vs. out-of-fold predicted values and the quantiles of the absolute
        percentage error, then refits on all rows and writes aggregated
        feature importances to ``feature_importances.xlsx``.

        ``params`` must provide ``n_estimators``, ``max_depth``, ``shrinkage``,
        ``colsample_bynode`` and ``n_jobs``. The two ``*_file`` arguments are
        YAML mappings of feature name to ``'y'``/other; features flagged
        ``'y'`` in either file are used. ``segment``, ``filter_col`` and
        ``thresh`` are accepted but currently unused.
        """
        from sklearn.model_selection import cross_val_predict
        from sklearn.model_selection import cross_validate
        from xgboost.sklearn import XGBRegressor

        # XGBoost has no 'shrinkage' argument (it would be ignored); the
        # step-size shrinkage is exposed as learning_rate.
        reg = XGBRegressor(n_estimators=params['n_estimators'],
                           max_depth=params['max_depth'],
                           learning_rate=params['shrinkage'],
                           colsample_bynode=params['colsample_bynode'],
                           n_jobs=params['n_jobs'])

        selected_features = self._load_selected(selected_features_file)
        selected_features.extend(self._load_selected(selected_derived_features_file))

        # .loc[:, cols] selects columns; .loc[cols] would look the names up as row labels.
        X = pd.get_dummies(self.data.loc[:, selected_features])
        variables = X.columns.tolist()
        print('DATA SIZE: ', X.shape[0])
        # Recent pandas emits boolean dummies; force one numeric dtype so the
        # matrix is not an object array.
        X = X.values.astype(float)
        # NOTE(review): the target is hard-coded rather than self.target_column
        # - confirm this is intended.
        y = self.data['cur_liab_amt'].values

        scores = cross_validate(reg, X, y, cv=5,
                                scoring=('r2', 'neg_mean_squared_error'),
                                return_train_score=True)
        print('Train R2 Scores:', scores['train_r2'])
        print('Test R2 Scores:', scores['test_r2'])
        print('Train R2 Scores Mean:', np.mean(scores['train_r2']))
        print('Test R2 Scores Mean:', np.mean(scores['test_r2']))
        print('Train RMSE Scores:', np.sqrt(-1 * scores['train_neg_mean_squared_error']))
        print('Test RMSE Scores:', np.sqrt(-1 * scores['test_neg_mean_squared_error']))
        print('Train RMSE Scores Mean:', np.sqrt(-1 * np.mean(scores['train_neg_mean_squared_error'])))
        print('Test RMSE Scores Mean:', np.sqrt(-1 * np.mean(scores['test_neg_mean_squared_error'])))
        y_pred = cross_val_predict(reg, X, y, cv=5)

        # Decile table: mean actual vs. mean predicted per decile of the actual value.
        df_train = pd.DataFrame({'actual': y, 'predicted': y_pred})
        df_train['actual_deciles'] = pd.qcut(df_train['actual'], 10, labels=(np.arange(10, 0, -1)))
        df1 = df_train.groupby('actual_deciles').agg(['mean', 'count'])[['actual', 'predicted']]
        df1 = df1.sort_values('actual_deciles', ascending=False)
        df1['count'] = df1[('actual', 'count')]
        df1 = df1.drop([('actual', 'count'), ('predicted', 'count')], axis=1)
        df1 = df1.reset_index()
        df1.columns = df1.columns.droplevel(1)
        df1.columns = ['Decile', 'Mean Actual', 'Mean Predicted', '# Obs']
        self._show(df1)

        # Quantiles of the absolute percentage error (infinite where y == 0).
        error = pd.Series((np.abs(y - y_pred) * 100 / np.abs(y)))
        error_dist = {}
        for q in [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1]:
            error_dist[q] = error.quantile(q)
        self._show(pd.Series(error_dist).sort_index())

        reg.fit(X, y)
        features = pd.DataFrame({'Importance': reg.feature_importances_, 'Variable': variables})
        # Sum dummy-level importances back onto their source column.
        features['Variable'] = features['Variable'].apply(self.de_one_hot_encode)
        features = features.groupby('Variable').sum().reset_index()
        features = features.sort_values('Importance', ascending=False)
        features = features[['Importance', 'Variable']]
        self._show(features)
        features.to_excel('feature_importances.xlsx')
        
        