In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, log_loss
from yellowbrick.classifier import ROCAUC


from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

import logging
from datetime import datetime
from datetime import timedelta

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action="ignore", category=DeprecationWarning) 

import sweetviz as sv
import matplotlib.pyplot as plt
import json
import tqdm

import pickle

In [5]:
import mytransformer as myt

In [6]:
# Configure file logging for this notebook: one log file per calendar day
now = datetime.now()
filename = 'logs/08-finalize-model{}.log'.format(now.strftime("%m%d%Y"))

logging.basicConfig(
    filename=filename,
    format='%(asctime)s | %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Write a banner so that successive runs are easy to tell apart in the log
for banner_line in (
    '',
    '***************************************************************************************************',
    '***                                                                                             ***',
    '***   NEW RUN                                                                                   ***',
    '***                                                                                             ***',
    '***************************************************************************************************',
):
    logging.info(banner_line)



In [7]:
# folder = '../../data source/_anonymized/'
# declare some constants
folder = '_anonymized/'
sweetviz = True

# read the cleaned csv files
df = pd.read_csv(folder + 'dataset_4_modelling.csv')

# Drop the 'Unnamed: 0' column when the CSV has one. An explicit membership
# test replaces the former bare `except:`, which would also have hidden any
# unrelated error raised while dropping.
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)
else:
    logging.warning('Unnamed: 0 not existing in this file.')

logging.info('{} rows and {} columns read in the CSV file'.format(df.shape[0], df.shape[1]))
df.shape

(8902, 67)

In [8]:
# Feature plan: one dict per group of columns, giving the method to apply,
# the periods to use and the 'original' flag.
trans = [
    {'cols': ['NET_FLOWS', 'AUM'], 'method': 'shift', 'period': [2, 3, 4], 'original': False},
    {'cols': ['IN_FLOWS'], 'method': 'expanding', 'period': [2, 4], 'original': False},
    {'cols': ['BENCH_PERF_SLOPE'], 'method': 'rolling', 'period': [2, 4], 'original': False},
    {'cols': ['EVENT_IMPACT'], 'method': 'ewm', 'period': [2, 3], 'original': False},
]

trans

[{'cols': ['NET_FLOWS', 'AUM'],
  'method': 'shift',
  'period': [2, 3, 4],
  'original': False},
 {'cols': ['IN_FLOWS'],
  'method': 'expanding',
  'period': [2, 4],
  'original': False},
 {'cols': ['BENCH_PERF_SLOPE'],
  'method': 'rolling',
  'period': [2, 4],
  'original': False},
 {'cols': ['EVENT_IMPACT'],
  'method': 'ewm',
  'period': [2, 3],
  'original': False}]

In [9]:
class TS2SL(BaseEstimator, TransformerMixin):
    """Derive per-``SRC_UID`` lag / window features from selected columns.

    For every entry of ``transformation`` and every (column, period) pair in
    it, a new column named ``<col>_<tag>_<period>`` is computed separately for
    each ``SRC_UID`` value, missing values are interpolated within that
    ``SRC_UID``, and the source column is then dropped. ``TARGET`` is removed
    when present and any remaining NaN is replaced by 0.

    Parameters
    ----------
    cols : list of str, optional
        Columns selected from the input frame before any processing. Must
        contain ``'SRC_UID'`` and every column named in ``transformation``.
        ``None`` keeps all input columns.
    transformation : list of dict, optional
        Each dict provides ``'cols'`` (list of column names), ``'method'``
        (``'shift'``, ``'rolling'``, ``'expanding'`` or ``'ewm'``) and
        ``'period'`` (list of ints). Any other key (e.g. ``'original'``) is
        logged but not used: source columns are always dropped.
        ``None`` means no derived feature.

    Attributes
    ----------
    columns_ : pandas.Index
        Output column names/order recorded by ``fit`` and enforced by
        ``transform``.

    Notes
    -----
    Nothing in this class sorts the rows: shifts and windows follow the row
    order of the frame that is passed in.
    NOTE(review): ``SRC_UID`` itself is kept as an output column - confirm
    that feeding the identifier to the downstream estimator is intended.
    """

    # method name -> (tag used in the new column name, function(series, period))
    _METHODS = {
        'shift': ('s', lambda s, p: s.shift(periods=p)),
        'rolling': ('r', lambda s, p: s.rolling(p).mean()),
        'expanding': ('x', lambda s, p: s.expanding(min_periods=p).mean()),
        'ewm': ('e', lambda s, p: s.ewm(com=p).mean()),
    }

    def __init__(self, cols=None, transformation=None):

        self.cols=cols
        self.transformation=transformation

    def preprocess_f(self, X_df):
        """Return a new frame with the derived features; the input is not modified.

        Raises
        ------
        ValueError
            If a transformation names a method missing from ``_METHODS``.
        KeyError
            If ``'SRC_UID'`` or a configured column is missing from the frame.
        """
        # Work on a copy restricted to the configured columns
        X_df = X_df.copy() if self.cols is None else X_df[self.cols].copy()
        # Grouping key, captured once so that it survives later column drops
        uid = X_df['SRC_UID']

        for spec in (self.transformation or []):
            for i, (k, v) in enumerate(spec.items()):
                logging.info('index {} - dico value for key {} is {}'.format(i, k, v))

            method = spec['method']
            if method not in self._METHODS:
                # Previously an unknown method surfaced as a NameError, or
                # silently reused the tag of the preceding transformation.
                raise ValueError(
                    'Unknown method {!r}; expected one of {}'.format(method, sorted(self._METHODS))
                )
            tag, feature = self._METHODS[method]

            source_cols = list(spec['cols'])
            for col in source_cols:
                # One grouped pass per column instead of one boolean mask per SRC_UID
                per_uid = X_df[col].groupby(uid, sort=False)
                for p in spec['period']:
                    # Compute the feature, then fill the gaps (e.g. the first
                    # `p` rows of a shift) inside each SRC_UID only.
                    X_df[col + '_' + tag + '_' + str(p)] = per_uid.transform(
                        lambda s: feature(s, p).interpolate(limit_direction='both')
                    )

            # Source columns are replaced by their derived versions
            X_df = X_df.drop(source_cols, axis=1)

        # TARGET must never reach the estimator; tolerate frames without it
        X_df = X_df.drop('TARGET', axis=1, errors='ignore')

        print('return X_df', X_df.shape)
        # Groups too short to interpolate are left as NaN above -> 0
        return X_df.fillna(0)

    def fit(self, X_df, y=None):
        """Record the output column names/order produced for ``X_df``; returns ``self``."""
        # Check that we get a DataFrame (subclasses are accepted)
        assert isinstance(X_df, pd.DataFrame), 'TS2SL expects a pandas DataFrame'
        print('arriving in fit',X_df.shape)

        X_preprocessed = self.preprocess_f(X_df)

        # Save columns names/order for inference time
        self.columns_ = X_preprocessed.columns

        return self

    def transform(self, X_df):
        """Preprocess ``X_df`` and align its columns with those seen in ``fit``."""
        # Check that we get a DataFrame (subclasses are accepted)
        assert isinstance(X_df, pd.DataFrame), 'TS2SL expects a pandas DataFrame'

        # Preprocess data
        X_preprocessed = self.preprocess_f(X_df)

        # Make sure to have the same features: missing ones are added as 0,
        # unexpected ones are discarded
        X_reindexed = X_preprocessed.reindex(columns=self.columns_, fill_value=0)

        return X_reindexed

In [None]:

# Try the transformer on its own: fit on the selected columns, then
# transform the first 30 rows only.
c = [
    'SRC_UID', 'TARGET', 'IN_FLOWS', 'NET_FLOWS', 'OUT_FLOWS', 'AUM',
    'IS_STRATEGIC', 'BENCH_PERF_SLOPE', 'EVENT_IMPACT',
]
temp_df = df.loc[:, c].copy()
preprocessor = TS2SL(transformation=trans, cols=c)
preprocessor.fit(temp_df)
preprocessor.transform(temp_df.head(30))

In [10]:
c=['SRC_UID','TARGET','IN_FLOWS','NET_FLOWS','OUT_FLOWS','AUM','IS_STRATEGIC','BENCH_PERF_SLOPE','EVENT_IMPACT']


# Use our custom transformer in a pipeline
pipe = Pipeline([
    ('preprocessor', TS2SL(cols=c, transformation=trans)),
    ('Scaler', StandardScaler()),
    ('estimator', LogisticRegression())
])

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE

# Split data
# TARGET deliberately stays in X: TS2SL selects the columns listed in `c`
# (which includes TARGET) and drops TARGET itself before scaling. The former
# `X = df[c].drop('TARGET', axis=1)` was overwritten on the next line and
# has been removed.
X = df[c]
y = df.TARGET
# NOTE(review): this is a shuffled random split, while TS2SL computes
# shift/rolling features in row order - confirm this is intended.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# Evaluate estimator
pipe.fit(X_tr, y_tr)

pipe.score(X_te, y_te), X_te.shape, y_te.shape

arriving in fit (6231, 9)
return X_df (6231, 15)
return X_df (6231, 15)


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


return X_df (2671, 15)


  Xt = transform.transform(Xt)


(0.4672407338075627, (2671, 9), (2671,))

In [11]:
# Display the pipeline object; its repr lists the steps and their parameters
pipe

Pipeline(memory=None,
     steps=[('preprocessor', TS2SL(cols=['SRC_UID', 'TARGET', 'IN_FLOWS', 'NET_FLOWS', 'OUT_FLOWS', 'AUM', 'IS_STRATEGIC', 'BENCH_PERF_SLOPE', 'EVENT_IMPACT'],
   transformation=[{'cols': ['NET_FLOWS', 'AUM'], 'method': 'shift', 'period': [2, 3, 4], 'original': False}, {'cols': ['IN_FLOWS'], 'method': 'e...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [12]:
# Predict the class of the first three rows of the test split
pipe.predict(X_te.iloc[:3])

return X_df (3, 15)


  Xt = transform.transform(Xt)


array([ 1,  0, -1], dtype=int64)

In [13]:
# Show the first three rows of the test split (the rows predicted above)
X_te.head(3)

Unnamed: 0,SRC_UID,TARGET,IN_FLOWS,NET_FLOWS,OUT_FLOWS,AUM,IS_STRATEGIC,BENCH_PERF_SLOPE,EVENT_IMPACT
8090,189669396589741,0,348112,246030,-102082,148494034,0,0.001688,2.496147
2554,323418458,1,26382634,12139507,-14243127,604148860,0,-0.004198,32.643325
4753,108202450315,1,177749960,97903022,-79846938,843190921,0,0.000813,7.976262


In [14]:
# Column groups, chosen from business experience
cols_flows = ['IN_FLOWS', 'OUT_FLOWS','AUM']
cols_perfs = [
    'BENCH_PERF_SLOPE', 'BENCH_PERF_SLOPE_ERROR',
    'BENCH_VOL_SLOPE', 'BENCH_VOL_SLOPE_ERROR',
    'NAV_PERF_SLOPE', 'NAV_PERF_SLOPE_ERROR',
    'NAV_VOL_SLOPE', 'NAV_VOL_SLOPE_ERROR',
]
cols_maktg = ['POSITIVE_RFP', 'EVENT_IMPACT', 'DOWNLOADED_DOCUMENTS','PSU_SCORE']
cols_funds = ['IS_STRATEGIC', 'ADMINSTRATION_FEES', 'MANAGEMENT_FEES','RISK_LEVEL_VALUE','STRATEGY_CAPABILITY','FUND_AGE']

all_cols = cols_flows + cols_perfs + cols_maktg + cols_funds + ['TARGET', 'FLOWS_YEAR', 'FLOWS_MONTH', 'SRC_UID']

# Transformer parameter: one single-column entry per column, where the method
# and the periods depend on the group the column belongs to.
# cols_funds gets no entry.
trans = [
    {'cols': [name], 'method': method, 'period': list(periods), 'original': False}
    for group, method, periods in (
        (cols_flows, 'rolling', (2, 3, 6)),
        (cols_perfs, 'shift', (2, 3, 5)),
        (cols_maktg, 'expanding', (3, 6, 9)),
    )
    for name in group
]

In [15]:
c=all_cols
# Use our custom transformer in a pipeline
pipe = Pipeline([
    ('preprocessor', TS2SL(cols=c, transformation=trans)),
    ('Scaler', StandardScaler()), ('Power', PowerTransformer()),
    ('estimator', LogisticRegression(C=100,solver='lbfgs',class_weight='balanced', max_iter=1000))
])



# Split data
# TARGET deliberately stays in X: TS2SL selects the columns listed in `c`
# (which includes TARGET) and drops TARGET itself before scaling. The former
# `X = df[c].drop('TARGET', axis=1)` was overwritten on the next line and
# has been removed.
X = df[c]
y = df.TARGET
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# Evaluate estimator
pipe.fit(X_tr, y_tr)

pipe.score(X_te, y_te), X_te.shape, y_te.shape

arriving in fit (6231, 25)
return X_df (6231, 54)
return X_df (6231, 54)


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


return X_df (2671, 54)


  Xt = transform.transform(Xt)


(0.6098839385997754, (2671, 25), (2671,))

In [18]:
# save the model to disk
filename = 'models/finalized_model.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# the former bare open() left the handle to the garbage collector.
# NOTE: pickle stores the pipeline's classes by reference, so TS2SL must be
# importable/defined under the same name when the file is loaded.
with open(filename, 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Hyper-parameter grid for the LogisticRegression step of `pipe`.
# Keys must be prefixed with the pipeline step name, which is 'estimator';
# the former 'LR__' prefix matches no step and makes GridSearchCV fail with
# an invalid-parameter error.
params = {
    'estimator__C':[100,1000,10000],
    'estimator__solver':['newton-cg','liblinear','lbfgs'],#
    'estimator__class_weight':[None,'balanced']
}

In [None]:
from tqdm import tqdm_notebook as tqdm_final
# Run one grid search per number of cross-validation folds
for cv in tqdm_final([2,5,10]):
    grid = GridSearchCV(pipe, param_grid = params,  cv=cv, verbose=1)
    # X_train / y_train / X_validation / y_validation are never defined in
    # this notebook; the split produced by train_test_split is
    # X_tr / y_tr / X_te / y_te.
    grid.fit(X_tr, y_tr)
    print(cv, grid.score(X_te, y_te))
    print(grid.best_params_)

# `grid` refers to the last search of the loop (cv=10), so this is the best
# estimator of that search only, not the best across all cv values.
best_estimator = grid.best_estimator_

In [None]:
# Display the pipeline object; its repr lists the steps and their parameters
pipe