In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, log_loss
from yellowbrick.classifier import ROCAUC


from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

import logging
from datetime import datetime
from datetime import timedelta

# Silence noisy warning categories so the notebook output stays readable.
from warnings import simplefilter

from sklearn.exceptions import DataConversionWarning

try:
    # Public location of the warning class (available from pandas 1.5 on).
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    # Older pandas releases only expose it from this private module.
    from pandas.core.common import SettingWithCopyWarning

# Each call ignores one warning category for the rest of the session.
simplefilter(action='ignore', category=DataConversionWarning)
simplefilter(action='ignore', category=SettingWithCopyWarning)
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

import sweetviz as sv
import matplotlib.pyplot as plt
import json
import tqdm
from tqdm import tqdm_notebook as tqdm_final

import pickle

In [2]:
import mytransformer as myt
from mytransformer import TS2SL

In [3]:
### Load the data
# Alternative location used previously: '../../data source/_anonymized/'
# Declare some constants.
folder = '_anonymized/'
# NOTE(review): this flag is not read by any cell visible here — presumably it
# toggles a sweetviz report elsewhere; confirm before removing.
sweetviz = True

# Read the cleaned CSV file.
df = pd.read_csv(folder + 'dataset_4_modelling.csv')

# Drop the 'Unnamed: 0' column when it is present. An explicit membership test
# replaces the former bare `except:`, which also hid unrelated errors.
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)
else:
    logging.warning('Unnamed: 0 not existing in this file.')

logging.info('{} rows and {} columns read in the CSV file'.format(df.shape[0], df.shape[1]))
df.shape

(8902, 67)

In [4]:
### Load the Model
# Load the persisted model from disk.
# SECURITY: unpickling executes code embedded in the file, so only load model
# files that come from a trusted source.
filename = 'models/finalized_model.sav'
# The context manager closes the file handle even if unpickling fails.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [5]:
# Fund characteristics.
# ('ADMINSTRATION_FEES' is spelled this way in the dataset header; keep it as is.)
cols_funds = ['IS_STRATEGIC', 'ADMINSTRATION_FEES', 'MANAGEMENT_FEES',
              'RISK_LEVEL_VALUE', 'STRATEGY_CAPABILITY', 'FUND_AGE']

# Column groups defined from business experience; each group is paired with
# one transformation method in the plan below.
cols_rolling = ['NAV_VOL_SLOPE', 'NAV_PERF_SLOPE', 'BENCH_PERF_SLOPE',
                'BENCH_VOL_SLOPE', 'AUM', 'NET_FLOWS', 'IN_FLOWS', 'OUT_FLOWS']
cols_shift = ['BENCH_PERF_SLOPE_ERROR', 'BENCH_VOL_SLOPE_ERROR',
              'NAV_PERF_SLOPE_ERROR', 'NAV_VOL_SLOPE_ERROR']
cols_ewm = ['POSITIVE_RFP', 'EVENT_IMPACT', 'DOWNLOADED_DOCUMENTS', 'PSU_SCORE']
cols_expanding = []

# Every column selected for modelling: transformed groups, target/key columns,
# then the fund characteristics.
all_cols = (cols_rolling + cols_shift + cols_ewm + cols_expanding
            + ['TARGET', 'FLOWS_YEAR', 'FLOWS_MONTH', 'SRC_UID']
            + cols_funds)

# One (columns, method, periods) entry per column group.
transform_plan = [
    (cols_rolling, 'rolling', [2, 3, 4]),
    (cols_shift, 'shift', [1, 2, 3]),
    (cols_ewm, 'ewm', [1, 3, 6]),
    # NOTE(review): the expanding group is configured with method 'ewm', which
    # looks like a copy-paste. The group is empty, so nothing is emitted today;
    # confirm the intended method before adding columns to it.
    (cols_expanding, 'ewm', [1, 3, 6]),
]

# Transformer parameters: one dict per column, in group order. Each dict gets
# its own copy of the period list so entries never share mutable state.
trans = [
    {'cols': [column], 'method': method, 'period': list(periods), 'original': False}
    for columns, method, periods in transform_plan
    for column in columns
]


In [6]:
# Keep only the rows whose SRC_UID is at or above this cutoff.
# NOTE(review): the business reason for this particular cutoff is not visible
# here — confirm and document it.
MIN_SRC_UID = 1489764652

# `f` is the boolean row mask; the train/test split cell reuses it.
f = df['SRC_UID'] >= MIN_SRC_UID
df.loc[f, :]

Unnamed: 0,SRC_UID,IS_STRATEGIC,ADMINSTRATION_FEES,MANAGEMENT_FEES,MULTI_MANAGER_STRUCTURE,EXPERIENCE,SOFTCLOSING,CNT_SHARE,QUANTITY,IN_FLOWS,...,FUND_AGE_BIN,DOWNLOADED_DOCUMENTS_BIN,BENCH_PERF_SLOPE,BENCH_PERF_SLOPE_ERROR,BENCH_VOL_SLOPE,BENCH_VOL_SLOPE_ERROR,NAV_PERF_SLOPE,NAV_PERF_SLOPE_ERROR,NAV_VOL_SLOPE,NAV_VOL_SLOPE_ERROR
2946,1489764652,0,0.0007,0.002507,1,1300.202108,0,21,6690975,22343464,...,3,1,-0.054510,0.147207,-0.034382,0.237361,-0.054030,0.146279,-0.035239,0.236800
2947,1489764652,0,0.0007,0.002507,1,1300.202108,0,21,6660840,39273363,...,3,1,-0.062807,0.242574,-0.014345,0.178211,-0.062274,0.239048,-0.015518,0.180422
2948,1489764652,0,0.0007,0.002507,1,1300.202108,0,21,6846161,81331215,...,3,1,-0.062311,0.200959,-0.011202,0.173876,-0.060962,0.195813,-0.012657,0.175470
2949,1489764652,0,0.0007,0.002507,1,1300.202108,0,21,6726841,49988316,...,3,1,-0.040184,0.058329,-0.000771,0.203269,-0.038722,0.056320,-0.001510,0.205597
2950,1489764652,0,0.0007,0.002507,1,1300.202108,0,21,6084198,7394583,...,3,1,0.014469,-0.079036,0.002218,0.193706,0.014748,-0.078383,0.000878,0.194448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8897,213479867994200,1,0.0003,0.005000,0,0.000000,0,8,4262464,8479346,...,1,0,0.002099,-0.006360,0.000035,0.000057,-0.017896,0.060428,-0.001291,0.055365
8898,213479867994200,1,0.0003,0.005000,0,0.000000,0,8,4496501,14776729,...,1,0,0.002086,-0.006345,0.000063,0.000007,-0.015975,0.034633,-0.012080,0.070153
8899,213479867994200,1,0.0003,0.005000,0,0.000000,0,8,4735747,24501245,...,1,0,0.002108,-0.006426,0.000040,0.000049,-0.011221,0.055799,-0.003275,0.059975
8900,213479867994200,1,0.0003,0.005000,0,0.000000,0,8,5097105,43874436,...,1,0,0.002106,-0.006438,0.000051,0.000043,-0.018773,0.074239,-0.005920,0.062375


In [7]:
# Split the filtered rows into training and held-out sets.
c = all_cols
# NOTE(review): 'TARGET' is deliberately left inside X (the variant that
# dropped it is disabled) — presumably the pipeline's preprocessor consumes
# it; confirm this is not label leakage.
X, y = df.loc[f, c], df.loc[f, 'TARGET']
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0, test_size=0.25)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (X_tr, X_te, y_tr, y_te))

((4467, 26), (1489, 26), (4467,), (1489,))

In [8]:
# Fit the pipeline loaded from disk on the training split.
# NOTE(review): this refits the unpickled model, so the results below come
# from this new fit — presumably not from the state stored in the file;
# confirm that this is intended.
loaded_model.fit(X_tr, y_tr)

arriving in fit (4467, 26)


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


entering in transform (4467, 26)


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




Pipeline(memory=None,
     steps=[('preprocessor', TS2SL(cols=['NAV_VOL_SLOPE', 'NAV_PERF_SLOPE', 'BENCH_PERF_SLOPE', 'BENCH_VOL_SLOPE', 'AUM', 'NET_FLOWS', 'IN_FLOWS', 'OUT_FLOWS', 'BENCH_PERF_SLOPE_ERROR', 'BENCH_VOL_SLOPE_ERROR', 'NAV_PERF_SLOPE_ERROR', 'NAV_VOL_SLOPE_ERROR', 'POSITIVE_RFP', 'EVENT_IMPACT', 'DOWNLOADED_DOC...enalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))])

In [9]:
# Score the fitted pipeline on the held-out split and print the value.
# NOTE(review): presumably mean accuracy, the default score of scikit-learn
# classifiers — confirm against the pipeline's final estimator.
result = loaded_model.score(X_te, y_te)
print(result)

entering in transform (1489, 26)


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


0.6272666218938885


In [10]:
# Print arrays with three decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Predicted class probabilities for the held-out split.
clf_probs = loaded_model.predict_proba(X_te)
clf_probs


entering in transform (1489, 26)


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




array([[0.011, 0.521, 0.469],
       [0.139, 0.539, 0.322],
       [0.302, 0.437, 0.261],
       ...,
       [0.114, 0.405, 0.481],
       [0.315, 0.335, 0.351],
       [0.211, 0.382, 0.408]])