In [22]:
import pandas as pd
import time
import numpy as np
from sqlalchemy import create_engine
import sqlalchemy
import math
from sklearn.ensemble import RandomForestClassifier,VotingClassifier , ExtraTreesClassifier, GradientBoostingClassifier ,BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score,KFold
from imblearn.combine import SMOTEENN
from sklearn.metrics import confusion_matrix,accuracy_score, f1_score, precision_score, recall_score,roc_curve, auc,make_scorer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


### 초기 설정들

- stat_test_array는 사용될 features지만, 원본에서 column이 다시 계산되어 사용될 수 있음

- set_one_hot_incoder는 초기에 False로 설정되어야 함 (추후 encode 시 같은 값을 가지기 위해)

- scoring : GridSearchCV에서 precision이 가장 좋은 것을 고름. refit은 'Precision'으로 설정해야 함

In [23]:
# Feature columns consumed by make_set (one per field prefix: CO / RE / AD / RO).
stat_test_array = [
    'CO_mean',
    'RE_mean',
    'AD_mean',
    'RO_mean',
]

# Becomes True once `enc` has been fitted; make_set fits the encoder only while this is False.
set_one_hot_incoder = False
enc = OneHotEncoder(handle_unknown='ignore')

# Shared database engine used by the data-loading cells.
dev_db = create_engine(
    "mysql+pymysql://*/ci_dev?charset=utf8mb4",
    encoding='utf8',
    pool_size=20,
    pool_recycle=3600,
    connect_args={'connect_timeout': 1000000},
)

# Scorer mapping keyed by 'Precision'.
scoring = {'Precision': make_scorer(precision_score)}

### train data 가져오기

In [24]:
def get_data_to_db_for_statistic(db,mean_table,label_table):
    """Load the labelled feature rows used for training.

    Inner-joins ``mean_table`` (alias A) with ``label_table`` (alias B) on
    SOURCE_ID_1 / SOURCE_ID_2 and keeps only rows where ``B.label != 2`` and
    ``B.source_1 != B.source_2``.

    Args:
        db: connection or engine accepted by ``pandas.read_sql_query``.
        mean_table: feature table name, interpolated into the SQL text as-is.
        label_table: label table name, interpolated into the SQL text as-is.

    Returns:
        DataFrame holding every column of ``mean_table`` plus ``label``.
    """
    # The table names are formatted straight into the statement, so only
    # trusted names may be passed in.
    sql_fragments = [
        """ select A.*,B.label """,
        """ from %s A """%(mean_table),
        """ INNER JOIN %s B """%(label_table),
        """ on A.SOURCE_ID_1 = B.SOURCE_ID_1 and A.SOURCE_ID_2=B.SOURCE_ID_2 """,
        """ where B.label !=2 and B.source_1!=B.source_2 """,
    ]
    return pd.read_sql_query("".join(sql_fragments),db)

### best 알고리즘을 골라 다시 계산하여 column을 저장

In [25]:
def make_set_df_cleaning(df):
    """Return a copy of ``df`` with the four per-field mean columns added.

    A column counts as a feature when its name starts with one of the field
    prefixes ``CO`` / ``RE`` / ``AD`` / ``RO`` and its last three characters
    are one of ``_qg``, ``_jr``, ``cos``, ``_sw`` or ``lcs``. For every prefix
    the row-wise mean of its feature columns is stored as ``<prefix>_mean``
    (``CO_mean``, ``RE_mean``, ``AD_mean``, ``RO_mean``).

    Args:
        df: DataFrame with string column names; it is not modified.

    Returns:
        A copy of ``df`` with the four ``*_mean`` columns appended. A prefix
        with no matching feature column yields NaN for every row.
    """
    field_prefixes = ('CO', 'RE', 'AD', 'RO')
    metric_suffixes = ('_qg', '_jr', 'cos', '_sw', 'lcs')

    # 'COMPUTED_DT' shares the 'CO' prefix but never matches a metric suffix,
    # so the suffix filter already drops it. No explicit remove() is needed,
    # and frames without that column no longer raise ValueError.
    feature_cols = [
        col for col in df.columns
        if col[0:2] in field_prefixes and col[-3:] in metric_suffixes
    ]

    res_df = df.copy()
    for prefix in field_prefixes:
        prefix_cols = [col for col in feature_cols if col[0:2] == prefix]
        # Assign as a plain list so the values are placed positionally.
        res_df[prefix + '_mean'] = list(df[prefix_cols].mean(axis=1))

    return res_df


### dataframe에서 x값을 추출

In [26]:
def make_set(df):
    """Turn a feature DataFrame into a list of numeric feature rows.

    Each output row is laid out as:
    ``[CO value, RE value, address value, *one-hot(pair_source)]``.

    The address value is the ADDR column value when the ROAD value is None or
    NaN; otherwise it is whichever of ADDR / ROAD is larger (ROAD on ties).
    NaN entries in the finished rows are replaced with 0.

    Side effect: the module-level encoder ``enc`` is fitted on this frame's
    ``pair_source`` values the first time the function runs (while
    ``set_one_hot_incoder`` is False); later calls reuse the fitted encoder.

    Args:
        df: DataFrame with a ``pair_source`` column and either the columns in
            ``stat_test_array`` or the raw columns make_set_df_cleaning needs.

    Returns:
        A list with one list of numbers per row of ``df``.
    """
    global set_one_hot_incoder
    global stat_test_array

    # Derive the *_mean columns on demand when the caller passes a raw frame.
    if 'CO_mean' not in df.columns.values.tolist():
        df=make_set_df_cleaning(df)

    
    # Number of columns taken per field; with 1 only the first matching
    # column of each prefix is used below.
    each_len=1

    CO_NAME_l=[]
    REP_PHONE_l=[]
    ADDR_l=[]
    ROAD_ADDR_l=[]

    source_o=[]
    
    # Wrap every pair_source value in a one-element list (one sample per row
    # for the encoder).
    for idx in list(df['pair_source']):
        source_o.append([idx])


    # Fit the shared encoder only once so every later call encodes with the
    # same categories.
    if set_one_hot_incoder==False:
        enc.fit(source_o)
        set_one_hot_incoder=True

    source=enc.transform(source_o).toarray()

    
    # Bucket the configured feature columns by their two-letter field prefix.
    for idx in stat_test_array:
        
        if idx[0:2]=='CO':
            CO_NAME_l.append(list(df[idx]))
        if idx[0:2]=='RE':
            REP_PHONE_l.append(list(df[idx]))
        if idx[0:2]=='AD':
            ADDR_l.append(list(df[idx]))
        if idx[0:2]=='RO':
            ROAD_ADDR_l.append(list(df[idx]))


    res_list=[]
    # Assemble one feature row per DataFrame row.
    for idx in range(0,len(CO_NAME_l[0])):

        add=[]

        for k in range(0,each_len):
            add.append(CO_NAME_l[k][idx])
        
        for k in range(0,each_len):
            add.append(REP_PHONE_l[k][idx])

        # Missing ROAD value (None or NaN): fall back to the ADDR value.
        if ROAD_ADDR_l[0][idx] is None:
            for k in range(0,each_len):     
               add.append(ADDR_l[k][idx])
        elif math.isnan(ROAD_ADDR_l[0][idx]) :
            for k in range(0,each_len):     
               add.append(ADDR_l[k][idx])
        else:
            # Both present: keep the field whose summed value is larger
            # (ROAD wins ties, and also when the ADDR sum is NaN).
            aver_a=0
            aver_r=0
            for k in range(0,each_len):
                aver_a=aver_a+ADDR_l[k][idx]
                aver_r=aver_r+ROAD_ADDR_l[k][idx]        
            if aver_a>aver_r:
                for k in range(0,each_len):
                    add.append(ADDR_l[k][idx])
            else:
                for k in range(0,each_len):
                    add.append(ROAD_ADDR_l[k][idx])
        
        # Append the one-hot encoded pair_source values for this row.
        for k in source[idx]:
            add.append(k)

        res_list.append(add)

   
    # Replace every remaining NaN with 0.
    # NOTE(review): math.isnan raises TypeError on None, so a None CO/RE/ADDR
    # value would fail here — confirm those columns never contain None.
    for idx in range(0,len(res_list)):
        for i in range(0,len(res_list[idx])): 
            if math.isnan(res_list[idx][i]):
                res_list[idx][i]=0
            

    return res_list


### cross validation 후 score 출력

### 75% data로 train 후 model, x_test, y_test 반환

In [27]:
def _smoteenn_resample(x, y, random_state=0):
    """Resample ``(x, y)`` with SMOTEENN using whichever fit API is installed."""
    sampler = SMOTEENN(random_state=random_state)
    # Newer imbalanced-learn releases expose `fit_resample`; older ones only
    # have `fit_sample`. Prefer the new name and fall back to the old one.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample
    return resample(x, y)


def statistic_set(model,df):
    """Cross-validate ``model`` on 75% of ``df`` and refit it on that 75%.

    Steps:
      1. Build features with make_set and split 75/25, stratified on the
         label (random_state=43).
      2. Run 5-fold shuffled KFold on the 75% part. Each fold's training
         portion is resampled with SMOTEENN before fitting; accuracy,
         precision and recall are measured on the untouched validation fold.
      3. Print mean (+/- 2 std) of the fold metrics.
      4. Resample the whole 75% part with SMOTEENN, refit ``model`` on it and
         print its accuracy on the 25% hold-out plus the elapsed time.

    The same ``model`` instance is refitted in every fold and again at the
    end, so the returned classifier is that instance after the final fit.

    Args:
        model: estimator exposing ``fit``, ``predict`` and ``score``.
        df: labelled DataFrame with a ``label`` column and the columns
            make_set requires.

    Returns:
        Tuple ``(fitted_model, test_x, test_y)`` where the test pair is the
        25% hold-out split.
    """
    start_time = time.time()

    input_y = list(df['label'])
    input_x = make_set(df)

    input_x, test_x, input_y, test_y = train_test_split(
        input_x, input_y, test_size=0.25, stratify=input_y, random_state=43)

    print('-----------------------')
    print('CV result')

    cv_score = []
    cv_precision = []
    cv_recall = []

    cv = KFold(5, shuffle=True, random_state=43)
    for idx_train, idx_test in cv.split(input_x, input_y):
        x_train_list = [input_x[idx] for idx in idx_train]
        y_train_list = [input_y[idx] for idx in idx_train]
        x_test_list = [input_x[idx] for idx in idx_test]
        y_test_list = [input_y[idx] for idx in idx_test]

        # Only the training portion is resampled; the validation fold keeps
        # its original class balance.
        x_train_list, y_train_list = _smoteenn_resample(x_train_list, y_train_list)

        clf = model.fit(x_train_list, y_train_list)

        fold_score, fold_precision, fold_recall = model_scores(clf, x_test_list, y_test_list)
        cv_score.append(fold_score)
        cv_precision.append(fold_precision)
        cv_recall.append(fold_recall)

    cv_score = np.array(cv_score)
    cv_precision = np.array(cv_precision)
    cv_recall = np.array(cv_recall)

    print('score : %0.3f (+/- %0.3f)'%(cv_score.mean(),cv_score.std()*2))
    print('precision : %0.3f (+/- %0.3f)'%(cv_precision.mean(),cv_precision.std()*2))
    # This line was previously printed under the label 'score'.
    print('recall : %0.3f (+/- %0.3f)'%(cv_recall.mean(),cv_recall.std()*2))

    input_x, input_y = _smoteenn_resample(input_x, input_y)

    fin_clf = model.fit(input_x, input_y)
    fin_score = fin_clf.score(test_x, test_y)

    print('final_score')
    print(fin_score)
    res_time = time.time() - start_time
    print('processing time : %0.2f'%(res_time))
    return fin_clf,test_x,test_y


### model의 score 반환

In [28]:
def model_scores(model,test_x,test_y):
    """Score a fitted classifier on ``(test_x, test_y)``.

    Returns:
        Tuple ``(score, precision, recall)``: ``model.score`` on the test
        data, then precision and recall of ``model.predict(test_x)`` against
        ``test_y``.
    """
    predicted_labels = model.predict(test_x)
    return (
        model.score(test_x,test_y),
        precision_score(test_y,predicted_labels),
        recall_score(test_y,predicted_labels),
    )

In [29]:
def print_scores(scores):
    """Print the first three entries of ``scores`` as score, precision and recall (3 decimals)."""
    line_templates = ('score : %0.3f', 'precision : %0.3f', 'recall : %0.3f')
    for position, template in enumerate(line_templates):
        print(template%(scores[position]))

### model과 hyper parameter 설정

In [30]:
def logistic(df):
    """Evaluate a liblinear LogisticRegression (random_state=0) on ``df`` via statistic_set."""
    classifier = LogisticRegression(solver='liblinear', random_state=0)
    return statistic_set(classifier, df)

In [31]:
def random_forest(df):
    """Evaluate a two-tree RandomForestClassifier (max_depth=100, seed 43) on ``df`` via statistic_set."""
    forest_params = dict(
        bootstrap=True,
        class_weight=None,
        max_depth=100,
        n_estimators=2,
        random_state=43,
    )
    return statistic_set(RandomForestClassifier(**forest_params), df)


In [32]:
def GBC(df):
    """Evaluate a GradientBoostingClassifier (200 stumps, learning_rate=1, seed 43) on ``df`` via statistic_set."""
    booster = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=1,
        max_depth=1,
        random_state=43,
    )
    return statistic_set(booster, df)

In [33]:
def ENSE(df):
    """Evaluate a soft-voting ensemble of LogisticRegression, QDA and GaussianNB on ``df`` via statistic_set."""
    members = [
        ('lr', LogisticRegression(random_state=43)),
        ('qda', QuadraticDiscriminantAnalysis()),
        ('gnb', GaussianNB()),
    ]
    return statistic_set(VotingClassifier(estimators=members, voting='soft'), df)

In [34]:
def ECLF(df):
    """Evaluate a soft-voting ensemble of RandomForest, ExtraTrees, KNeighbors and SVC on ``df`` via statistic_set."""
    members = [
        ('Random Forests', RandomForestClassifier(bootstrap=True, class_weight=None,
                                                  max_depth=100, n_estimators=2,
                                                  random_state=43)),
        ('Extra Trees', ExtraTreesClassifier(n_estimators=100, max_depth=None,
                                             min_samples_split=2, random_state=43)),
        ('KNeighbors', KNeighborsClassifier()),
        # probability=True is set so the SVC can take part in soft voting.
        ('SVC', SVC(probability=True)),
    ]
    return statistic_set(VotingClassifier(estimators=members, voting='soft'), df)


In [35]:
def KNN_RF(df):
    """Evaluate a soft-voting ensemble of RandomForest and KNeighbors on ``df`` via statistic_set.

    Returns:
        Whatever statistic_set returns: ``(fitted_model, test_x, test_y)``.
    """
    # Seed the forest like every other forest in this notebook (43) so that
    # repeated runs give the same scores.
    rf = RandomForestClassifier(bootstrap=False, max_depth=200, n_estimators=2,
                                random_state=43)
    knn = KNeighborsClassifier(n_neighbors=500)
    knn_rf = VotingClassifier(estimators=[('Random Forests', rf), ('KNeigbors', knn)],
                              voting='soft')
    return statistic_set(knn_rf, df)


### warning 제거, 다시 띄우려면 default로

In [36]:
import warnings

In [37]:
warnings.filterwarnings('ignore') # silences every warning; pass 'default' instead to show them again

In [38]:
# Load the labelled training rows (feature table inner-joined with the label table).
df=get_data_to_db_for_statistic(dev_db,'ci_dev.SIM_FEATURES_test','ci_dev.features_lable')

In [39]:
len(df)

3296

In [46]:
def pair_source(x):
    """Map the 'intersecting_set' pair source to 'coname_SN'; return any other value unchanged."""
    return 'coname_SN' if x=='intersecting_set' else x

In [47]:
# Normalise pair_source in place: 'intersecting_set' becomes 'coname_SN'.
df['pair_source']=df['pair_source'].apply(pair_source)

In [53]:
df['pair_source'].unique()

array(['coname_SN', 'phonenum_B'], dtype=object)

### model score 출력

In [54]:
def _fit_and_report(title, trainer, frame):
    """Print ``title``, train with ``trainer(frame)``, print the hold-out scores and return the result."""
    print(title)
    result = trainer(frame)
    print_scores(model_scores(result[0],result[1],result[2]))
    return result


# Each *_res is the (fitted_model, test_x, test_y) tuple returned by the trainer.
log_res = _fit_and_report('Logistic', logistic, df)
rf_res = _fit_and_report('\nRandom Forest', random_forest, df)
gbc_res = _fit_and_report('\nGBC', GBC, df)
LQG_res = _fit_and_report('\nLogistic,QDA,GaussianNB', ENSE, df)
REK_res = _fit_and_report('\nRandom Forest, ExtraTree, Kneigbors', ECLF, df)
RK_res = _fit_and_report('\nrandom forest, Kneigbors', KNN_RF, df)

Logistic
-----------------------
CV result
score : 0.952 (+/- 0.015)
precision : 0.976 (+/- 0.011)
score : 0.953 (+/- 0.034)
final_score
0.9575242718446602
processing time : 0.35
score : 0.958
precision : 0.982
recall : 0.956

Random Forest
-----------------------
CV result
score : 0.952 (+/- 0.021)
precision : 0.981 (+/- 0.017)
score : 0.948 (+/- 0.031)
final_score
0.9599514563106796
processing time : 0.33
score : 0.960
precision : 0.984
recall : 0.958

GBC
-----------------------
CV result
score : 0.962 (+/- 0.010)
precision : 0.975 (+/- 0.015)
score : 0.970 (+/- 0.016)
final_score
0.9660194174757282
processing time : 1.26
score : 0.966
precision : 0.987
recall : 0.963

Logistic,QDA,GaussianNB
-----------------------
CV result
score : 0.923 (+/- 0.061)
precision : 0.975 (+/- 0.017)
score : 0.912 (+/- 0.102)
final_score
0.933252427184466
processing time : 0.46
score : 0.933
precision : 0.981
recall : 0.921

Random Forest, ExtraTree, Kneigbors
-----------------------
CV result
score : 

### predict를 계산한 뒤 df에 column을 추가해 df 반환

In [55]:
def predict_else_df(model,df):
    """Return a copy of ``df`` with the model's predictions in a new ``predict`` column.

    Features are built with make_set, so ``df`` must provide the columns that
    function reads. ``df`` itself is left untouched.
    """
    predicted = model.predict(make_set(df))

    df_res = df.copy()
    df_res['predict'] = predicted
    return df_res


### predict할 data 가져오기 (TODO)

In [56]:
def get_data_at_db_for_predict_else(db,mean_table,label_table):
    """Load the feature rows that have no label yet.

    Left-joins ``mean_table`` (alias A) with ``label_table`` (alias B) on
    SOURCE_ID_1 / SOURCE_ID_2 and keeps only the rows without a match
    (``B.source_id_1 is null``), so ``label`` is null in every returned row.

    Args:
        db: connection or engine accepted by ``pandas.read_sql_query``.
        mean_table: feature table name, interpolated into the SQL text as-is.
        label_table: label table name, interpolated into the SQL text as-is.

    Returns:
        DataFrame holding every column of ``mean_table`` plus ``label``.
    """
    # The table names are formatted straight into the statement, so only
    # trusted names may be passed in.
    sql_fragments = [
        """ select A.*,B.label """,
        """ from %s A """%(mean_table),
        """ left JOIN %s B """%(label_table),
        """ on A.SOURCE_ID_1 = B.SOURCE_ID_1 and A.SOURCE_ID_2=B.SOURCE_ID_2 """,
        """ where B.source_id_1 is null """,
    ]
    return pd.read_sql_query("".join(sql_fragments),db)

In [58]:
# Load the feature rows that have no matching row in the label table.
pred_df=get_data_at_db_for_predict_else(dev_db,'ci_dev.SIM_FEATURES_test','ci_dev.features_lable')

In [110]:
pred_df['SOURCE_2'].unique()

array(['PROCESSED_CARD_LOGIN_recleaning'], dtype=object)

In [114]:
# Apply the same pair_source normalisation that was applied to the training frame.
pred_df['pair_source']=pred_df['pair_source'].apply(pair_source)

In [115]:
len(pred_df)

1174783

In [116]:
pred_df['pair_source'].unique()

array(['coname_SN', 'phonenum_B'], dtype=object)

In [117]:
# Predict with the fitted model returned by ECLF (first element of REK_res).
res_df=predict_else_df(REK_res[0],pred_df)

### 저장될 column 설정

In [118]:
# Keep only the pair identifiers, the pair source and the prediction for storage.
res_df=res_df[['SOURCE_ID_1','SOURCE_ID_2','pair_source','predict']]

In [119]:
import datetime

### pickle로 저장!!!!!

In [120]:
# Write the predictions to ../predicted_df/ under a file name stamped with the current date and time.
# NOTE(review): "%H:%M" puts a ':' in the file name, which some file systems (e.g. on Windows) reject — confirm the target platform.
res_df.to_pickle('../predicted_df/pred_{}.pkl'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")))