In [1]:
import pandas as pd
import json
from pathlib import Path
from typing import List
from functools import reduce
import numpy as np
from sklearn.metrics import f1_score
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from xgboost import XGBClassifier


In [2]:
import joblib

In [3]:
def ensemble(true: np.array, preds: np.array, strategy='OR'):
    final = []
    clf = None
    
    if strategy == 'OR':
        for pred in preds:
            final.append(1 if any(pred) else 0)
    
    elif strategy == 'AND':
        for pred in preds:
            final.append(1 if all(pred) else 0)
    elif strategy == 'majority':
        for pred in preds:
            final.append(1 if sum(pred)>=(len(pred)/2) else 0)
            
    elif strategy == 'blend_rf':
        clf = RandomForestClassifier(max_depth=100, 
                                     n_estimators=3, 
                                     random_state=0)
        clf.fit(preds, true.ravel())
        final = clf.predict(preds)
    elif strategy == 'blend_xgb':
        clf = XGBClassifier(n_estimators=1000,
                                       max_depth=5,
                            learning_rate=0.1,
                                       verbosity=1)
        clf.fit(preds, true.ravel())
        final = clf.predict(preds)

    return final, clf

In [4]:
def merge_cnn_preds_with_df(cnn_preds_fp, test):
    pred_df_test = test.copy()
    print(pred_df_test)
    with open(cnn_preds_fp, 'rb') as handle:
        preds = pickle.load(handle)
        pred_df_test['Prediction'] = preds
        pred_df_test['Prediction'] = pred_df_test['Prediction'].astype(int)
    return pred_df_test

In [21]:
submit = pd.read_csv(Path('/media/sarthak/HDD/data_science/fnp_resources/data/task1/all_combined/eval.csv'))
reload_path = Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/submissions/metaclf_bertsandrulesandcnnandbertbaseuncased.joblib')

In [22]:
# code for transformers prediction
fnp_pred_dfs_test = []

file_paths = [Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/57_1/output/best_model/inference_eval/predictions.csv'),
             Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/64_1/output/best_model/inference_eval/predictions.csv'),      
    Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/67_1/output/best_model/inference_eval/predictions.csv'),
             Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/69_1/output/best_model/inference_eval/predictions.csv')
             ]
             
"""
file_paths = [ [Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/61_1/output/best_model/inference_dev/predictions.csv'),
             Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/61_1/output/best_model/inference/predictions.csv')],
    [Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/62_1/output/best_model/inference_dev/predictions.csv'),
             Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/62_1/output/best_model/inference/predictions.csv')],
    
    [Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/64_1/output/best_model/inference_dev/predictions.csv'),
             Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/64_1/output/best_model/inference/predictions.csv')],
    [Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/67_1/output/best_model/inference_dev/predictions.csv'),
             Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/67_1/output/best_model/inference/predictions.csv')],
              [Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/69_1/output/best_model/inference_dev/predictions.csv'),
             Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/69_1/output/best_model/inference/predictions.csv')],
             [Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/87_1/output/best_model/inference_dev/predictions.csv'),
             Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/87_1/output/best_model/inference/predictions.csv')]]
"""
for fp in file_paths:
    fnp_pred_dfs_test.append(pd.read_csv(fp))

In [23]:
# code for CNN predictions
cnn_pred_dfs_test = []

file_paths = ['/media/sarthak/HDD/TUM/Thesis/thesis-sarthak/src/tc/experiments/fincausal_allcombined_traindev/preds_eval.pkl']

for fp in file_paths:
    cnn_pred_df_test = merge_cnn_preds_with_df(fp, submit)
    cnn_pred_dfs_test.append(cnn_pred_df_test)

          Index                                               Text  \
0       1.00001    Earn $25 per hour as an independent contractor!   
1       1.00002  Write and update curriculum for The Income Tax...   
2       1.00003  Function:  Write, update and enhance ITS curri...   
3       2.00001  September 13, 2019 Congress Speaker's Office D...   
4       2.00002  While the Speaker's office disclaimed the leak...   
...         ...                                                ...   
7381  290.00030  The regulation provides an extended list of se...   
7382  290.00031  RESPA Section 3 provides that a thing of value...   
7383  290.00032  A form used by a settlement or closing agent i...   
7384  290.00033  Featuring: Delivery 3X a week plus breaking ne...   
7385  290.00034  Featuring: Delivery 2X a week plus breaking ne...   

                                 unique_id  
0     82e9b20d-a78e-4fe0-b621-c0601113be66  
1     30ac6e7b-1ed4-4be6-8fb7-84aeb3203f6c  
2     77abe20b-2b61-4905

In [24]:
data_frames_test = fnp_pred_dfs_test
k = np.arange(len(data_frames_test)).astype(str)
df_merged_test = pd.concat([x.set_index('unique_id') for x in data_frames_test], axis=1, join='inner', keys=k)
df_merged_test.columns = df_merged_test.columns.map('_'.join)

cols_pred = [i+'_Prediction' for i in k]
df_merged_test = df_merged_test[cols_pred]

In [25]:
df_merged_test.head()

Unnamed: 0_level_0,0_Prediction,1_Prediction,2_Prediction,3_Prediction
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
82e9b20d-a78e-4fe0-b621-c0601113be66,0,0,0,1
30ac6e7b-1ed4-4be6-8fb7-84aeb3203f6c,0,0,0,0
77abe20b-2b61-4905-bf93-a2a86809df6c,0,0,0,0
89866e6a-8397-4926-8e01-a9b286b2cf83,0,0,0,0
64b52667-aaa9-4b01-b434-97316bd5e877,0,0,0,0


In [29]:
df_merged_test['Prediction'], meta_clf = ensemble(true=None, 
                                             preds=df_merged_test[cols_pred].values,
                                             strategy='majority')

In [27]:
meta_clf = joblib.load(reload_path)
df_merged_test['Prediction'] = meta_clf.predict(df_merged_test[cols_pred].values)

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7'] ['f0', 'f1', 'f2', 'f3']
expected f5, f7, f6, f4 in input data

In [30]:
df_merged_test

Unnamed: 0_level_0,0_Prediction,1_Prediction,2_Prediction,3_Prediction,Prediction
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
82e9b20d-a78e-4fe0-b621-c0601113be66,0,0,0,1,0
30ac6e7b-1ed4-4be6-8fb7-84aeb3203f6c,0,0,0,0,0
77abe20b-2b61-4905-bf93-a2a86809df6c,0,0,0,0,0
89866e6a-8397-4926-8e01-a9b286b2cf83,0,0,0,0,0
64b52667-aaa9-4b01-b434-97316bd5e877,0,0,0,0,0
...,...,...,...,...,...
731bbb45-0e6e-45e4-aec6-ccb83600f01a,0,0,0,0,0
2c485f69-96ca-4483-a73a-30aed18d9b01,0,0,0,0,0
75bafbcf-5961-477d-bac7-4fdf81c68cb1,0,0,0,0,0
08590fb4-b1f4-4912-be8c-c0f707b16a69,0,0,0,0,0


In [31]:
submit_with_pred = pd.merge(submit, df_merged_test[['Prediction']], left_on='unique_id', right_on='unique_id')
submit_with_pred.head()

Unnamed: 0,Index,Text,unique_id,Prediction
0,1.00001,Earn $25 per hour as an independent contractor!,82e9b20d-a78e-4fe0-b621-c0601113be66,0
1,1.00002,Write and update curriculum for The Income Tax...,30ac6e7b-1ed4-4be6-8fb7-84aeb3203f6c,0
2,1.00003,"Function: Write, update and enhance ITS curri...",77abe20b-2b61-4905-bf93-a2a86809df6c,0
3,2.00001,"September 13, 2019 Congress Speaker's Office D...",89866e6a-8397-4926-8e01-a9b286b2cf83,0
4,2.00002,While the Speaker's office disclaimed the leak...,64b52667-aaa9-4b01-b434-97316bd5e877,0


In [32]:
submit_with_pred.shape

(7386, 4)

In [33]:
output_path = Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/143_1/output/best_model/inference_eval')
output_path.mkdir(parents=True)
submit_with_pred.to_csv(output_path / 'predictions.csv', index=False)