In [41]:
import pandas as pd
from pathlib import Path
from sklearn import metrics
import re
from abc import ABC
from typing import List
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
import random

In [53]:
predictions_dev_path = Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/56_1/inference_dev.csv')
predictions_test_path = Path('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/56_1/inference_test.csv')

predictions_dev = pd.read_csv(predictions_dev_path)
predictions_test = pd.read_csv(predictions_test_path)

predictions_dev_for_modeling = predictions_dev.copy()
predictions_dev_for_submission = predictions_dev.copy()
predictions_test_for_modeling = predictions_test.copy()
predictions_test_for_submission = predictions_test.copy()

# predictions['preds_post'] = predictions['Prediction'].copy()
# predictions.head()

In [54]:
# original score
print(metrics.f1_score(predictions_dev['Gold'], predictions_dev['Prediction'], average='weighted'), 
metrics.f1_score(predictions_test['Gold'], predictions_test['Prediction'], average='weighted'))

0.9737177343173601 0.9555477507192752


In [55]:
possible_strings_to_check = [', reaching',
 ', prompting',
 ', aiming',
 ', equating',
 ', hitting',
 ', lowering',
 ', topping',
 ', raising',
 ', converting',
 ', becoming',
 ', meeting',
 ', valuing',
 ', edging',
 ', boosting',
 ', completing',
 ', slowing',
 ', lasting',
 ', clothing',
 ', totaling',
 ', rising']

In [56]:
predictions_dev_for_modeling['feature_contains_numeric'] = predictions_dev_for_modeling['Text'].apply(lambda text: any(char.isdigit() for char in text))
predictions_dev_for_modeling['feature_contains_textual_numeric'] = predictions_dev_for_modeling['Text'].apply(lambda text: any(word in text for word in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']))
predictions_dev_for_modeling['feature_contains_causal'] = predictions_dev_for_modeling['Text'].apply(lambda text: any(word in text for word in ['as', 'since', 'because', 'cause', 'after']))
predictions_dev_for_modeling['feature_contains_percent'] = predictions_dev_for_modeling['Text'].apply(lambda text: '%' in text)
predictions_dev_for_modeling['feature_contains_currency'] = predictions_dev_for_modeling['Text'].apply(lambda text: any(word in text for word in ['$', '€', '£', 'yuan', 'Yuan', 'INR', 'inr']))
predictions_dev_for_modeling['feature_contains_comma_with_ing'] = predictions_dev_for_modeling['Text'].apply(lambda text: True if len(re.findall(""",\s([a-z]*?ing)""", text))>0 else False)
predictions_dev_for_modeling['feature_contains_specific_ings_with_comma'] = predictions_dev_for_modeling['Text'].apply(lambda text: any(substring in text for substring in possible_strings_to_check))
predictions_test_for_modeling['feature_contains_numeric'] = predictions_test_for_modeling['Text'].apply(lambda text: any(char.isdigit() for char in text))
predictions_test_for_modeling['feature_contains_textual_numeric'] = predictions_test_for_modeling['Text'].apply(lambda text: any(word in text for word in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']))
predictions_test_for_modeling['feature_contains_causal'] = predictions_test_for_modeling['Text'].apply(lambda text: any(word in text for word in ['as', 'since', 'because', 'cause', 'after']))
predictions_test_for_modeling['feature_contains_percent'] = predictions_test_for_modeling['Text'].apply(lambda text: '%' in text)
predictions_test_for_modeling['feature_contains_currency'] = predictions_test_for_modeling['Text'].apply(lambda text: any(word in text for word in ['$', '€', '£', 'yuan', 'Yuan', 'INR', 'inr']))
predictions_test_for_modeling['feature_contains_comma_with_ing'] = predictions_test_for_modeling['Text'].apply(lambda text: True if len(re.findall(""",\s([a-z]*?ing)""", text))>0 else False)
predictions_test_for_modeling['feature_contains_specific_ings_with_comma'] = predictions_test_for_modeling['Text'].apply(lambda text: any(substring in text for substring in possible_strings_to_check))

In [61]:
feature_cols_to_select = [True if 'feature_' in column_name  else False for column_name in predictions_dev_for_modeling.columns]
true_cols_to_select = [True if 'Prediction' in column_name else False for column_name in predictions_dev_for_modeling.columns]

In [62]:
clf = RandomForestClassifier(max_depth=100, 
                             n_estimators=1000, 
                             random_state=0)
clf.fit(predictions_dev_for_modeling.loc[:, feature_cols_to_select].values, predictions_dev_for_modeling.loc[:, true_cols_to_select].values.ravel())
dev_predict = clf.predict(predictions_dev_for_modeling.loc[:, feature_cols_to_select].values)
test_predict = clf.predict(predictions_test_for_modeling.loc[:, feature_cols_to_select].values)
print('val: ', f1_score(predictions_dev_for_modeling['Gold'].tolist(), dev_predict, average='weighted'))
print('test: ', f1_score(predictions_test_for_modeling['Gold'].tolist(), test_predict, average='weighted'))
# final = clf.predict(preds)

val:  0.9291052694139045
test:  0.9203910184000016


In [63]:
predictions_dev_for_submission['Prediction'] = dev_predict
predictions_test_for_submission['Prediction'] = test_predict

In [66]:
predictions_dev_for_submission.to_csv('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/61_1/inference_dev.csv', index=False)
predictions_test_for_submission.to_csv('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/61_1/inference.csv', index=False)

In [43]:
dev_predict_random = [random.choice([True, False]) for _ in range(len(predictions_dev))]
test_predict_random = [random.choice([True, False]) for _ in range(len(predictions_test))]
dev_predict_majority = [0 for _ in range(len(predictions_dev))]
test_predict_majority = [0 for _ in range(len(predictions_test))]

In [47]:
print('val: ', f1_score(predictions_dev['Gold'].tolist(), dev_predict_random, average='weighted'))
print('test: ', f1_score(predictions_test['Gold'].tolist(), test_predict_random, average='weighted'))

val:  0.6187014909229698
test:  0.6263745624297067


In [48]:
print('val: ', f1_score(predictions_dev['Gold'].tolist(), dev_predict_majority, average='weighted'))
print('test: ', f1_score(predictions_test['Gold'].tolist(), test_predict_majority, average='weighted'))

val:  0.8940220409602846
test:  0.8938958203062894


In [None]:
meta_clf.predict(df_merged_test[cols_pred].values)

In [352]:
# no numbers and percent, pred=1, change to False (AND)
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_numeric']==False) & (predictions['contains_percent']==False) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9535042241739922

In [353]:
# no numbers (AND)
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_numeric']==False) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9535042241739922

In [354]:
# no numbers or textual numerics
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[((predictions['contains_numeric']==False) | (predictions['contains_textual_numeric']==False)) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9135556208038943

In [355]:
# no percents
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_percent']==False) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9331973513661947

In [356]:
# contains currency
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_currency']==False) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9460782175319331

In [357]:
# causal
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_causal']==False) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9389635313199106

In [358]:
# causal and numbers
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_causal']==False) & (predictions['contains_numeric']==False) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9535042241739922

In [359]:
# comma_with_ing
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_comma_with_ing']==False) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9227871058138696

In [360]:
# numbers and comma
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_comma_with_ing']==False) & (predictions['contains_numeric']==False) & (predictions['Prediction']==1), 'preds_post'] = 0
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9535042241739922

In [361]:
# comma_with_specific_ing
predictions['preds_post'] = predictions['Prediction'].copy()
predictions.loc[(predictions['contains_specific_ings_with_comma']==True) & (predictions['Prediction']==0), 'preds_post'] = 1
metrics.f1_score(predictions['Gold'], predictions['preds_post'], average='weighted')

0.9545387552316572

# Save Predictions

In [363]:
import numpy as np

In [364]:
original_data = pd.read_csv(Path('/media/sarthak/HDD/data_science/fnp_resources/data/task1/v2/trial.csv'), sep='; ')
original_data

  """Entry point for launching an IPython kernel.


Unnamed: 0,Index,Text,Gold
0,1.00001,Third Democratic presidential debate Septembe...,0
1,1.00002,"On the policy front, Bernie Sanders claimed hi...",0
2,1.00003,Joe Biden misrepresented recent history when h...,0
3,1.00004,Here's a look at some of the assertions in the...,0
4,1.00005,"It killed 22 people, and injured many more, we...",0
...,...,...,...
8575,370.00021,That's correct.,0
8576,370.00022,Fining a company - which in effect just fines ...,0
8577,370.00023,It was securing the bribe.,0
8578,370.00024,Disrupting Democracy: When Big Tech Takes Over...,0


In [377]:
na = original_data.loc[original_data.Text.isna()]
na['preds_post'] = 0
na

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Index,Text,Gold,preds_post
1486,53.00018,,0,0


In [380]:
predictions_p1 = predictions.iloc[:1486][['Index', 'Text', 'preds_post']]
predictions_p2 = na[['Index', 'Text', 'preds_post']]
predictions_p3 = predictions.iloc[1486:][['Index', 'Text', 'preds_post']]
predictions_concat = pd.concat([predictions_p1, predictions_p2, predictions_p3])

In [384]:
predictions_concat.rename(columns={'preds_post': 'Prediction'}, inplace=True)

In [385]:
predictions_concat

Unnamed: 0,Index,Text,Prediction
0,1.00001,Third Democratic presidential debate Septembe...,0
1,1.00002,"On the policy front, Bernie Sanders claimed hi...",0
2,1.00003,Joe Biden misrepresented recent history when h...,0
3,1.00004,Here's a look at some of the assertions in the...,0
4,1.00005,"It killed 22 people, and injured many more, we...",0
...,...,...,...
8574,370.00021,That's correct.,0
8575,370.00022,Fining a company - which in effect just fines ...,0
8576,370.00023,It was securing the bribe.,0
8577,370.00024,Disrupting Democracy: When Big Tech Takes Over...,0


In [386]:
predictions_concat.to_csv('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/46/output/best_model/inference/predictions.csv', index=False, sep=';')

In [387]:
preds_load = pd.read_csv('/media/sarthak/HDD/data_science/fnp_resources/fincausal_t1_models/46/output/best_model/inference/predictions.csv', sep=';')
preds_load

Unnamed: 0,Index,Text,Prediction
0,1.00001,Third Democratic presidential debate Septembe...,0
1,1.00002,"On the policy front, Bernie Sanders claimed hi...",0
2,1.00003,Joe Biden misrepresented recent history when h...,0
3,1.00004,Here's a look at some of the assertions in the...,0
4,1.00005,"It killed 22 people, and injured many more, we...",0
...,...,...,...
8575,370.00021,That's correct.,0
8576,370.00022,Fining a company - which in effect just fines ...,0
8577,370.00023,It was securing the bribe.,0
8578,370.00024,Disrupting Democracy: When Big Tech Takes Over...,0
