In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas() 
import numpy as np

from senti_dd_construction import create_polar_sentences, assign_direction_dependency_type, get_preprocessed_nouns

In [2]:
ds50_filepath = '/media/dmlab/My Passport/DATA/BenchmarkDataset/FinancialPhraseBank-v1.0/Sentenes_FourAgree.csv'

In [3]:
df = pd.read_csv(ds50_filepath)
df.columns = ['headline', 'label', 'agreement']
df = df[df['agreement']=='50']
df

Unnamed: 0,headline,label,agreement
0,according to gran the company has no plans to ...,neutral,50
1,technopolis plans to develop in stages an area...,neutral,50
2,the international electronic industry company ...,negative,50
3,with the new production plant the company woul...,positive,50
4,according to the company s updated strategy fo...,positive,50
...,...,...,...
4830,london marketwatch share prices ended lower in...,negative,50
4831,rinkuskiai s beer sales fell by 6 5 per cent t...,neutral,50
4832,operating profit fell to eur 35 4 mn from eur ...,negative,50
4833,net sales of the paper segment decreased to eu...,negative,50


In [4]:
# use only polar sentences
df = create_polar_sentences(df)
df

Unnamed: 0,headline,label,agreement
2,the international electronic industry company ...,negative,50
3,with the new production plant the company woul...,positive,50
4,according to the company s updated strategy fo...,positive,50
5,financing of aspocomp s growth aspocomp is agg...,positive,50
6,for the last quarter of 2010 componenta s net ...,positive,50
...,...,...,...
4829,helsinki thomson financial shares in cargotec ...,negative,50
4830,london marketwatch share prices ended lower in...,negative,50
4832,operating profit fell to eur 35 4 mn from eur ...,negative,50
4833,net sales of the paper segment decreased to eu...,negative,50


In [5]:
# proportional, inversely proportional
df['direction_dependency'] = df.progress_apply(lambda x: assign_direction_dependency_type(x['headline'], x['label']), axis=1)
df

100%|██████████| 1966/1966 [00:39<00:00, 49.93it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,headline,label,agreement,direction_dependency
2,the international electronic industry company ...,negative,50,
3,with the new production plant the company woul...,positive,50,proportional
4,according to the company s updated strategy fo...,positive,50,
5,financing of aspocomp s growth aspocomp is agg...,positive,50,
6,for the last quarter of 2010 componenta s net ...,positive,50,proportional
...,...,...,...,...
4829,helsinki thomson financial shares in cargotec ...,negative,50,proportional
4830,london marketwatch share prices ended lower in...,negative,50,inversely_proportional
4832,operating profit fell to eur 35 4 mn from eur ...,negative,50,
4833,net sales of the paper segment decreased to eu...,negative,50,


In [6]:
df.dropna(subset=['direction_dependency'], inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,headline,label,agreement,direction_dependency
3,with the new production plant the company woul...,positive,50,proportional
6,for the last quarter of 2010 componenta s net ...,positive,50,proportional
7,in the third quarter of 2010 net sales increas...,positive,50,proportional
8,operating profit rose to eur 13 1 mn from eur ...,positive,50,proportional
9,operating profit totalled eur 21 1 mn up from ...,positive,50,proportional
...,...,...,...,...
4827,the company said that its comparable operating...,negative,50,proportional
4828,operating result for the 12 month period decre...,negative,50,proportional
4829,helsinki thomson financial shares in cargotec ...,negative,50,proportional
4830,london marketwatch share prices ended lower in...,negative,50,inversely_proportional


In [7]:
for dependency in df['direction_dependency'].unique():
    print(dependency, len(df[df['direction_dependency']==dependency]))

proportional 691
inversely_proportional 28


In [8]:
df['nouns'] = df['headline'].apply(lambda x: get_preprocessed_nouns(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


# Calculation examples

In [34]:
w = 'profit'

temp = df[df['nouns'].apply(lambda x: w in x)]
n_w = len(temp)
print('# sentences containing "{}" = {}'.format(w, n_w))

n_w_tp = len(temp[temp['direction_dependency']=='proportional'])
print('# proportional type sentences containing "{}" = {}'.format(w, n_w_tp))
n_w_ti = len(temp[temp['direction_dependency']=='inversely_proportional'])
print('# inversely proportional type sentences containing "{}" = {}'.format(w, n_w_ti))

n_total = len(df)
print('# total sentences = {}'.format(n_total))

n_tp = len(df[df['direction_dependency']=='proportional'])
print('# proportional type sentences sentences = {}'.format(n_tp))
n_ti = len(df[df['direction_dependency']=='inversely_proportional'])
print('# inversely proportional type sentences sentences = {}'.format(n_ti))

print('===\n')
print('p(w)={:.2f}'.format(n_w/n_total))
print('p(w,t_p)={:.2f}'.format(n_w_tp/n_total))
print('p(w,t_i)={:.2f}'.format(n_w_ti/n_total))

print('p(t_p)={:.2f}'.format(n_tp/n_total))
print('p(t_i)={:.2f}'.format(n_ti/n_total))

print('PMI(w, "proportional")={:.2f}'.format(np.log2((n_w_tp/n_total)/((n_w/n_total)*(n_tp/n_total)))))
print('PMI(w, "inversely proportional")={:.2f}'.format(np.log2((n_w_ti/n_total)/((n_w/n_total)*(n_ti/n_total)))))

# sentences containing "profit" = 217
# proportional type sentences containing "profit" = 216
# inversely proportional type sentences containing "profit" = 1
# total sentences = 719
# proportional type sentences sentences = 691
# inversely proportional type sentences sentences = 28
===

p(w)=0.30
p(w,t_p)=0.30
p(w,t_i)=0.00
p(t_p)=0.96
p(t_i)=0.04
PMI(w, "proportional")=0.05
PMI(w, "inversely proportional")=-3.08


In [35]:
w = 'cost'

temp = df[df['nouns'].apply(lambda x: w in x)]
n_w = len(temp)
print('# sentences containing "{}" = {}'.format(w, n_w))

n_w_tp = len(temp[temp['direction_dependency']=='proportional'])
print('# proportional type sentences containing "{}" = {}'.format(w, n_w_tp))
n_w_ti = len(temp[temp['direction_dependency']=='inversely_proportional'])
print('# inversely proportional type sentences containing "{}" = {}'.format(w, n_w_ti))

n_total = len(df)
print('# total sentences = {}'.format(n_total))

n_tp = len(df[df['direction_dependency']=='proportional'])
print('# proportional type sentences sentences = {}'.format(n_tp))
n_ti = len(df[df['direction_dependency']=='inversely_proportional'])
print('# inversely proportional type sentences sentences = {}'.format(n_ti))

print('===\n')
print('p(w)={:.2f}'.format(n_w/n_total))
print('p(w,t_p)={:.2f}'.format(n_w_tp/n_total))
print('p(w,t_i)={:.2f}'.format(n_w_ti/n_total))

print('p(t_p)={:.2f}'.format(n_tp/n_total))
print('p(t_i)={:.2f}'.format(n_ti/n_total))

print('PMI(w, "proportional")={:.2f}'.format(np.log2((n_w_tp/n_total)/((n_w/n_total)*(n_tp/n_total)))))
print('PMI(w, "inversely proportional")={:.2f}'.format(np.log2((n_w_ti/n_total)/((n_w/n_total)*(n_ti/n_total)))))

# sentences containing "cost" = 16
# proportional type sentences containing "cost" = 8
# inversely proportional type sentences containing "cost" = 8
# total sentences = 719
# proportional type sentences sentences = 691
# inversely proportional type sentences sentences = 28
===

p(w)=0.02
p(w,t_p)=0.01
p(w,t_i)=0.01
p(t_p)=0.96
p(t_i)=0.04
PMI(w, "proportional")=-0.94
PMI(w, "inversely proportional")=3.68
