In [217]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.ticker as mticker
import torch
import plotly.graph_objects as go
import plotly.express as px


In [223]:
import dataframe_image as dfi

In [132]:
from sklearn.feature_extraction.text import CountVectorizer

In [82]:
from joblib import load, dump

In [83]:
sns.set_theme()

# EDA with NLP

In [100]:
df = pd.read_csv("Data/FinancialNewsData.csv", encoding="Windows-1252", names=["label", "headline"])

In [101]:
pos = load("Objects/Data/pos.joblib")

In [102]:
pos['label'] = df['label']

## Class Imbalance

In [416]:
fig = go.Figure()

vals = np.round(pos['label'].value_counts(normalize=True).sort_index(), 2)

fig.add_trace(go.Histogram(
    x=pos['label'].sort_values(),
    histnorm='',
    text=vals
)


)
px.histogram(pos.sort_values(by='label'), x='label', histnorm='probability density')

fig.update_layout(
    title="Class Imbalance",
    title_x=0.5,
    font= dict(size=16),
    yaxis= dict(title="Probability"),
    xaxis= dict(title="Sentiment"),
    height=600,
    width=850
)

## BOW

In [156]:
example_headline = df.sample(1, random_state=15).iloc[0].headline
example_label = df.sample(1, random_state=15).iloc[0].label

In [169]:
example_headline

'We went to the market with yield guidance of the 7.25 % area , which gave us the flexibility to go up or down by 1-8th .'

In [157]:
cv = CountVectorizer()

example_cv = cv.fit_transform([example_headline])

In [179]:
example_df = pd.DataFrame(example_cv.toarray(), columns=cv.get_feature_names(), index=["Example_Headline"])
example_df['loss'] = 0
example_df['increase'] = 0
example_df['label'] = example_label

In [224]:
table_plot = example_df[["we", "went", "to", "the", "market", "loss", "increase", "label"]]

In [225]:
dfi.export(table_plot, "example_df.png")

[1104/122936.724427:ERROR:xattr.cc(63)] setxattr org.chromium.crashpad.database.initialized on file /var/folders/0q/2w6362bn3v11r3hw5198hcvw0000gn/T/: Operation not permitted (1)
[1104/122936.737734:ERROR:file_io.cc(91)] ReadExactly: expected 8, observed 0
[1104/122936.740317:ERROR:xattr.cc(63)] setxattr org.chromium.crashpad.database.initialized on file /var/folders/0q/2w6362bn3v11r3hw5198hcvw0000gn/T/: Operation not permitted (1)
[1104/122937.988142:INFO:headless_shell.cc(653)] Written to file /var/folders/0q/2w6362bn3v11r3hw5198hcvw0000gn/T/tmp56jvfsgn/temp.png.


## "said"

In [429]:
non_said_df = pos[pos['said_VERB'] == 0].sort_values(by='label')
said_df = pos[pos['said_VERB'] > 0].sort_values(by='label')

non_said_vals = non_said_df['label'].value_counts(normalize=True).sort_index()
said_vals = said_df['label'].value_counts(normalize=True).sort_index()

fig = go.Figure()
fig.add_trace(go.Histogram(
    x=non_said_df.label,
    histnorm='probability density',
    name="No",
    text=np.round(non_said_vals, 2)
))

fig.add_trace(go.Histogram(
    x=said_df.label,
    histnorm='probability density',
    name="Yes",
    text=np.round(said_vals, 2)
))


fig.update_layout(
    title="Label Probabilty Given the Occurance of 'said'",
    title_x=0.5,
    legend=dict(title="Contains 'said'"),
    barmode='group',
    yaxis=dict(title="Probability"),
    height=600,
    width=850
    
)
fig.show(
   
)

## Bayesian statistics

What is the probability of obeserving a positive news article given that the article contains the word said? Since we have class imbalence, we need to use Bayes Theorem to answer this question

In [435]:
# P_A - probability of a positive sentiment with no evidence
positive_sentiments = pos[pos['label'] == 'positive']
p_a = positive_sentiments.shape[0] / pos.shape[0]

In [436]:
# Probability of said appearing with no evidence P_B
p_b = pos[pos['said_VERB'] > 0].shape[0] / pos.shape[0]

In [437]:
# Propbability of said given a positive sentiment P(B|A) said given a positive sentiment
positive_and_said = positive_sentiments[positive_sentiments['said_VERB'] > 0]
b_given_a = positive_and_said.shape[0] / positive_sentiments.shape[0] 

In [438]:
(b_given_a * p_a) / p_b

0.42357274401473294

## Steakholders Insights

In order for this to work in the real world, we would need our training sample of headlines to be representative of new headlines coming in from our sources.

In [417]:
# Create group by objects for contains "said" and does not contain "said"
said_group = pos[pos['said_VERB'] > 0].groupby('label')
non_said_group = pos[pos['said_VERB'] == 0].groupby('label')
said_totals = said_group[['label']].agg('count')
non_said_totals = non_said_group[['label']].agg('count')

In [339]:
# converting counts to probabilities
non_said_totals['probablities'] = non_said_totals['label'] / non_said_totals.sum().label * 100
said_totals['probablities'] = said_totals['label'] / said_totals.sum().label * 100

In [340]:
# setting multi-index column
said_totals.columns = pd.Index([('contains said', 'counts'), ('contains said', 'probabilities')])

non_said_totals.columns = pd.Index([("doesn't contain said", 'counts'), ("doesn't contain said", 'probabilities')])

In [341]:
said_table = pd.concat([non_said_totals, said_totals], axis=1)

In [342]:
said_table

Unnamed: 0_level_0,contains said,contains said,doesn't contain said,doesn't contain said
Unnamed: 0_level_1,counts,probabilities,counts,probabilities
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
negative,77,14.180479,527,12.247269
neutral,236,43.462247,2643,61.422264
positive,230,42.357274,1133,26.330467


In [442]:
no_profit_df = pos[pos['profit_NOUN'] == 0].sort_values(by='label')
profit_df = pos[pos['profit_NOUN'] > 0].sort_values(by='label')

non_profit_vals = no_profit_df['label'].value_counts(normalize=True).sort_index()
profit_vals = profit_df['label'].value_counts(normalize=True).sort_index()

fig = go.Figure()
fig.add_trace(go.Histogram(
    x=no_profit_df.label,
    histnorm='probability density',
    name="No",
    text=np.round(non_profit_vals, 2)
))

fig.add_trace(go.Histogram(
    x=profit_df.label,
    histnorm='probability density',
    name="Yes",
    text=np.round(profit_vals, 2)
))


fig.update_layout(
    title="Sentiment Probabilty Given 'profit'",
    title_x=0.5,
    legend=dict(title="Contains 'profit'"),
    barmode='group',
    yaxis=dict(title="Probability"),
    height=600,
    width=850
)

fig.show()

In [347]:
# Create group by objects for contains "profit" and does not contain "said"
increase_group = pos[pos['increase_VERB'] > 0].groupby('label')
non_increase_group = pos[pos['increase_VERB'] == 0].groupby('label')
increase_totals = increase_group[['label']].agg('count')
non_increase_totals = non_increase_group[['label']].agg('count')

# converting counts to probabilities
non_increase_totals['probablities'] = non_increase_totals['label'] / non_increase_totals.sum().label * 100
increase_totals['probablities'] = increase_totals['label'] / increase_totals.sum().label * 100

# setting multi-index column
increase_totals.columns = pd.Index([('contains increase', 'counts'), ('contains increase', 'probabilities')])

non_increase_totals.columns = pd.Index([("doesn't contain increase", 'counts'), ("doesn't contain increase", 'probabilities')])

increase_table = pd.concat([increase_totals, non_increase_totals], axis=1)

In [348]:
increase_table

Unnamed: 0_level_0,contains increase,contains increase,doesn't contain increase,doesn't contain increase
Unnamed: 0_level_1,counts,probabilities,counts,probabilities
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
neutral,10.0,17.857143,2869,59.895616
positive,46.0,82.142857,1317,27.494781
negative,,,604,12.609603


In [444]:
no_increase_df = pos[pos['increase_VERB'] == 0].sort_values(by='label')
increase_df = pos[pos['increase_VERB'] > 0].sort_values(by='label')

no_increase_vals = no_increase_df['label'].value_counts(normalize=True).sort_index()
increase_vals = increase_df['label'].value_counts(normalize=True).sort_index()

fig = go.Figure()
fig.add_trace(go.Histogram(
    x=no_increase_df.label,
    histnorm='probability density',
    name="No",
    text=np.round(no_increase_vals, 2)
))

fig.add_trace(go.Histogram(
    x=increase_df.label,
    histnorm='probability density',
    name="Yes",
    text=np.round(increase_vals, 2)
))


fig.update_layout(
    title="Sentiment Probabilty Given 'increase'",
    title_x=0.5,
    legend=dict(title="Contains 'increase'"),
    barmode='group',
    yaxis=dict(title="Probability"), 
    height=600,
    width=850
)
fig.show()

In [357]:
# Create group by objects for contains "profit" and does not contain "said"
profit_group = pos[pos['profit_NOUN'] > 0].groupby('label')
non_profit_group = pos[pos['profit_NOUN'] == 0].groupby('label')
profit_totals = profit_group[['label']].agg('count')
non_profit_totals = non_profit_group[['label']].agg('count')

# converting counts to probabilities
non_profit_totals['probablities'] = non_profit_totals['label'] / non_profit_totals.sum().label * 100
profit_totals['probablities'] = profit_totals['label'] / profit_totals.sum().label * 100

# setting multi-index column
profit_totals.columns = pd.Index([('contains profit', 'counts'), ('contains profit', 'probabilities')])

non_profit_totals.columns = pd.Index([("doesn't contain profit", 'counts'), ("doesn't contain profit", 'probabilities')])

profit_table = pd.concat([profit_totals, non_profit_totals], axis=1)

In [388]:
s = profit_table.style.format(precision=0, na_rep='MISSING', thousands=",",
                formatter={('contains profit', 'probabilities'): lambda x: "{:.1f} %".format(x),
                           ("doesn't contain profit", 'probabilities'): lambda x: "{:,.1f} %".format(x)
                          })

In [389]:
cell_hover = {  # for row hover use <tr> instead of <td>
    'selector': 'td:hover',
    'props': [('background-color', '#ffffb3')]
}
index_names = {
    'selector': '.index_name',
    'props': 'font-style: italic; color: darkgrey; font-weight:normal;'
}
headers = {
    'selector': 'th:not(.index_name)',
    'props': 'background-color: #000066; color: white;'
}

# Styling Pandas Tables

This is just a demonstration of how to style a table if you want to show tabular data in a power point or presentation

In [512]:
model_log = torch.load("Objects/Models/model_log.pt")

In [517]:
actual = pd.Index(["negative", "neutral", "positive"], name="actual:")
predicted = pd.Index(["negative", "neutral", "positive"], name="predicted:")

logistic_matrix = model_log[0]['confusion_matrix']

logistic_confusion_df = pd.DataFrame(logistic_matrix, columns=predicted, index=actual)

logistic_style = logistic_confusion_df.style.format(precision=0, na_rep='MISSING', thousands=",",
                formatter={})

logistic_style.set_table_styles([
    {'selector': '.true', 'props': 'background-color: #e6ffe6;'},
    {'selector': '.false', 'props': 'background-color: #ffe6e6;'},
    {'selector': 'th:not(.index_name)', 'props': 'background-color: #000066; color: white;'}
])

cell_color = pd.DataFrame([
    ['true', 'false', 'false'],
    ['false', 'true', 'false'],
    ['false', 'false', 'true']
], index=actual, columns=predicted)

logistic_style.set_td_classes(cell_color)

predicted:,negative,neutral,positive
actual:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,59,10,6
neutral,29,446,87
positive,10,39,160


In [518]:
dfi.export(logistic_style, "BOW_performance.png")

[1105/111059.172618:ERROR:xattr.cc(63)] setxattr org.chromium.crashpad.database.initialized on file /var/folders/0q/2w6362bn3v11r3hw5198hcvw0000gn/T/: Operation not permitted (1)
[1105/111059.184047:ERROR:file_io.cc(91)] ReadExactly: expected 8, observed 0
[1105/111059.186103:ERROR:xattr.cc(63)] setxattr org.chromium.crashpad.database.initialized on file /var/folders/0q/2w6362bn3v11r3hw5198hcvw0000gn/T/: Operation not permitted (1)
[1105/111100.769641:INFO:headless_shell.cc(653)] Written to file /var/folders/0q/2w6362bn3v11r3hw5198hcvw0000gn/T/tmpk5icr3sa/temp.png.
