### Perform analysis on the sentences itself (those for which we have labels)


In [27]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from collections import Counter
import numpy as np

In [2]:
os.getcwd()

'C:\\Users\\basti\\PycharmProjects\\ai_thesis\\data_analysis'

In [3]:
sample_200 = pd.read_csv('../files/datasets/labeled/l01_reuters_sample200.csv')
sample_800 = pd.read_csv('../files/datasets/labeled/l02_reuters_sample800.csv')

In [4]:
labeled_sentences = pd.concat([sample_200, sample_800])
labeled_sentences.drop('Unnamed: 0', axis=1)

Unnamed: 0,sentence,id,article_id,valence,arousal,is_annotated,is_miscellaneous,rank_value,is_drafted,rank_idx,draft_time
0,The automaker also said it is considering intr...,186,43,0.64,0.51,True,False,0.01,,,
1,It fell 12 percent last year.,274,63,0.39,0.49,True,False,0.66,,,
2,Dozens of companies in the sector merged in th...,2540,615,0.50,0.70,True,False,0.87,,,
3,Annual revenues for the government IT business...,3313,782,0.36,0.54,True,False,0.67,,,
4,The Swedish crown rose versus the dollar and e...,4024,936,0.55,0.39,True,False,0.34,,,
...,...,...,...,...,...,...,...,...,...,...,...
795,OTHER BIDDERS A Pfizer move on As...,109689,16753,0.58,0.64,True,False,,True,504.0,2022-08-25T09:40:51.0593080
796,"The driver of a 2003 CTS, in a co...",109823,16770,0.32,0.55,True,False,,True,263.0,2022-06-23T09:57:25.3611610
797,Barra also met with accident victims' familie...,110083,16907,0.39,0.58,True,False,,True,372.0,2022-08-25T09:26:12.3074480
798,The additional 14 deaths occurred after the 13...,110089,16907,0.23,0.69,True,False,,True,416.0,2022-07-19T16:17:36.8375170


In [5]:
# load spacy model
nlp = spacy.load('en_core_web_sm')

In [6]:
labeled_sentences['sentence_nlp'] = labeled_sentences['sentence'].apply(nlp)

In [9]:
# add number of tokens
labeled_sentences['num_tokens'] = labeled_sentences['sentence_nlp'].apply(len)
labeled_sentences.head()

Unnamed: 0.1,Unnamed: 0,sentence,id,article_id,valence,arousal,is_annotated,is_miscellaneous,rank_value,is_drafted,rank_idx,draft_time,sentence_nlp,num_tokens
0,0.0,The automaker also said it is considering intr...,186,43,0.64,0.51,True,False,0.01,,,,"(The, automaker, also, said, it, is, consideri...",21
1,1.0,It fell 12 percent last year.,274,63,0.39,0.49,True,False,0.66,,,,"(It, fell, 12, percent, last, year, .)",7
2,2.0,Dozens of companies in the sector merged in th...,2540,615,0.5,0.7,True,False,0.87,,,,"(Dozens, of, companies, in, the, sector, merge...",23
3,3.0,Annual revenues for the government IT business...,3313,782,0.36,0.54,True,False,0.67,,,,"(Annual, revenues, for, the, government, IT, b...",31
4,4.0,The Swedish crown rose versus the dollar and e...,4024,936,0.55,0.39,True,False,0.34,,,,"(The, Swedish, crown, rose, versus, the, dolla...",39


In [13]:
def count_words(tweet):
  word_frequencies = Counter()
  words = []
  for token in tweet:
    if not token.is_punct:
      words.append(token.text)
  word_frequencies.update(words)
  return word_frequencies

In [14]:
labeled_sentences['word_dict'] = labeled_sentences['sentence_nlp'].apply(count_words)
labeled_sentences.head()

Unnamed: 0.1,Unnamed: 0,sentence,id,article_id,valence,arousal,is_annotated,is_miscellaneous,rank_value,is_drafted,rank_idx,draft_time,sentence_nlp,num_tokens,word_dict
0,0.0,The automaker also said it is considering intr...,186,43,0.64,0.51,True,False,0.01,,,,"(The, automaker, also, said, it, is, consideri...",21,"{'The': 1, 'automaker': 1, 'also': 1, 'said': ..."
1,1.0,It fell 12 percent last year.,274,63,0.39,0.49,True,False,0.66,,,,"(It, fell, 12, percent, last, year, .)",7,"{'It': 1, 'fell': 1, '12': 1, 'percent': 1, 'l..."
2,2.0,Dozens of companies in the sector merged in th...,2540,615,0.5,0.7,True,False,0.87,,,,"(Dozens, of, companies, in, the, sector, merge...",23,"{'Dozens': 1, 'of': 2, 'companies': 1, 'in': 2..."
3,3.0,Annual revenues for the government IT business...,3313,782,0.36,0.54,True,False,0.67,,,,"(Annual, revenues, for, the, government, IT, b...",31,"{'Annual': 1, 'revenues': 1, 'for': 1, 'the': ..."
4,4.0,The Swedish crown rose versus the dollar and e...,4024,936,0.55,0.39,True,False,0.34,,,,"(The, Swedish, crown, rose, versus, the, dolla...",39,"{'The': 1, 'Swedish': 1, 'crown': 1, 'rose': 1..."


In [16]:
def total_wc(word_dict):
  return sum(word_dict.values())

In [17]:
labeled_sentences['num_words'] = labeled_sentences['word_dict'].apply(total_wc)
labeled_sentences.head()

Unnamed: 0.1,Unnamed: 0,sentence,id,article_id,valence,arousal,is_annotated,is_miscellaneous,rank_value,is_drafted,rank_idx,draft_time,sentence_nlp,num_tokens,word_dict,num_words
0,0.0,The automaker also said it is considering intr...,186,43,0.64,0.51,True,False,0.01,,,,"(The, automaker, also, said, it, is, consideri...",21,"{'The': 1, 'automaker': 1, 'also': 1, 'said': ...",19
1,1.0,It fell 12 percent last year.,274,63,0.39,0.49,True,False,0.66,,,,"(It, fell, 12, percent, last, year, .)",7,"{'It': 1, 'fell': 1, '12': 1, 'percent': 1, 'l...",6
2,2.0,Dozens of companies in the sector merged in th...,2540,615,0.5,0.7,True,False,0.87,,,,"(Dozens, of, companies, in, the, sector, merge...",23,"{'Dozens': 1, 'of': 2, 'companies': 1, 'in': 2...",22
3,3.0,Annual revenues for the government IT business...,3313,782,0.36,0.54,True,False,0.67,,,,"(Annual, revenues, for, the, government, IT, b...",31,"{'Annual': 1, 'revenues': 1, 'for': 1, 'the': ...",28
4,4.0,The Swedish crown rose versus the dollar and e...,4024,936,0.55,0.39,True,False,0.34,,,,"(The, Swedish, crown, rose, versus, the, dolla...",39,"{'The': 1, 'Swedish': 1, 'crown': 1, 'rose': 1...",36


In [18]:
def total_types(word_dict):
  return len(word_dict.keys())

In [19]:
labeled_sentences['num_types'] = labeled_sentences['word_dict'].apply(total_types)
labeled_sentences.head()

Unnamed: 0.1,Unnamed: 0,sentence,id,article_id,valence,arousal,is_annotated,is_miscellaneous,rank_value,is_drafted,rank_idx,draft_time,sentence_nlp,num_tokens,word_dict,num_words,num_types
0,0.0,The automaker also said it is considering intr...,186,43,0.64,0.51,True,False,0.01,,,,"(The, automaker, also, said, it, is, consideri...",21,"{'The': 1, 'automaker': 1, 'also': 1, 'said': ...",19,19
1,1.0,It fell 12 percent last year.,274,63,0.39,0.49,True,False,0.66,,,,"(It, fell, 12, percent, last, year, .)",7,"{'It': 1, 'fell': 1, '12': 1, 'percent': 1, 'l...",6,6
2,2.0,Dozens of companies in the sector merged in th...,2540,615,0.5,0.7,True,False,0.87,,,,"(Dozens, of, companies, in, the, sector, merge...",23,"{'Dozens': 1, 'of': 2, 'companies': 1, 'in': 2...",22,17
3,3.0,Annual revenues for the government IT business...,3313,782,0.36,0.54,True,False,0.67,,,,"(Annual, revenues, for, the, government, IT, b...",31,"{'Annual': 1, 'revenues': 1, 'for': 1, 'the': ...",28,22
4,4.0,The Swedish crown rose versus the dollar and e...,4024,936,0.55,0.39,True,False,0.34,,,,"(The, Swedish, crown, rose, versus, the, dolla...",39,"{'The': 1, 'Swedish': 1, 'crown': 1, 'rose': 1...",36,35


In [20]:
# compute average word length
def avg_word_length(word_dict):
  total_word_length = sum([len(word) for word in word_dict.keys()])
  return total_word_length/sum(word_dict.values())

In [21]:
labeled_sentences['avg_word_length'] = labeled_sentences['word_dict'].apply(avg_word_length)
labeled_sentences.head()

Unnamed: 0.1,Unnamed: 0,sentence,id,article_id,valence,arousal,is_annotated,is_miscellaneous,rank_value,is_drafted,rank_idx,draft_time,sentence_nlp,num_tokens,word_dict,num_words,num_types,avg_word_length
0,0.0,The automaker also said it is considering intr...,186,43,0.64,0.51,True,False,0.01,,,,"(The, automaker, also, said, it, is, consideri...",21,"{'The': 1, 'automaker': 1, 'also': 1, 'said': ...",19,19,5.315789
1,1.0,It fell 12 percent last year.,274,63,0.39,0.49,True,False,0.66,,,,"(It, fell, 12, percent, last, year, .)",7,"{'It': 1, 'fell': 1, '12': 1, 'percent': 1, 'l...",6,6,3.833333
2,2.0,Dozens of companies in the sector merged in th...,2540,615,0.5,0.7,True,False,0.87,,,,"(Dozens, of, companies, in, the, sector, merge...",23,"{'Dozens': 1, 'of': 2, 'companies': 1, 'in': 2...",22,17,3.772727
3,3.0,Annual revenues for the government IT business...,3313,782,0.36,0.54,True,False,0.67,,,,"(Annual, revenues, for, the, government, IT, b...",31,"{'Annual': 1, 'revenues': 1, 'for': 1, 'the': ...",28,22,3.785714
4,4.0,The Swedish crown rose versus the dollar and e...,4024,936,0.55,0.39,True,False,0.34,,,,"(The, Swedish, crown, rose, versus, the, dolla...",39,"{'The': 1, 'Swedish': 1, 'crown': 1, 'rose': 1...",36,35,4.5


In [22]:
def extract_dict(df, dict_col):
  word_dict = {}
  for i, row in df.iterrows():
    word_dict.update(row[dict_col])
  return word_dict

In [23]:

word_dict = {}

for i, row in labeled_sentences.iterrows():
  word_dict.update(row['word_dict'])


In [30]:
num_tokens = np.sum(labeled_sentences['num_tokens'].values)
num_words = sum(labeled_sentences['num_words'].values)

res_dict = {
    'num_tokens': num_tokens ,
    'num_words': num_words,
    'num_types': len(word_dict.keys()),
    'avg_words_per_tweet': num_words/len(labeled_sentences),
    'avg_word_length': np.mean(labeled_sentences['avg_word_length'].values)
}

In [36]:
# convert to latex output

res_df = pd.DataFrame([res_dict])
print(res_df.T.to_latex())

\begin{tabular}{lr}
\toprule
{} &             0 \\
\midrule
num\_tokens          &  27772.000000 \\
num\_words           &  24649.000000 \\
num\_types           &   5944.000000 \\
avg\_words\_per\_tweet &     24.649000 \\
avg\_word\_length     &      4.719246 \\
\bottomrule
\end{tabular}



  print(res_df.T.to_latex())
