### Sentiment analysis of Irish news

The Irish news headlines have already been separated into topics by the dataset's authors.

In [1]:
import os
from collections import defaultdict
import tqdm
import pickle
import numpy as np
import pandas as pd
import nltk
# from nltk.corpus import reuters
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# nltk.download('reuters')

In [14]:
# Input CSV with the headlines (relative path).
DATA_FILE = '../dataset/kaggle/irishtimes-2017-06.csv'
MIN_DOCS = 30  # categories with MIN_DOCS or fewer documents are excluded (the filter keeps count > MIN_DOCS)

In [4]:
# Load the headlines. `parse_dates=True` only asks pandas to parse the index,
# which would leave 'publish_date' as plain strings, so name the column explicitly.
df = pd.read_csv(DATA_FILE, parse_dates=['publish_date'])
df.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,2017-06-01,business,More than half of multinationals change tax pl...
1,2017-06-01,business.agribusiness-and-food,Dairy fares best but all farm incomes are far ...
2,2017-06-01,business.commercial-property,IBRC wants a Belfast court to allow it to take...
3,2017-06-01,business.construction,Quarter of adults see homes as inadequate; say...
4,2017-06-01,business.construction,Builders react with alarm to site levy proposal


Documents are sets of '**headline_text**' grouped by '**headline_category**' and **time**.<br>
The first goal is to see how documents are grouped by the model.

In [5]:
# Distinct category labels present in the data, listed alphabetically.
categories = set(df['headline_category'])
sorted(categories)

['business',
 'business.agribusiness-and-food',
 'business.commercial-property',
 'business.companies',
 'business.construction',
 'business.economy',
 'business.energy-and-resources',
 'business.financial-services',
 'business.health-pharma',
 'business.innovation',
 'business.manufacturing',
 'business.markets',
 'business.media-and-marketing',
 'business.personal-finance',
 'business.retail-and-services',
 'business.technology',
 'business.transport-and-tourism',
 'business.work',
 'news.law',
 'news.law.courts',
 'news.law.courts.circuit-court',
 'news.law.courts.coroner-s-court',
 'news.law.courts.criminal-court',
 'news.law.courts.district-court',
 'news.law.courts.high-court',
 'news.law.courts.supreme-court',
 'news.politics',
 'news.politics.oireachtas',
 'news.world',
 'news.world.africa',
 'news.world.asia-pacific',
 'news.world.europe',
 'news.world.middle-east',
 'news.world.uk',
 'news.world.us']

Remove categories that contain only a few documents.

In [12]:
# Count the headlines of every category in a single pass over the column,
# instead of re-filtering the whole frame once per category.
headline_counts = df.headline_category.value_counts()
# Key the result on `categories` so the mapping has exactly the same keys as before;
# a label that value_counts does not report (e.g. NaN) gets a count of 0.
docs_in_cat = {cat: int(headline_counts.get(cat, 0)) for cat in categories}
# Largest categories first.
sorted(docs_in_cat.items(), key=lambda item: item[1], reverse=True)

[('news.politics', 1386),
 ('news.law', 1061),
 ('business.economy', 939),
 ('news.world.europe', 927),
 ('business.technology', 874),
 ('news.world.us', 686),
 ('business.financial-services', 649),
 ('business.transport-and-tourism', 592),
 ('news.world.uk', 483),
 ('news.world.asia-pacific', 427),
 ('business.commercial-property', 424),
 ('business.retail-and-services', 358),
 ('news.law.courts.high-court', 349),
 ('business.media-and-marketing', 329),
 ('business', 304),
 ('news.politics.oireachtas', 302),
 ('business.markets', 291),
 ('news.world.middle-east', 247),
 ('news.world', 222),
 ('business.personal-finance', 219),
 ('news.law.courts.circuit-court', 205),
 ('business.agribusiness-and-food', 203),
 ('business.energy-and-resources', 184),
 ('business.construction', 171),
 ('business.work', 155),
 ('news.world.africa', 148),
 ('business.manufacturing', 137),
 ('business.innovation', 136),
 ('news.law.courts.district-court', 128),
 ('business.health-pharma', 126),
 ('news.law.

`docs_in_cat` shows that this set of news is highly unbalanced. We simply remove the categories where the number of documents is too small.

In [16]:
# Keep only the categories whose document count is strictly greater than MIN_DOCS,
# then show how many categories existed before and after the filter.
categories2 = {
    cat: n_docs
    for cat, n_docs in docs_in_cat.items()
    if n_docs > MIN_DOCS
}
len(categories), len(categories2)

(35, 33)

### Sentiment analysis by categories and in common

In [17]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the 'vader_lexicon' NLTK package; when it is already installed the call
# just reports that it is up to date.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/rtaubes/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




True

Create a new data frame with one column per category; each row holds the positive/negative sentiment estimate of a headline, and the rows are later combined by date.

'other_cat' is the sum over the categories that have too few documents.

In [28]:
# Build the per-headline sentiment frame `df_sn2`:
#   * 'news_score'  -> +1 / -1 / 0 depending on whether the positive or the negative
#                      VADER score dominates (0 when they are equal);
#   * one column per kept category (plus OTHER_CAT) holding `pos - neg` for the
#     headlines of that category and 0.0 for every other headline.
OTHER_CAT = 'other_cat'  # bucket for headlines whose category is not in categories2

# Constructing the analyzer loads the VADER lexicon, so build it once rather
# than once per headline.
sa = SentimentIntensityAnalyzer()

# Note that 'pos' and 'neg' are always non-negative, so their difference is the
# net polarity of a headline. Scoring is inherently per-text, hence the loop
# (kept behind the same progress bar as before).
net_polarity = np.array(
    [
        psc['pos'] - psc['neg']
        for psc in map(sa.polarity_scores,
                       tqdm.tqdm_notebook(df.headline_text, desc='index'))
    ],
    dtype=float,
)

# Category column each headline is routed to; anything not kept in categories2
# (including a missing category) falls into OTHER_CAT.
row_cat = df.headline_category.where(
    df.headline_category.isin(list(categories2)), OTHER_CAT
)

df_sn2 = df.copy()
# sign(pos - neg) is exactly the original +1 / -1 / 0 rule.
df_sn2['news_score'] = np.sign(net_polarity)
# Every column, OTHER_CAT included, is created up front and filled with 0.0 where
# the headline belongs elsewhere, so the schema no longer depends on which rows
# happen to occur and no NaN placeholders are left behind.
for cat in list(categories2) + [OTHER_CAT]:
    df_sn2[cat] = np.where(row_cat == cat, net_polarity, 0.0)

HBox(children=(IntProgress(value=0, description='index', max=12918, style=ProgressStyle(description_width='ini…




In [29]:
# Keep only the date and the numeric score columns: the two text columns are
# not numeric and have no place in the per-date sum computed next.
text_columns = ['headline_category', 'headline_text']
df_sn3 = df_sn2.drop(columns=text_columns)

In [30]:
# Sum every score column over all headlines that share the same 'publish_date'.
df_sn4 = df_sn3.groupby('publish_date').sum()

In [31]:
# Preview the first few per-date totals.
df_sn4.head()

Unnamed: 0_level_0,news_score,business.energy-and-resources,business.innovation,news.law.courts.circuit-court,news.politics,business.agribusiness-and-food,news.politics.oireachtas,news.world.middle-east,business.construction,news.law.courts.district-court,...,business.manufacturing,news.law,news.law.courts.coroner-s-court,business.commercial-property,news.law.courts.criminal-court,news.world,news.world.europe,news.law.courts,news.world.us,other_cat
publish_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-01,-6.0,0.0,0.379,-0.167,0.471,0.206,-0.943,0.0,-0.262,-0.302,...,-0.145,-1.24,0.0,0.119,-0.852,0.0,0.18,0.0,-0.035,0.066
2017-06-02,-1.0,0.0,0.0,0.216,1.048,0.0,0.219,0.0,0.0,0.0,...,0.0,-0.584,-0.095,0.0,-0.615,0.252,-0.321,0.0,0.456,-0.449
2017-06-03,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.661,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.31,0.0,0.298,0.0
2017-06-04,-12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.52,0.0,0.0,0.0,0.0
2017-06-05,-5.0,0.342,0.0,0.0,0.0,0.0,0.0,-0.022,0.0,0.0,...,0.0,-0.223,0.0,-0.231,0.0,0.0,-0.247,0.0,0.0,0.0


In [32]:
# Persist the per-date scores; the relative file name means the CSV is written
# to the current working directory.
df_sn4.to_csv('df_irish_news_score.csv')