### Sentiment analysis of ABC News.

Apply the LDA model trained on the Reuters-21578 corpus to the Kaggle ABC News headlines, which all belong to a single category.

In [90]:
import os
from collections import defaultdict
import tqdm
import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import reuters
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# The Reuters corpus download is left disabled; only the VADER lexicon used by
# SentimentIntensityAnalyzer is fetched here.
# nltk.download('reuters')
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/rtaubes/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [91]:
# CSV with the ABC News headlines; the path is relative to the notebook's working directory
DATA_FILE = '../dataset/kaggle/abcnews-2017-06.csv'

In [92]:
# Pickled artefacts of the model fitted on Reuters (presumably the count-vectorizer
# variant, judging by the `_cnt` file names)
LDA_MODEL = 'models/nltk_lda_cnt.pkl'
VECTORIZER = 'models/nltk_vct_cnt.pkl'
TOKENS = 'models/nltk_cnt.pkl'
# A topic column with fewer than MIN_DOCS non-zero entries is merged into 'other_topic'
MIN_DOCS = 30
# Alternative artefact set (presumably the TF-IDF variant); uncomment to switch
# LDA_MODEL = 'models/nltk_lda_tfidf.pkl'
# VECTORIZER = 'models/nltk_vct_tfidf.pkl'
# TOKENS = 'models/nltk_token_tfidf.pkl'

In [93]:
# Restore the fitted LDA model, its vectorizer and the training tokens from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files you produced yourself.
def _load_pickle(path):
    """Return the single object stored in the pickle file at `path`."""
    with open(path, 'rb') as stream:
        return pickle.load(stream)


lda_model = _load_pickle(LDA_MODEL)
vectorizer = _load_pickle(VECTORIZER)
train_tokens = _load_pickle(TOKENS)

# components_ has one row per topic and one column per vocabulary feature
NUM_TOPICS, NUM_FEATURES = lda_model.components_.shape[:2]
NUM_TOPICS, NUM_FEATURES

(90, 16176)

In [94]:
# parse_dates=True only asks pandas to try parsing the *index* as dates, which leaves the
# publish_date column untouched; name the column explicitly so it is parsed as a date.
df = pd.read_csv(DATA_FILE, parse_dates=['publish_date'])

In [95]:
# Preview the first rows to check which columns were loaded
df.head()

Unnamed: 0,publish_date,headline_text,headline_category
0,2017-06-01,abbott calls for special courts for returning ...,any
1,2017-06-01,abductors poured flammable liquid on woman cou...,any
2,2017-06-01,a day in the life of a country vet,any
3,2017-06-01,adelaide shivers through coldest start to winter,any
4,2017-06-01,afl scorecentre port adelaide power hawthorn h...,any


In [96]:
# headline_category is not used by the rest of the notebook (the intro notes that all
# headlines share one category). errors='ignore' keeps this cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
df = df.drop(columns='headline_category', errors='ignore')

In [102]:
# Show topics and feature names known by the model
def print_topics(feature_names, n_top_words):
    """Print the strongest feature names of every topic in the global `lda_model`.

    feature_names: sequence mapping a feature index to its name.
    n_top_words:   how many of the highest-weighted features to list per topic.
    """
    print("topics in LDA model:")
    for topic_idx, weights in enumerate(lda_model.components_):
        # argsort is ascending, so read it backwards to get the largest weights first
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join(feature_names[i] for i in top_indices)
        print("Topic #%d: " % topic_idx + top_words)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed get_feature_names();
# support both so the cell runs with whichever version is installed.
# .tolist() keeps the result a plain list of str, as the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = vectorizer.get_feature_names()
n_top_words = 10
print("the first {} feature names:\n{}".format(n_top_words, tf_feature_names[:n_top_words]))
print_topics(tf_feature_names, n_top_words)

the first 10 feature names:
['---', '--i', '--will', '-agency', '-apr-', '-april', '-based', '-billion-dlr', '-billion-dlr-a-year', '-canada']
topics in LDA model:
Topic #0: bbl outokumpu markka wirsbo kern stripper metallverken three-week inter-city potomac
Topic #1: mooring roy ntt gabon easter wisenbaker pitts oahu superfund decontrol
Topic #2: bank said dollar rate pct rates market growth exchange money
Topic #3: hungary seipp austmet mti asturiana forint shorter villages janos subsistence
Topic #4: says baker volcker tool plywood brussels hike does kernels staple
Topic #5: market analysts american traders stocks said analyst week high likely
Topic #6: rice bureau lbs agriculture oats reserve bancorp portland kenya coastal
Topic #7: daily newspaper circulation zones rebound badly publishing vietnam pigs pipe
Topic #8: fee cane allowance province irrigation rivers transferred widespread fomc delegations
Topic #9: greece aide devaluation aegean nearby greek wages papandreou puts reve

Documents are sets of '**headline_text**' grouped by '**headline_category**' and **time**.<br>
The first goal is to see how documents are grouped by the model.

In [97]:
def get_text_topic(doc):
    """Find the most likely topic for a document.

    doc: a single text, or a list of texts. A bare text is wrapped in a list because
         the vectorizer expects an iterable of documents. When several texts are
         passed, only the first one is scored.

    Returns a tuple (topic index, weight of that topic in the document's distribution).
    Uses the global `vectorizer` and `lda_model`.
    """
    if not isinstance(doc, list):
        doc = [doc]
    tokens = vectorizer.transform(doc)
    # one row per document, one column per topic
    estim = np.asarray(lda_model.transform(tokens))
    # Take the argmax within the first row only: an argmax over the whole 2-D result is a
    # flat index, which is not a valid topic index once there is more than one row.
    first_doc = estim[0]
    idx = first_doc.argmax()
    return (idx, first_doc[idx])  # index and max(distribution)

def topic_name(idx):
    """Return the column name for topic `idx`, e.g. 't07' for 7.

    Raises ValueError when `idx` is not a valid topic index for the loaded model,
    i.e. it is negative or not below NUM_TOPICS. Rejecting negatives avoids producing
    names such as 't-1' that match no topic column.
    """
    if not 0 <= idx < NUM_TOPICS:
        raise ValueError("topic number {} is outside the range 0..{}".format(idx, NUM_TOPICS-1))
    return 't{:02d}'.format(idx)

In [110]:
# Score every headline: pick its most likely LDA topic and compute its VADER polarity.
# Results are collected in NumPy arrays and turned into a DataFrame once at the end;
# inserting the topic columns one by one and writing single cells with .loc is much slower.
topic_columns = [topic_name(inum) for inum in range(NUM_TOPICS)]
topic_scores = np.zeros((len(df), NUM_TOPICS))  # pos - neg, stored under the headline's best topic
news_scores = np.zeros(len(df))                 # 1 if pos > neg, -1 if pos < neg, otherwise 0

sa = SentimentIntensityAnalyzer()
headlines = tqdm.tqdm_notebook(df['headline_text'], desc='texts_by_topics')
for row, text in enumerate(headlines):
    topic_num, _ = get_text_topic(text)
    psc = sa.polarity_scores(text)
    if psc['pos'] > psc['neg']:
        news_scores[row] = 1
    elif psc['pos'] < psc['neg']:
        news_scores[row] = -1
    # every other topic column of this row stays 0.0
    topic_scores[row, topic_num] = psc['pos'] - psc['neg']

# Column order: publish_date, one column per topic, news_score
df_topics = pd.concat(
    [
        df[['publish_date']],
        pd.DataFrame(topic_scores, index=df.index, columns=topic_columns),
        pd.Series(news_scores, index=df.index, name='news_score'),
    ],
    axis=1,
)

HBox(children=(IntProgress(value=0, description='texts_by_topics', max=25046, style=ProgressStyle(description_…

`df_topics` holds the sentiment estimate (positive/neutral/negative) for each text, one estimate per row.<br/>
Combine the columns that have too few values into a single 'other_topic' column.

In [113]:
# Find the topic columns that are backed by too few headlines.
# A headline counts towards a topic only when its stored score is non-zero, so a headline
# whose pos and neg scores are equal (score 0.0) is not counted.
topic_columns = [topic_name(cidx) for cidx in range(NUM_TOPICS)]
nonzero_per_topic = (df_topics[topic_columns] != 0).sum()
non_suff_cols = [col_name for col_name in topic_columns
                 if nonzero_per_topic[col_name] < MIN_DOCS]
print("non-sufficient columns to merge in one:", non_suff_cols)

non-sufficient columns to merge in one: ['t01', 't03', 't11', 't12', 't14', 't15', 't17', 't18', 't21', 't24', 't25', 't30', 't35', 't39', 't42', 't44', 't46', 't47', 't49', 't51', 't53', 't57', 't59', 't60', 't63', 't65', 't66', 't68', 't70', 't73', 't76', 't78', 't87', 't89']


Merge the non-sufficient columns into 'other_topic'. Because each row can hold a non-zero value in at most one topic column,
the merged value is simply the sum of those columns.

In [114]:
# Fold all sparse topic columns into one 'other_topic' column. Starting the sum at 0
# means the column is a plain 0 when there is nothing to merge.
df_topics['other_topic'] = sum((df_topics[name] for name in non_suff_cols), 0)

# Reduced frame without the merged columns; df_topics itself keeps them
df_topics2 = df_topics.drop(columns=non_suff_cols)

In [116]:
# df_topics still contains every tNN column plus 'other_topic';
# only df_topics2 has the sparse topic columns removed
df_topics.columns

Index(['publish_date', 't00', 't01', 't02', 't03', 't04', 't05', 't06', 't07',
       't08', 't09', 't10', 't11', 't12', 't13', 't14', 't15', 't16', 't17',
       't18', 't19', 't20', 't21', 't22', 't23', 't24', 't25', 't26', 't27',
       't28', 't29', 't30', 't31', 't32', 't33', 't34', 't35', 't36', 't37',
       't38', 't39', 't40', 't41', 't42', 't43', 't44', 't45', 't46', 't47',
       't48', 't49', 't50', 't51', 't52', 't53', 't54', 't55', 't56', 't57',
       't58', 't59', 't60', 't61', 't62', 't63', 't64', 't65', 't66', 't67',
       't68', 't69', 't70', 't71', 't72', 't73', 't74', 't75', 't76', 't77',
       't78', 't79', 't80', 't81', 't82', 't83', 't84', 't85', 't86', 't87',
       't88', 't89', 'news_score', 'other_topic'],
      dtype='object')

In [118]:
# Sum every remaining column per publish_date, giving one row per date
df_news = df_topics2.groupby('publish_date').sum()
# df_news.head(), df_news.tail(2)

In [121]:
# Write the per-date scores as CSV into the current working directory
df_news.to_csv('df_abc_news_score.csv')