# Objective : Identify topics or tags/labels from news articles and map to prospective sales vertical in a prescriptive way.

Author : 
Praveen Vijayan ,
TCS A&I - Pune

# Importing libraries

In [144]:
import csv
import re
import codecs
import os
import spacy
from __future__ import unicode_literals
import pandas as pd
import itertools as it
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence


In [145]:
from __future__ import print_function
import numpy as np
import sys
from time import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import string
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD

In [146]:
nlp = spacy.load('en_core_web_sm')

In [147]:
# read CSV file & load into list
with open("sub_News_data_train.csv",'r',encoding="utf8") as my_file:
    reader = csv.reader(my_file, delimiter=',')
    news_data_train = list(reader)

# Exploring the data structure

In [148]:
print(news_data_train[3])

['1', 'Nortel shares down following news of RCMP probe', 'TORONTO - Shares of Nortel Networks dipped on the TSX in the wake of news the RCMP has launched an investigation into accounting practices at the telecommunications company. ']


In [149]:
print("class label : " + news_data_train[3][0])

class label : 1


In [150]:
print("Heading : " + news_data_train[3][1])

Heading : Nortel shares down following news of RCMP probe


In [151]:
print("Content : " +news_data_train[3][2])

Content : TORONTO - Shares of Nortel Networks dipped on the TSX in the wake of news the RCMP has launched an investigation into accounting practices at the telecommunications company. 


In [152]:
stopwords_file = open("stopwords.txt", "r" , encoding="utf8") 
stopwords = stopwords_file.read()

In [153]:
STOPWORD_customized = {'will','new','york','will' ,'quot','year ','company ','week', 'one','two','three','reuters','reuters' , 'monday','tuesday','wednesday','thursday','friday','saturday','sunday','yesterday'}

In [154]:
#label = [] # class
#text_doc = [] # news text 
text_all = ""
c =0 # to control count of 
for i in news_data_train:
    c= c+1
    if c < 3000 :
        #label.append(i[0])            
        merged = i[1]+" : "+i[2] # combining header and content 
        lowers = merged.lower()   # to lower      
        no_punctuation = re.sub(r'[^\w\s]',' ', lowers)    # remove punctuation
        temp = ' '.join(word for word in no_punctuation.split() if len(word)>3) # remove samll words
        filtered = ' '.join(word for word in temp.split() if word not in stopwords.split()) # remove stopwords
        filtered = ' '.join(word for word in filtered.split() if word not in STOPWORD_customized) # remove stopwords
        temp = filtered
        #text_doc.append(temp)
        text_all = text_all + temp + ". " 

In [155]:
del news_data_train

In [156]:
%%time
parsed_review = nlp(text_all)

Wall time: 14.2 s


In [157]:
del text_all

# Entity identification 

In [158]:
Entity_list = ""
for num, entity in enumerate(parsed_review.ents):
    #print('Entity {}:'.format(num + 1), entity, '-', entity.label_)
    #print('')
    Entity_list = str(Entity_list) + str(entity) + " : " + entity.label_ + "; "

In [159]:
review_txt_filepath = "sub_News_data_train.csv"

In [160]:
Entity_list.split(';')

['just days : DATE',
 ' 2004 : DATE',
 ' thornton : CARDINAL',
 ' billion : CARDINAL',
 ' deal 2002 : DATE',
 ' seven : CARDINAL',
 ' 3500 cent : MONEY',
 ' annual : DATE',
 ' millions : CARDINAL',
 ' 2004 : DATE',
 ' millions dollars : MONEY',
 ' march : DATE',
 ' british : NORP',
 ' weekend : DATE',
 ' billion : CARDINAL',
 ' million : CARDINAL',
 ' million : CARDINAL',
 ' million : CARDINAL',
 ' million : CARDINAL',
 ' quarterly : DATE',
 ' quarterly : DATE',
 ' second quarter : DATE',
 ' quarterly : DATE',
 ' second : ORDINAL',
 ' fourth quarter : DATE',
 ' september 2004 : DATE',
 ' fourth quarter year 2004 : DATE',
 ' 2001 : DATE',
 ' 2003 : DATE',
 ' annual : DATE',
 ' current quarter : DATE',
 ' month : DATE',
 ' 1999 : DATE',
 ' billion dollars : MONEY',
 ' 75bn : CARDINAL',
 ' 1999 : DATE',
 ' 1999 : DATE',
 ' nearly trillion : CARDINAL',
 ' 1999 : DATE',
 ' 1999 : DATE',
 ' 1999 : DATE',
 ' fourth : ORDINAL',
 ' 1999 2003 : DATE',
 ' nearly trillion : CARDINAL',
 ' 1996 : DA

# part_of_speech

In [161]:
token_text = [token.orth_ for token in parsed_review]
token_pos = [token.pos_ for token in parsed_review]

pd.DataFrame(list(zip(token_text, token_pos)),columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,krispy,CCONJ
1,kreme,VERB
2,leaves,NOUN
3,chicago,ADV
4,krispy,VERB
5,kreme,VERB
6,doughnuts,NOUN
7,href,NOUN
8,http,VERB
9,investor,NOUN


In [162]:
token_lemma = [token.lemma_ for token in parsed_review]
token_shape = [token.shape_ for token in parsed_review]

pd.DataFrame(list(zip(token_text, token_lemma, token_shape)),columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,krispy,krispy,xxxx
1,kreme,kreme,xxxx
2,leaves,leave,xxxx
3,chicago,chicago,xxxx
4,krispy,krispy,xxxx
5,kreme,kreme,xxxx
6,doughnuts,doughnut,xxxx
7,href,href,xxxx
8,http,http,xxxx
9,investor,investor,xxxx


In [163]:
# parsed_NLP_text.txt

In [164]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space 

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [165]:
unigram_sentences_filepath = os.path.join('unigram_sentences_all.txt')

In [166]:
%%time
with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus('parsed_NLP_text.txt'):
            f.write(sentence + '\n')

Wall time: 16.3 s


In [167]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [168]:
bigram_model_filepath = 'bigram_model_all.txt'

In [169]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

Wall time: 173 ms


In [170]:
bigram_sentences_filepath  = 'bigram_model_all.txt'

In [171]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    with codecs.open(bigram_sentences_filepath, 'w' , encoding= 'utf-8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')



Wall time: 253 ms


In [172]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [173]:
trigram_model_filepath = 'trigram_model_all.txt'

In [174]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

Wall time: 259 ms


In [175]:
trigram_sentences_filepath = 'trigram_sentences_all.txt'

In [176]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [177]:
trigram_reviews_filepath = 'trigram_transformed_reviews_all.txt'

# From normal text to trigram  

In [180]:
review_txt_filepath = 'parsed_NLP_text.txt' # "sub_News_data_train.csv"


In [181]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                      batch_size=1000, n_threads=4):
            
            #merged = parsed_review # combining header and content 
            #lowers = merged.lower()   # to lower      
            #no_punctuation = re.sub(r'[^\w\s]',' ', lowers)    # remove punctuation
            #temp = ' '.join(word for word in no_punctuation.split() if len(word)>3) # remove samll words
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in nlp.Defaults.stop_words]
            
            
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')



Wall time: 16.3 s


In [182]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
#import cPickle as pickle

In [183]:
trigram_dictionary_filepath = 'trigram_dict_all.dict'

In [184]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 1 == 1:

    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    #trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

Wall time: 123 ms


In [185]:
trigram_bow_filepath = 'trigram_bow_corpus_all.mm'

In [186]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [187]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 1 == 1:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

Wall time: 222 ms


In [188]:
lda_model_filepath = 'lda_model_all.txt'

In [208]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 1 == 1:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=10,
                           id2word=trigram_dictionary,
                           workers=4)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

Wall time: 39.7 s


In [209]:
def explore_topic(topic_number, topn=10):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [247]:
#explore_topic(topic_number=3)

In [211]:
LDAvis_data_filepath = 'ldavis_prepared.txt'

In [212]:
%%time
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,trigram_dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


Wall time: 8.21 s


In [213]:
pyLDAvis.display(LDAvis_prepared)

# Topic of a given text 

In [270]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    #trigram_review = [term for term in trigram_review
    #                  if not term in spacy.en.STOPWORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    #review_lda = sorted(review_lda, key=lambda (topic_number, freq): -freq)
    
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        print("The text belongs to topic (with probability): ")
        print('{:25} {}'.format(topic_number,
                                round(freq, 3)))

In [267]:
def get_sample_review(review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(review_txt_filepath),
                          review_number, review_number+1))[0]

In [268]:
sample_review = get_sample_review(500)
print(sample_review)

threat airbus jobsa pact spelling state given rival aircraft makers airbus boeing renegotiated issue unlikely pose immediate threat jobs aerospace workers merseyside. 



In [269]:
lda_description(sample_review)



The text belongs to topic (with probability): 
                        7 0.955


In [248]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(bigram_sentences_filepath)
word2vec_filepath = 'word2vec_model_all.txt'

In [249]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the word2vec model yourself.
if 1 == 1:

    # initiate the model and perform the first epoch of training
    news2vec = Word2Vec(trigram_sentences, size=100, window=5,
                        min_count=20, sg=1, workers=1)
    
    news2vec.save(word2vec_filepath)

    # perform another 11 epochs of training
    for i in range(1,12):

        news2vec.train(trigram_sentences,total_examples=1000,epochs=i)
        news2vec.save(word2vec_filepath)
        
# load the finished model from disk
news2vec = Word2Vec.load(word2vec_filepath)
news2vec.init_sims()

print(u'{} training epochs so far.'.format(news2vec.train_count))

12 training epochs so far.
Wall time: 8.63 s


# Get related words using word2vec

In [250]:
def get_related_terms(token, topn=15):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in news2vec.most_similar(positive=[token], topn=topn):

        print(u'{:20} {}'.format(word, round(similarity, 3)))

In [246]:
get_related_terms(u'airbus')

boeing               0.677
aircraft_maker       0.675
european_union       0.667
plane                0.604
world_trade          0.579
subsidy              0.573
aircraft             0.557
boe                  0.557
agreement            0.493
delivery             0.49
launch               0.482
battle               0.467
dispute              0.456
organization         0.451
shareholder          0.43


In [242]:
get_related_terms(u'aircraft')

plane                0.626
boeing               0.61
delivery             0.559
airbus               0.557
dispute              0.539
european_union       0.526
carrier              0.526
boe                  0.517
aircraft_maker       0.496
airline              0.475
reduce               0.468
battle               0.467
airway               0.457
commercial_aircraft  0.449
defense              0.448


In [243]:
get_related_terms(u'citigroup')

financial_service    0.645
private_bank         0.579
bank                 0.488
regulator            0.464
japanese             0.459
investment           0.452
unit                 0.448
japan                0.445
fine                 0.44
morgan               0.43
investment_bank      0.423
time_warner          0.417
parmalat             0.414
british              0.413
regulatory           0.412


In [245]:
get_related_terms(u'crude')

price                0.708
barrel               0.671
ahead_winter         0.615
near_barrel          0.608
close_barrel         0.607
global_supply        0.604
high                 0.603
london               0.578
fresh_record         0.559
winter_heat          0.557
inventory            0.551
gasoline             0.535
record_high          0.524
rebound              0.511
ease                 0.508
