In [146]:
import pandas as pd
import spacy
import gensim
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pyLDAvis
import pyLDAvis.gensim 

import warnings
warnings.filterwarnings('ignore')

In [6]:
news = pd.read_csv('./corona_news_scraped_on_19th_march.csv')
news.head()

Unnamed: 0,datetime,text
0,"17:58 (IST), Mar 19",Those asked to undergo quarantine should follo...
1,"17:57 (IST), Mar 19",We have succeeded in keeping coronavirus at co...
2,"17:55 (IST), Mar 19",201 Indians evacuated from Iran yesterday: Avi...
3,"17:55 (IST), Mar 19",Vistara to temporarily suspend international o...
4,"17:52 (IST), Mar 19","Lav Agarwal, Joint Secretary, Health Ministry:..."


In [7]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 2 columns):
datetime    285 non-null object
text        285 non-null object
dtypes: object(2)
memory usage: 4.5+ KB


In [39]:
nlp = spacy.load('en_core_web_sm')

### Cleaning up for NLP tasks

In [94]:
# Remove HTML Tags
def remove_html(text):
    soup = BeautifulSoup(text,'lxml')
    html_free_text = soup.get_text()
    return html_free_text

# Remove Punctuations
def punctuation_remover(text):
    punctuation_free_text = " ".join([char for char in text if char \
                                    not in string.punctuation])
    return punctuation_free_text


# Stop Word Removal
cached_stop_words = stopwords.words('english') # Provides 70 X Speedup
def stop_words_remover(text):
    text = text.lower().split()
    words = [word for word in text if \
             word not in cached_stop_words]
    return words

# Lemmatization
def lemmatize_words(text):
    words = nlp(text)
    return [word.lemma_ for word in words if word.lemma_ != '-PRON-']    
        

In [95]:
news['cleaned_text'] = news.text.apply(remove_html)
news['cleaned_text'] = news.cleaned_text.apply(stop_words_remover)
news['cleaned_text'] = news.cleaned_text.apply(punctuation_remover)
news['cleaned_text'] = news.cleaned_text.apply(lemmatize_words)

In [96]:
news.head()

Unnamed: 0,datetime,text,cleaned_text
0,"17:58 (IST), Mar 19",Those asked to undergo quarantine should follo...,"[ask, undergo, quarantine, follow, rule, ,, el..."
1,"17:57 (IST), Mar 19",We have succeeded in keeping coronavirus at co...,"[succeed, keep, coronavirus, containment, leve..."
2,"17:55 (IST), Mar 19",201 Indians evacuated from Iran yesterday: Avi...,"[201, indian, evacuate, iran, yesterday, :, av..."
3,"17:55 (IST), Mar 19",Vistara to temporarily suspend international o...,"[vistara, temporarily, suspend, international,..."
4,"17:52 (IST), Mar 19","Lav Agarwal, Joint Secretary, Health Ministry:...","[lav, agarwal, ,, joint, secretary, ,, health,..."


### Topic Modelling

In [97]:
# Create Corpora
dictionary = corpora.Dictionary(news.cleaned_text)

# Create document term Matrix (Bag of words)
document_bow = [dictionary.doc2bow(doc) for doc in news.cleaned_text ]

In [100]:
lda_model = LdaModel(corpus=document_bow ,num_topics=10, id2word=dictionary,passes = 100)

In [101]:
for i, topic in lda_model.print_topics(-1):
    print('Topic: {} Words :{}'.format(i,topic))

Topic: 0 Words :0.059*"," + 0.032*"." + 0.019*"coronavirus" + 0.014*"new" + 0.012*"report" + 0.011*":" + 0.010*"covid-19" + 0.010*"say" + 0.009*"case" + 0.008*"china"
Topic: 1 Words :0.022*":" + 0.020*"coronavirus" + 0.015*"march" + 0.013*"india" + 0.012*"international" + 0.012*"22" + 0.012*"land" + 0.012*"schedule" + 0.010*"level" + 0.010*"commercial"
Topic: 2 Words :0.031*"." + 0.031*"," + 0.015*"march" + 0.013*"public" + 0.013*"say" + 0.012*"state" + 0.011*"transport" + 0.010*"till" + 0.010*"coronavirus" + 0.008*"""
Topic: 3 Words :0.017*"coronavirus" + 0.017*":" + 0.013*"new" + 0.013*"," + 0.012*"case" + 0.009*"confirm" + 0.009*"." + 0.009*"india" + 0.007*"covid-19" + 0.007*"quarantine"
Topic: 4 Words :0.044*"," + 0.023*":" + 0.022*"." + 0.020*"coronavirus" + 0.018*"-" + 0.011*"say" + 0.011*"uddhav" + 0.009*"thackeray" + 0.008*""" + 0.008*"march"
Topic: 5 Words :0.025*"-" + 0.023*"," + 0.022*"." + 0.013*"coronavirus" + 0.011*"say" + 0.009*"passenger" + 0.008*"'" + 0.008*")" + 0.008

### Sentiment Analysis

In [130]:
def test_sentiment_analyzer(use_textblob = True ,use_vader=False):
    for sentence in news.text[50:60]:
        print(sentence,'\n')
        analyzed_sentence = TextBlob(sentence) if use_textblob==True  \
                             else SentimentIntensityAnalyzer().polarity_scores(sentence)
        print(analyzed_sentence.sentiment if use_textblob==True else analyzed_sentence,'\n')

In [131]:
test_sentiment_analyzer(use_textblob=True)

MEA briefs media on coronavirus 

Sentiment(polarity=0.0, subjectivity=0.0) 

One more person has been tested positive in Tamil Nadu, says health minister C Vijaya Baskar 

Sentiment(polarity=0.36363636363636365, subjectivity=0.5227272727272727) 

Chhattisgarh CM Bhupesh Baghel: Section 144 has been imposed in Raipur and all other municipal corporation areas in the state to prohibit large gatherings 

Sentiment(polarity=0.04464285714285714, subjectivity=0.4017857142857143) 

Work from home, staggered working hours for central govt employeesThe Central government on Thursday allowed 50 per cent of its employees to work from home and the remaining to attend office every day besides implementing different time slots for working hours in its effort to further check the spread of the coronavirus disease. In the first week, heads of departments have been asked to include only staff who reside in close proximity to their office or use own transport to travel to office. It suggested that three

In [132]:
test_sentiment_analyzer(use_textblob=False,use_vader=True)

MEA briefs media on coronavirus 

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} 

One more person has been tested positive in Tamil Nadu, says health minister C Vijaya Baskar 

{'neg': 0.0, 'neu': 0.795, 'pos': 0.205, 'compound': 0.5574} 

Chhattisgarh CM Bhupesh Baghel: Section 144 has been imposed in Raipur and all other municipal corporation areas in the state to prohibit large gatherings 

{'neg': 0.053, 'neu': 0.947, 'pos': 0.0, 'compound': -0.0772} 

Work from home, staggered working hours for central govt employeesThe Central government on Thursday allowed 50 per cent of its employees to work from home and the remaining to attend office every day besides implementing different time slots for working hours in its effort to further check the spread of the coronavirus disease. In the first week, heads of departments have been asked to include only staff who reside in close proximity to their office or use own transport to travel to office. It suggested that three groups of 

#### _We will use vader sentiment analyzer as results are somewhat better_

In [137]:
def sentiment_calculator(text):
    analyzer =  SentimentIntensityAnalyzer().polarity_scores(text)
    return analyzer['compound']

news['sentiment_score'] = news.text.apply(sentiment_calculator)

In [141]:
label = ( lambda x : 'positive' if x > 0.05 else 'negative' )

news['sentiment'] = news.sentiment_score.apply(label)

In [147]:
lda_display = pyLDAvis.gensim.prepare(lda_model,document_bow,dictionary,sort_topics=False)
pyLDAvis.display(lda_display)