## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing libraries for sentiment analysis

import spacy
import nltk
import re
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Shakti\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# 1. Data Understanding and Cleaning

In [2]:
# Load the raw news-headlines CSV into `dft` and preview the first rows.
dft=pd.read_csv('india-news-headlines.csv')
dft.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [25]:
# (rows, columns) of the loaded frame.
dft.shape

(3650970, 3)

In [26]:
# Column dtypes and memory footprint of the loaded frame.
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650970 entries, 0 to 3650969
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   publish_date       int64 
 1   headline_category  object
 2   headline_text      object
dtypes: int64(1), object(2)
memory usage: 83.6+ MB


In [27]:
# Count missing values per column; the result shows no nulls in any column.
dft.isnull().sum()


publish_date         0
headline_category    0
headline_text        0
dtype: int64

In [5]:
# Convert publish_date from a YYYYMMDD integer to a proper datetime.
# astype(str) is the vectorised equivalent of apply(lambda x: str(x)) and
# avoids a Python-level callback for each of the ~3.6M rows.
dft['publish_date'] = pd.to_datetime(dft['publish_date'].astype(str), format='%Y%m%d')

dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650970 entries, 0 to 3650969
Data columns (total 3 columns):
 #   Column             Dtype         
---  ------             -----         
 0   publish_date       datetime64[ns]
 1   headline_category  object        
 2   headline_text      object        
dtypes: datetime64[ns](1), object(2)
memory usage: 83.6+ MB


In [10]:
# Latest publish_date present in the data (i.e. the last day covered).
dft.publish_date.max()

Timestamp('2022-03-31 00:00:00')

#### Dropping headline_category, as the presence of 'unknown' entries makes it uninformative.

In [11]:
# Drop the category column by keyword: passing `axis` positionally
# (drop('headline_category', 1)) was deprecated in pandas 1.3 and is a
# TypeError from pandas 2.0 onwards.
dft = dft.drop(columns='headline_category')
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650970 entries, 0 to 3650969
Data columns (total 2 columns):
 #   Column         Dtype         
---  ------         -----         
 0   publish_date   datetime64[ns]
 1   headline_text  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 55.7+ MB


#### Last date entry for news headlines is 31st March 2022


#### Grouping all news for a date into a single entry so that a day-wise sentiment report can be produced.

In [13]:
# Replace every row's headline with the space-joined text of all headlines
# published on the same date. The text column is selected explicitly so the
# transform yields a Series and keeps working if other columns are present.
dft['headline_text'] = dft.groupby('publish_date')['headline_text'].transform(' '.join)

# After the transform all rows of a date are identical, so keeping the first
# row per date is equivalent to a full-row drop_duplicates() but avoids
# hashing millions of very long strings.
dft = dft.drop_duplicates(subset='publish_date')
dft.shape

# ~36 lakh (3.65 million) rows collapse to ~7.7k rows, one per day.

(7717, 2)

In [14]:
# Preview the per-day frame (index labels are still those of the first row of each date).
dft.head()

Unnamed: 0,publish_date,headline_text
0,2001-01-02,Status quo will not be disturbed at Ayodhya; s...
86,2001-01-03,Powerless north India gropes in the dark Think...
127,2001-01-04,The string that pulled Stephen Hawking to Indi...
280,2001-01-05,Light combat craft takes India into club class...
406,2001-01-06,Light combat craft takes India into club class...


#### Removing Special Characters from data

In [15]:
#Defining a function to carry out text cleaning
def remove_special_characters(text, remove_digits=False):
    """Strip special characters (and optionally digits) from ``text``.

    Parameters
    ----------
    text : str
        The string to clean.
    remove_digits : bool, default False
        If False, only ASCII punctuation characters are removed; letters,
        digits and whitespace are kept.
        If True, everything except ASCII letters and whitespace is removed
        (digits and punctuation included).

    Returns
    -------
    str
        ``text`` with the unwanted characters deleted. Whitespace is left
        untouched, so runs of spaces may remain where characters were removed.
    """
    if remove_digits:
        # Must be A-Z, not A-z: the range A-z also covers [ \ ] ^ _ ` and
        # would let those characters survive the cleaning.
        pattern = r'[^a-zA-Z\s]'
    else:
        # The punctuation characters have to be wrapped in a character class
        # and escaped; used verbatim as a regex they do not even compile.
        pattern = '[' + re.escape('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~') + ']'
    return re.sub(pattern, '', text)

# Quick sanity check of the cleaner with digit removal switched on.
remove_special_characters("Important text !! 123#@!", remove_digits=True)

'Important text  '

In [16]:
# Clean each day's combined headline text, removing digits as well
dft['headline_text'] = dft['headline_text'].apply(remove_special_characters, remove_digits=True)

In [17]:
# Renumber rows 0..n-1, discarding the index labels left over from the raw data
dft = dft.reset_index(drop=True)
dft.head()

Unnamed: 0,publish_date,headline_text
0,2001-01-02,Status quo will not be disturbed at Ayodhya sa...
1,2001-01-03,Powerless north India gropes in the dark Think...
2,2001-01-04,The string that pulled Stephen Hawking to Indi...
3,2001-01-05,Light combat craft takes India into club class...
4,2001-01-06,Light combat craft takes India into club class...


# 2. Sentiment Analysis

### Use of *textblob* to get subjectivity and polarity of news per day

In [18]:
# Defining functions to derive subjectivity and polarity of data

def sub(r):
    """Return the TextBlob subjectivity score of the text ``r``."""
    blob = TextBlob(r)
    return blob.sentiment.subjectivity

def pol(r):
    """Return the TextBlob polarity score of the text ``r``."""
    blob = TextBlob(r)
    return blob.sentiment.polarity

In [19]:
# Scoring every day's headlines with TextBlob

# Build one TextBlob per day and read both scores from the same sentiment
# result, instead of parsing each (long) daily text once per metric.
daily_sentiment = [TextBlob(text).sentiment for text in dft.headline_text]
dft['Subjectivity'] = [s.subjectivity for s in daily_sentiment]
dft['Polarity'] = [s.polarity for s in daily_sentiment]

dft.head(5)

# Each day's text now has numeric subjectivity and polarity scores.

Unnamed: 0,publish_date,headline_text,Subjectivity,Polarity
0,2001-01-02,Status quo will not be disturbed at Ayodhya sa...,0.282333,0.151333
1,2001-01-03,Powerless north India gropes in the dark Think...,0.407692,0.088462
2,2001-01-04,The string that pulled Stephen Hawking to Indi...,0.446847,0.090625
3,2001-01-05,Light combat craft takes India into club class...,0.476612,0.262024
4,2001-01-06,Light combat craft takes India into club class...,0.439394,0.248485


### Performing a 'VADER' analysis to derive additional sentiment columns

#### Compound score is the aggregate score for positive and negative scores

In [23]:
vader_ = SentimentIntensityAnalyzer()

# Score each day's text once; polarity_scores returns a dict with the keys
# 'neg', 'neu', 'pos' and 'compound', so a single pass provides all four
# columns instead of re-running the analyser for every column.
vader_scores = pd.DataFrame(
    [vader_.polarity_scores(text) for text in dft.headline_text],
    index=dft.index,  # keep rows aligned with dft whatever its index is
)

dft['Compound_vader'] = vader_scores['compound']
dft['Positive_vader'] = vader_scores['pos']
dft['Negative_vader'] = vader_scores['neg']
dft['Neutral_vader'] = vader_scores['neu']

dft.head()

Unnamed: 0,publish_date,headline_text,Subjectivity,Polarity,Compound_vader,Positive_vader,Negative_vader,Neutral_vader
0,2001-01-02,Status quo will not be disturbed at Ayodhya sa...,0.282333,0.151333,-0.9792,0.071,0.121,0.808
1,2001-01-03,Powerless north India gropes in the dark Think...,0.407692,0.088462,-0.1779,0.113,0.123,0.764
2,2001-01-04,The string that pulled Stephen Hawking to Indi...,0.446847,0.090625,0.8047,0.103,0.1,0.797
3,2001-01-05,Light combat craft takes India into club class...,0.476612,0.262024,0.9769,0.159,0.125,0.716
4,2001-01-06,Light combat craft takes India into club class...,0.439394,0.248485,-0.4215,0.145,0.154,0.701


#### Let's store this final output as a CSV file so that further analysis can be done on this dataset.

In [24]:
# Write the scored per-day frame to CSV, without the index column.
dft.to_csv('Textual_Data.csv',index=False)