The dataset in the link is going to be used for the project:
https://www.kaggle.com/hkapoor/indian-financial-news-articles-20032020/code

In [None]:
#Import requisite libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
import string

import warnings
warnings.filterwarnings('ignore')

from collections import Counter, defaultdict
import time
from datetime import datetime
import re

# **Dataset Description:**

In [None]:
# Read the interim Indian financial news CSV into a DataFrame.
# NOTE(review): the absolute /content/drive/... path presumably assumes a Colab session with Google Drive mounted - confirm.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Datasets/Indian Financial News Headlines/data/interim/IndianFinancialNews.csv',
)

In [None]:
# Keep only the first 12,000 rows as the working subset.
# The explicit .copy() detaches the subset from the full frame, so the later
# in-place drops and column assignments act on an independent DataFrame rather
# than on a slice (warnings are globally silenced by warnings.filterwarnings('ignore'),
# so a SettingWithCopyWarning would never be shown).
data = data.iloc[:12000].copy()

In [None]:
# Show the last five rows of the frame as a quick sanity check of the loaded data.
data.tail()

Unnamed: 0.1,Unnamed: 0,Date,Title,Description
11995,11995,"October 21, 2014, Tuesday",Irda asks SBI Life to comply with prior order ...,"In violation of Group Insurance Guidelines, Rs..."
11996,11996,"October 21, 2014, Tuesday","Loan recast hinges on coal supply to steel, po...","According to industry estimates, about Rs 3 la..."
11997,11997,"October 21, 2014, Tuesday",Gas price hike to put pressure on retail infla...,Cut in diesel price to have minimum impact on CPI
11998,11998,"October 21, 2014, Tuesday",Yes Bank raises $422 million by dual currency ...,Private sector Yes Bank today said it has rais...
11999,11999,"October 20, 2014, Monday",Irda gives special dispensation for Hudhud claims,Insurance Regulatory and Development Authority...


In [None]:
# Summarise the columns: dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   12000 non-null  int64 
 1   Date         12000 non-null  object
 2   Title        12000 non-null  object
 3   Description  12000 non-null  object
dtypes: int64(1), object(3)
memory usage: 375.1+ KB


In [None]:
# List the column labels of the frame.
data.columns

Index(['Unnamed: 0', 'Date', 'Title', 'Description'], dtype='object')

In [None]:
# Number of rows (first element of the (rows, columns) shape tuple).
data.shape[0]

12000

Hence we have **12000** rows in the working subset of the dataset (only the first 12,000 rows of the CSV are kept).

We observe there is a redundant column named 'Unnamed: 0' in the dataset, which simply mirrors the row index. It's better if we remove it. We also get rid of any duplicate rows present in the dataset.

In [None]:
# Remove the 'Unnamed: 0' column, then discard rows that are exact duplicates of an earlier row.
data = data.drop(columns=['Unnamed: 0']).drop_duplicates()

In [None]:
# Row count again, to see how many rows remain after dropping duplicates.
data.shape[0]

11995

Hence we infer that there were indeed some duplicate rows (5 of them: 12000 → 11995), and they have now been dropped.

In [None]:
#The Date column is of object datatype, so convert it to datetime64 and derive the calendar year from it.
data = data.assign(Date=pd.to_datetime(data['Date'], infer_datetime_format=True))
data = data.assign(Year=data['Date'].dt.year)
data.head()

Unnamed: 0,Date,Title,Description,Year
0,2020-05-26,"ATMs to become virtual bank branches, accept d...","Close to 14.6 per cent (or 35,000) of the 240,...",2020
1,2020-05-26,IDFC First Bank seniors to forgo 65% of bonus ...,"V Vaidyanathan, managing director and chief ex...",2020
2,2020-05-25,"Huge scam in YES Bank for many years, says Enf...",Rana Kapoor's wife also charged with abetting ...,2020
3,2020-05-24,"Bank of Maharashtra sanctioned Rs 2,789 cr in ...",The bank said it was now gearing up to extend ...,2020
4,2020-05-23,DCB Bank's profit before tax declines 37.6% to...,Net profit for the financial year ended March ...,2020


In [None]:
# Collect the distinct calendar years present in the 'Year' column.
year_list = data['Year'].tolist()
year_set = {year for year in year_list}
year_set

{2014, 2015, 2016, 2017, 2018, 2019, 2020}

**Feature Preprocessing**

In [None]:
#Download the NLTK resources needed for preprocessing (tokenizer, stopword list,
#WordNet for lemmatization, POS tagger). Each resource is requested exactly once.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
  nltk.download(nltk_resource)

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
#Instantiate the Porter stemmer and the WordNet lemmatizer
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [None]:
#Example 1: tokenise a sample sentence and inspect its POS tags.
#print() is required here because a notebook only auto-displays the value of
#the last expression in a cell, and this one is followed by a second example.
text_parse = "Mr. Vaidhyanathan didn't lived long enough to see the light of day in ICICI Bank at Reuters Corpus"
print(nltk.pos_tag(nltk.word_tokenize(text_parse)))

#Example 2: the distinct tokens of a sample sentence (a set has no guaranteed order)
text = "My name is Sagar Sinha and I am Thor, son of kind of son Odin"
list(set(nltk.word_tokenize(text)))

['is',
 'Odin',
 'I',
 'name',
 'and',
 'son',
 'Sagar',
 'of',
 'am',
 ',',
 'My',
 'Sinha',
 'Thor',
 'kind']

In [None]:
#Penn Treebank POS tag groups used by the cleaning and tag-count functions
exclude_tag_list = ['NN', 'NNS', 'NNP', 'NNPS']   #nouns; these keep their original casing during cleaning
adv_tag_list = ['RB', 'RBR', 'RBS']               #adverbs
adj_tag_list = ['JJ', 'JJR', 'JJS']               #adjectives
#Verbs. 'VBZ' (3rd person singular present, e.g. "extends") has to be listed as
#well, otherwise such verbs are left out of the verb count.
verb_tag_list = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

_ENGLISH_STOPWORDS = None  #filled on first use by _english_stopwords()

def _english_stopwords():
  """Return NLTK's English stopwords as a frozenset, building it only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    #stopwords.words() returns a fresh list on every call; cache it as a set so
    #that it is not rebuilt (and linearly scanned) for every single token
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS

def _clean_text(raw_text, remove_stopwords):
  """Shared cleaning pipeline behind title_clean and desc_clean.

  Steps: convert to str and strip, replace every character other than ASCII
  letters and digits with a space, tokenise, optionally drop stopwords
  (case-sensitive comparison against NLTK's lowercase list), lemmatize,
  POS-tag, lowercase every token whose tag is not in exclude_tag_list (the
  noun tags), and join the tokens with single spaces.

  Args:
    raw_text: the value to clean; it is passed through str() first.
    remove_stopwords: when True, tokens found in the English stopword set are removed.

  Returns:
    The cleaned text as one space-separated string.
  """
  text = str(raw_text).strip()
  text = re.sub("[^a-zA-Z0-9]", " ", text)

  words = nltk.word_tokenize(text)
  if remove_stopwords:
    stop_set = _english_stopwords()
    words = [word for word in words if word not in stop_set]
  words = [lemmatizer.lemmatize(word) for word in words]

  cleaned_words = []
  for (word, tag) in nltk.pos_tag(words):
    #Noun-tagged tokens keep their casing; every other token is lowercased
    cleaned_words.append(word if tag in exclude_tag_list else word.lower())

  return ' '.join(cleaned_words)

def title_clean(row):
  """Clean row['Title'] with stopwords removed.

  Best effort: if anything fails (including a missing 'Title' key) the error
  message is printed and None is returned instead of raising.
  """
  try:
    return _clean_text(row['Title'], remove_stopwords=True)
  except Exception as e:
    print(str(e))
    return None

def desc_clean(row):
  """Clean row['Description'], deliberately keeping stopwords.

  Best effort: if anything fails (including a missing 'Description' key) the
  error message is printed and None is returned instead of raising.
  """
  try:
    return _clean_text(row['Description'], remove_stopwords=False)
  except Exception as e:
    print(str(e))
    return None

In [None]:
col_list = ['Title', 'Description']

#Row-wise cleaning: each function reads its own source column from the row it receives
data['Clean_Title'] = data.apply(title_clean, axis=1)
data['Clean_Desc'] = data.apply(desc_clean, axis=1)

In [None]:
# Preview the first rows, now including the Clean_Title and Clean_Desc columns.
data.head()

Unnamed: 0,Date,Title,Description,Year,Clean_Title,Clean_Desc
0,2020-05-26,"ATMs to become virtual bank branches, accept d...","Close to 14.6 per cent (or 35,000) of the 240,...",2020,ATMs become virtual bank branch accept deposit...,close to 14 6 per cent or 35 000 of the 240 00...
1,2020-05-26,IDFC First Bank seniors to forgo 65% of bonus ...,"V Vaidyanathan, managing director and chief ex...",2020,IDFC First Bank senior forgo 65 bonus amid Cov...,V Vaidyanathan managing director and chief exe...
2,2020-05-25,"Huge scam in YES Bank for many years, says Enf...",Rana Kapoor's wife also charged with abetting ...,2020,huge scam YES Bank many year say Enforcement D...,Rana Kapoor s wife also charged with abetting ...
3,2020-05-24,"Bank of Maharashtra sanctioned Rs 2,789 cr in ...",The bank said it was now gearing up to extend ...,2020,Bank Maharashtra sanctioned Rs 2 789 cr loan M...,the bank said it wa now gearing up to extend t...
4,2020-05-23,DCB Bank's profit before tax declines 37.6% to...,Net profit for the financial year ended March ...,2020,DCB Bank profit tax decline 37 6 Rs 93 84 cror...,net profit for the financial year ended March ...


In [None]:
#Join the cleaned title and the cleaned description into a single text field.
#The explicit ' ' separator keeps the last title word and the first description
#word as two separate tokens; plain concatenation would fuse them into one word.
data['Combined_Text'] = data['Clean_Title'] + ' ' + data['Clean_Desc']

In [None]:
#POS-tagged tokens are the input for the tag-count feature engineering
def tagged_text(row):
  """Tokenise the row's 'Combined_Text' and return its (token, POS tag) pairs.

  The value is converted with str(), split into tokens by nltk.word_tokenize
  and tagged by nltk.pos_tag; the resulting list of tuples is returned as is.
  """
  tokens = nltk.word_tokenize(str(row['Combined_Text']))
  return nltk.pos_tag(tokens)

In [None]:
# Preview the first 20 rows, now including the Combined_Text column.
data.head(20) 

Unnamed: 0,Date,Title,Description,Year,Clean_Title,Clean_Desc,Combined_Text
0,2020-05-26,"ATMs to become virtual bank branches, accept d...","Close to 14.6 per cent (or 35,000) of the 240,...",2020,ATMs become virtual bank branch accept deposit...,close to 14 6 per cent or 35 000 of the 240 00...,ATMs become virtual bank branch accept deposit...
1,2020-05-26,IDFC First Bank seniors to forgo 65% of bonus ...,"V Vaidyanathan, managing director and chief ex...",2020,IDFC First Bank senior forgo 65 bonus amid Cov...,V Vaidyanathan managing director and chief exe...,IDFC First Bank senior forgo 65 bonus amid Cov...
2,2020-05-25,"Huge scam in YES Bank for many years, says Enf...",Rana Kapoor's wife also charged with abetting ...,2020,huge scam YES Bank many year say Enforcement D...,Rana Kapoor s wife also charged with abetting ...,huge scam YES Bank many year say Enforcement D...
3,2020-05-24,"Bank of Maharashtra sanctioned Rs 2,789 cr in ...",The bank said it was now gearing up to extend ...,2020,Bank Maharashtra sanctioned Rs 2 789 cr loan M...,the bank said it wa now gearing up to extend t...,Bank Maharashtra sanctioned Rs 2 789 cr loan M...
4,2020-05-23,DCB Bank's profit before tax declines 37.6% to...,Net profit for the financial year ended March ...,2020,DCB Bank profit tax decline 37 6 Rs 93 84 cror...,net profit for the financial year ended March ...,DCB Bank profit tax decline 37 6 Rs 93 84 cror...
5,2020-05-23,"Bank of Baroda to advance Rs 12,000 crore to M...","Under the scheme, the government will offer 10...",2020,Bank Baroda advance Rs 12 000 crore MSMEs govt...,under the scheme the government will offer 100...,Bank Baroda advance Rs 12 000 crore MSMEs govt...
6,2020-05-22,"Retail, MSME loans to get cheaper as RBI cuts ...",HDFC Bank cuts base rate by 55 bps; SBI calls ...,2020,retail MSME loan get cheaper RBI cut repo rate...,HDFC Bank cut base rate by 55 bps SBI call ALC...,retail MSME loan get cheaper RBI cut repo rate...
7,2020-05-22,"RBI forecasts economic contraction, announces ...",The six-member MPC also kept policy stance unc...,2020,RBI forecast economic contraction announces sh...,the six member MPC also kept policy stance unc...,RBI forecast economic contraction announces sh...
8,2020-05-22,"RBI extends loan moratorium till Aug 31, silen...",The central bank has increased group exposure ...,2020,RBI extends loan moratorium till Aug 31 silent...,the central bank ha increased group exposure l...,RBI extends loan moratorium till Aug 31 silent...
9,2020-05-22,"Govt seeks to raise Rs 14,000 cr from second t...",Two new series will have maturities of April 2...,2020,Govt seek raise Rs 14 000 cr second tranche Bh...,two new series will have maturity of April 202...,Govt seek raise Rs 14 000 cr second tranche Bh...


In [None]:
# Store each row's list of (token, POS tag) pairs for the tag-count features.
data['Combined_Tag_Tokens'] = data.apply(tagged_text, axis=1)

In [None]:
#Part-of-speech count features extracted from the tagged tokens of each row.
def count_pos_tags(row, tag_list):
  """Count the tokens in row['Combined_Tag_Tokens'] whose POS tag is in tag_list.

  Args:
    row: mapping/Series whose 'Combined_Tag_Tokens' entry is an iterable of
      (word, tag) pairs.
    tag_list: collection of POS tags to count.

  Returns:
    The number of matching tokens as an int (0 when there are no tokens).
  """
  return sum(1 for (_word, tag) in row['Combined_Tag_Tokens'] if tag in tag_list)

def noun_count(row):
  """Number of tokens tagged with one of the noun tags in exclude_tag_list."""
  return count_pos_tags(row, exclude_tag_list)

def adv_count(row):
  """Number of tokens tagged with one of the adverb tags in adv_tag_list."""
  return count_pos_tags(row, adv_tag_list)

def adj_count(row):
  """Number of tokens tagged with one of the adjective tags in adj_tag_list."""
  return count_pos_tags(row, adj_tag_list)

def verb_count(row):
  """Number of tokens tagged with one of the verb tags in verb_tag_list."""
  return count_pos_tags(row, verb_tag_list)

In [None]:
# POS-count features. The count functions read 'Combined_Tag_Tokens', so these
# values describe the combined text even though the column names start with 'Title_'.
data['Title_Noun_Count'] = data.apply(noun_count, axis=1)
data['Title_Adverb_Count'] = data.apply(adv_count, axis=1)
data['Title_Adjective_Count'] = data.apply(adj_count, axis=1)
data['Title_Verb_Count'] = data.apply(verb_count, axis=1)

In [None]:
#data.drop(['Noun_Count'], axis=1, inplace=True)
#data.drop(['Adverb_Count'], axis=1, inplace=True)
#data.drop(['Adjective_Count'], axis=1, inplace=True)

**Calculate Text similarity using Jaccard index:**

In [None]:
def jaccard_index(row):
  """Jaccard similarity between the word sets of 'Clean_Title' and 'Clean_Desc'.

  Both values are converted with str() and split on whitespace; the result is
  |intersection| / |union| of the two sets of distinct words.

  Args:
    row: mapping/Series providing 'Clean_Title' and 'Clean_Desc'.

  Returns:
    A float between 0.0 and 1.0. When both texts contain no words the union
    is empty, and 0.0 is returned instead of dividing by zero.
  """
  title_words = set(str(row['Clean_Title']).split())
  desc_words = set(str(row['Clean_Desc']).split())

  union = title_words | desc_words
  if not union:
    #Nothing to compare: define the similarity as 0.0 rather than raising ZeroDivisionError
    return 0.0

  return len(title_words & desc_words) / len(union)

In [None]:
# Title/description word-overlap score for every row.
data['Jaccard_Index'] = data.apply(jaccard_index, axis=1)

**Adding more features such as number of words, number of unique words, number of characters, stopwords, punctuations, uppercase words, title case words and average length of the words. Will use them if required, later**

In [None]:
#All features below are computed from 'Combined_Text'.

#Build the stopword set once; calling stopwords.words('english') inside the lambda
#would rebuild the list for every word of every row
english_stopwords = set(stopwords.words('english'))

#Number of words in the text ##
data['num_words'] = data['Combined_Text'].apply(lambda x: len(str(x).split()))

#Number of unique words in the text ##
data['num_unique_words'] =  data['Combined_Text'].apply(lambda x: len(set(str(x).split())))

#Number of characters in the text ##
data['num_chars'] = data['Combined_Text'].apply(lambda x: len(str(x)))

#Number of stopwords in the text (compared in lowercase) ##
data['num_stopwords'] = data['Combined_Text'].apply(lambda x: len([w for w in str(x).lower().split() if w in english_stopwords]))

#Number of punctuation characters in the text ##
data['num_punctuations'] = data['Combined_Text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

#Number of upper case words in the text ##
data['num_words_upper'] = data['Combined_Text'].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

#Number of title case words in the text ##
data['num_words_title'] = data['Combined_Text'].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

#Average length of the words in the text (NaN when the text has no words) ##
data['mean_word_len'] = data['Combined_Text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [None]:
# Preview the first 20 rows with all engineered feature columns.
data.head(20) 

Unnamed: 0,Date,Title,Description,Year,Clean_Title,Clean_Desc,Combined_Text,Combined_Tag_Tokens,Title_Noun_Count,Title_Adverb_Count,Title_Adjective_Count,Title_Verb_Count,Jaccard_Index,num_words,num_unique_words,num_chars,num_stopwords,num_punctuations,num_words_upper,num_words_title,mean_word_len
0,2020-05-26,"ATMs to become virtual bank branches, accept d...","Close to 14.6 per cent (or 35,000) of the 240,...",2020,ATMs become virtual bank branch accept deposit...,close to 14 6 per cent or 35 000 of the 240 00...,ATMs become virtual bank branch accept deposit...,"[(ATMs, NNP), (become, VBP), (virtual, JJ), (b...",11,2,2,4,0.028571,36,34,187,11,0,0,1,4.222222
1,2020-05-26,IDFC First Bank seniors to forgo 65% of bonus ...,"V Vaidyanathan, managing director and chief ex...",2020,IDFC First Bank senior forgo 65 bonus amid Cov...,V Vaidyanathan managing director and chief exe...,IDFC First Bank senior forgo 65 bonus amid Cov...,"[(IDFC, NNP), (First, NNP), (Bank, NNP), (seni...",13,0,2,4,0.0,28,28,172,4,0,1,4,5.178571
2,2020-05-25,"Huge scam in YES Bank for many years, says Enf...",Rana Kapoor's wife also charged with abetting ...,2020,huge scam YES Bank many year say Enforcement D...,Rana Kapoor s wife also charged with abetting ...,huge scam YES Bank many year say Enforcement D...,"[(huge, JJ), (scam, NNS), (YES, NNP), (Bank, N...",9,1,2,4,0.0,17,17,107,2,0,1,3,5.352941
3,2020-05-24,"Bank of Maharashtra sanctioned Rs 2,789 cr in ...",The bank said it was now gearing up to extend ...,2020,Bank Maharashtra sanctioned Rs 2 789 cr loan M...,the bank said it wa now gearing up to extend t...,Bank Maharashtra sanctioned Rs 2 789 cr loan M...,"[(Bank, NNP), (Maharashtra, NNP), (sanctioned,...",13,1,1,7,0.0,31,30,183,7,0,0,7,4.935484
4,2020-05-23,DCB Bank's profit before tax declines 37.6% to...,Net profit for the financial year ended March ...,2020,DCB Bank profit tax decline 37 6 Rs 93 84 cror...,net profit for the financial year ended March ...,DCB Bank profit tax decline 37 6 Rs 93 84 cror...,"[(DCB, NNP), (Bank, NNP), (profit, NN), (tax, ...",16,2,1,2,0.129032,36,30,171,6,0,2,5,3.777778
5,2020-05-23,"Bank of Baroda to advance Rs 12,000 crore to M...","Under the scheme, the government will offer 10...",2020,Bank Baroda advance Rs 12 000 crore MSMEs govt...,under the scheme the government will offer 100...,Bank Baroda advance Rs 12 000 crore MSMEs govt...,"[(Bank, NNP), (Baroda, NNP), (advance, NN), (R...",13,0,0,1,0.047619,22,21,126,4,0,0,3,4.772727
6,2020-05-22,"Retail, MSME loans to get cheaper as RBI cuts ...",HDFC Bank cuts base rate by 55 bps; SBI calls ...,2020,retail MSME loan get cheaper RBI cut repo rate...,HDFC Bank cut base rate by 55 bps SBI call ALC...,retail MSME loan get cheaper RBI cut repo rate...,"[(retail, JJ), (MSME, NNP), (loan, NN), (get, ...",13,0,4,4,0.086957,25,22,130,2,0,4,1,4.24
7,2020-05-22,"RBI forecasts economic contraction, announces ...",The six-member MPC also kept policy stance unc...,2020,RBI forecast economic contraction announces sh...,the six member MPC also kept policy stance unc...,RBI forecast economic contraction announces sh...,"[(RBI, NNP), (forecast, VBP), (economic, JJ), ...",11,1,6,3,0.0,28,27,173,5,0,2,0,5.214286
8,2020-05-22,"RBI extends loan moratorium till Aug 31, silen...",The central bank has increased group exposure ...,2020,RBI extends loan moratorium till Aug 31 silent...,the central bank ha increased group exposure l...,RBI extends loan moratorium till Aug 31 silent...,"[(RBI, NNP), (extends, VBZ), (loan, NN), (mora...",12,0,3,1,0.0,24,24,143,3,0,1,1,5.0
9,2020-05-22,"Govt seeks to raise Rs 14,000 cr from second t...",Two new series will have maturities of April 2...,2020,Govt seek raise Rs 14 000 cr second tranche Bh...,two new series will have maturity of April 202...,Govt seek raise Rs 14 000 cr second tranche Bh...,"[(Govt, NNP), (seek, NN), (raise, NN), (Rs, NN...",12,0,3,1,0.0,23,22,121,4,0,0,6,4.304348


In [None]:
#Inspect how the values of the stopword and punctuation counts are distributed
#(nothing is dropped in this cell)
for feature_name in ('num_stopwords', 'num_punctuations'):
  print(data[feature_name].value_counts())

In [None]:
#A single value accounts for most of the rows in both 'num_stopwords' and 'num_punctuations', so both columns are dropped
data = data.drop(columns=['num_stopwords', 'num_punctuations'])

In [None]:
#Save the processed frame as a CSV under data/processed/ (a different path from the interim CSV that was read).
#No index argument is passed, so pandas also writes the row index as the first, unnamed column.
data.to_csv(
    path_or_buf='/content/drive/MyDrive/Datasets/Indian Financial News Headlines/data/processed/processed_data.csv',
)