## **Jaccard similarity and LDA-Topic Modelling as a Feature Extraction Technique**

In [None]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords

# <font color = red> Importing the Data Set

In [None]:
DATASET_PATH = os.path.join(os.getcwd(),"C:\\Users\\Shreyansh\\Abhinav\\data","")

In [None]:
Train_data = pd.read_csv(r'C:\Users\Shreyansh\Capstone_files\real_final_test_data.csv' , encoding='ISO-8859-1')

In [None]:
Train_data.columns

Index(['Unnamed: 0', 'title', 'text', 'summary', 'keywords', 'headline',
       'domain', 'Category'],
      dtype='object')

In [None]:
Train_data['Category'].nunique()

4

In [None]:
Train_data = Train_data.rename(columns = {'headline': 'Headline', 'text': 'articleBody'}, inplace = False)

In [None]:
Train_data = Train_data.rename(columns = {'Category':'Stance'}, inplace = False)

In [None]:
print('The number of rows ',Train_data.shape[0])
print('The number of columns',Train_data.shape[1])

The number of rows  212
The number of columns 8


In [None]:
Train_data["Headline"].nunique()

25

In [None]:
Train_data.columns

Index(['Unnamed: 0', 'title', 'articleBody', 'summary', 'keywords', 'Headline',
       'domain', 'Stance'],
      dtype='object')

In [None]:
Train_data = Train_data[['Headline', 'Stance', 'articleBody']]

In [None]:
Train_data.shape

(212, 3)

# <font color = red> Text Preprocessing

In [None]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer# Text Cleaning And Preprocessing
def clean_text(df):
    all_reviews = list()
    
    # Taking all the text from the Data Frame
    lines = df.values.tolist()
    
    #Iterating Through All the Documents
    for text in lines:
        text = str(text).lower()          # Convert into Lower case
        
        
        #Removing all the URL Links from the Corpus and replace them with empty or common string
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        
        #Removing the Emojis and replacing them from empty or common string
        emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        text = emoji.sub(r'', text)
        
        
        # Expanding Common Short Forms
        # Not very Helpful in Sentimental Analysis
        # These are pronoun and verbs.
        # Important if you want to create a text from your model along with Classifying the sentinment.
        # Helpful when you are creating Chatbot.
        
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"that's", "that is", text)        
        text = re.sub(r"what's", "what is", text)
        text = re.sub(r"where's", "where is", text) 
        text = re.sub(r"\'ll", " will", text)  
        text = re.sub(r"\'ve", " have", text)  
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"don't", "do not", text)
        text = re.sub(r"did't", "did not", text)
        text = re.sub(r"can't", "can not", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"couldn't", "could not", text)
        text = re.sub(r"have't", "have not", text)
        
        # Removing All the Punctuation
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        
        # Tokenize the Sentences(Split the sentences by Spaces)
        #(Can also Use Split Function), But Tokenize function returns the output as a list
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        
        # Removing Everything which is not a Alphabet
        stripped = [w.translate(table) for w in tokens]
        
         # Removing The Stop words
        words = [word for word in stripped if word.isalpha()]
#         stop_words = set(stopwords.words("english"))
#         stop_words.discard("not")    # Discarding Not from stopwords (can be important or  sentiment analysis)
#         words = [w for w in words if not w in stop_words]


        words = ' '.join(words)     # Joining the Token
        all_reviews.append(words)     # Adding into Large List.
    return all_reviews


  pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


In [None]:
#Train_data["Headline"] = clean_text(Train_data["Headline"])

In [None]:
Train_data["Headline"].iloc[0]

'AB de Villiers and wife Danielle welcomed their new-born daughter Yente to their family'

In [None]:
def process (df, var):   # text cleaning
    
    for i in var:
        df[i] = clean_text(df[i])
    return df

In [None]:
Train_data1 = process(Train_data, ["Headline", "articleBody"])

In [None]:
Train_data1.head()

Unnamed: 0,Headline,Stance,articleBody
0,ab de villiers and wife danielle welcomed thei...,Agree,south africa and royal challengers bangalore b...
1,ab de villiers and wife danielle welcomed thei...,Agree,by ani pretoria former south african cricketer...
2,ab de villiers and wife danielle welcomed thei...,Agree,pretoria former south african cricketer ab de ...
3,ab de villiers and wife danielle welcomed thei...,Agree,former south african cricketer ab de villiers ...
4,ab de villiers and wife danielle welcomed thei...,Agree,former south african skipper ab de villiers an...


In [None]:
Train_data1 = Train_data1.dropna()

In [None]:
pwd

'C:\\Users\\Shreyansh\\Capstone_files'

In [None]:
#Train_data1.to_csv("clean.csv")

In [None]:
Train_data1

Unnamed: 0,Headline,Stance,articleBody
0,ab de villiers and wife danielle welcomed thei...,Agree,south africa and royal challengers bangalore b...
1,ab de villiers and wife danielle welcomed thei...,Agree,by ani pretoria former south african cricketer...
2,ab de villiers and wife danielle welcomed thei...,Agree,pretoria former south african cricketer ab de ...
3,ab de villiers and wife danielle welcomed thei...,Agree,former south african cricketer ab de villiers ...
4,ab de villiers and wife danielle welcomed thei...,Agree,former south african skipper ab de villiers an...
...,...,...,...
207,death of former lok sabha speaker sumitra mahajan,Unrelated,new delhi appreciating cooperation extended by...
208,death of former lok sabha speaker sumitra mahajan,Unrelated,former lok sabha speaker p a sangma who had a ...
209,death of former lok sabha speaker sumitra mahajan,Unrelated,outgoing lok sabha speaker and bharatiya janat...
210,death of former lok sabha speaker sumitra mahajan,Unrelated,parliament was adjourned on the first day of a...


In [None]:
Train_data1[Train_data1.Stance ==  "unreleted"] =  "unrelated"

In [None]:
Train_data1["Stance"].unique()

array(['Agree', 'Disagree', 'Unrelated', 'Discuss'], dtype=object)

In [None]:
Train_data1.isnull().sum()

Headline       0
Stance         0
articleBody    0
dtype: int64

## <font color = blue> Encoding Stances

In [None]:
Train_data1['stance_cat'] = Train_data1['Stance']

In [None]:
Train_data1.replace({'stance_cat':{'Agree':0, 'Disagree':1 ,'Discuss':2,'Unrelated':3}}, inplace=True)

In [None]:
type(Train_data1['Stance'])

pandas.core.series.Series

In [None]:
Train_data1.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat
0,ab de villiers and wife danielle welcomed thei...,Agree,south africa and royal challengers bangalore b...,0
1,ab de villiers and wife danielle welcomed thei...,Agree,by ani pretoria former south african cricketer...,0
2,ab de villiers and wife danielle welcomed thei...,Agree,pretoria former south african cricketer ab de ...,0
3,ab de villiers and wife danielle welcomed thei...,Agree,former south african cricketer ab de villiers ...,0
4,ab de villiers and wife danielle welcomed thei...,Agree,former south african skipper ab de villiers an...,0


## <font color = blue>  Removing Stop Words

In [None]:
stopwords_english = set(stopwords.words('english'))
Train_data1['Headline'] = Train_data1.Headline.apply(lambda x:str(x))
Train_data1.loc[:,'Headline'] = Train_data1['Headline'].apply(lambda x : str.lower(x))
Train_data1.loc[:,'Headline'] = Train_data1['Headline'].apply(lambda x:' '.join(re.findall('[\w]+',x)))
Train_data1.loc[:,'articleBody'] = Train_data1['articleBody'].apply(lambda x : str.lower(x))
Train_data1.loc[:,'articleBody'] = Train_data1['articleBody'].apply(lambda x:' '.join(re.findall('[\w]+',x)))


def remove_stopwords(s):
    return ' '.join(word for word in s.split() if word not in stopwords_english)

Train_data1['Headline'] = Train_data1['Headline'].apply(lambda x:remove_stopwords(x))
Train_data1['articleBody'] = Train_data1['articleBody'].apply(lambda x:str(x))
Train_data1['articleBody'] = Train_data1['articleBody'].apply(lambda x:remove_stopwords(x))

  Train_data1.loc[:,'Headline'] = Train_data1['Headline'].apply(lambda x:' '.join(re.findall('[\w]+',x)))
  Train_data1.loc[:,'articleBody'] = Train_data1['articleBody'].apply(lambda x:' '.join(re.findall('[\w]+',x)))


In [None]:
Train_data1.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0


In [None]:
Train_data1['Stance'].unique()

array(['Agree', 'Disagree', 'Unrelated', 'Discuss'], dtype=object)

___

# <font color = red> Base Line Model

## <font color = blue> Creating 2 Class Data set.
- <font color = black> <b> Dividing Data set into 2 Class Unrealted and Related(Agree, Disagree and Discuss).
- Model will first check for similarilty between the user Input Claim with the Scrapped Articles and see whether they are realted or Unrealted.
- If the we are not able to find majority of articles realted to the Claim, User will prompt with the message that claim is unrelated and maximum no. of news agencies does not have anything related to the Topic.

In [None]:
# Creating A column name Stane_Base to Divide Data set into 2 Class (Related Aand Unrelated)

#Train_data1['stance_base'] = Train_data1.loc[Train_data1.loc[:,'Stance']=='unrelated','Stance']
Train_data1['stance_base'] = Train_data1['Stance']


In [None]:
Train_data1.replace({'stance_base':{'Agree':'related', 'Disagree':'related' ,'Discuss':'related'}}, inplace=True)
print(Train_data1['stance_base'].value_counts())

related      131
Unrelated     81
Name: stance_base, dtype: int64


In [None]:
Train_data1.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related


## <font color = blue> Calculating Jacard Similarity

In [None]:
# Creating A Function to calculate the Jaccard Similarity

def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))

In [None]:
def add_jaccard_similarity(data):
    count=0
    for i in range(data.shape[0]):
        jaccard_lis=[];eps=0.001
        sentence = data.loc[i,'articleBody'].split('.') #per sentence scorer
        for j in range(len(sentence)):
            jaccard_lis.append(jaccard_similarity(data.loc[i,'Headline'].split(' '),sentence[j].split(' ')))
        max_jaccard_similarity = max(jaccard_lis)
        avg_jaccard_similarity = sum(jaccard_lis)/len(jaccard_lis)
        min_jaccard_similarity = min(jaccard_lis)
        data.loc[i,'jaccard_similarity'] = (max_jaccard_similarity+min_jaccard_similarity)/(max_jaccard_similarity-min_jaccard_similarity+eps)
        if i%1000==0:
            count+=1
            print("Processed {0} Headlines".format(count*1000))


In [None]:
add_jaccard_similarity(Train_data1)

Processed 1000 Headlines


In [None]:
Train_data1.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base,jaccard_similarity
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related,168.224299
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related,211.764706
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related,211.764706
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related,216.86747
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related,137.40458


In [None]:
Train_data1[Train_data1["stance_base"]=="unrelated"]["jaccard_similarity"].value_counts()

Series([], Name: jaccard_similarity, dtype: int64)

In [None]:
len(Train_data1[Train_data1["stance_base"]=="unrelated"]["jaccard_similarity"])

0

In [None]:
#Train_data1[Train_data1["stance_base"]=="unrelated"].describe()

In [None]:
sentence = Train_data1.loc[1,'articleBody'].split('.')

In [None]:
sentence[0]

'ani pretoria former south african cricketer ab de villiers wife danielle de villiers welcomed baby girl righthanded batsman announced thursday de villiers mainstay royal challengers bangalore rcb batting line recently concluded ipl shared picture instagram revealed welcomed daughter november heartwarming picture shared de villers seen along wife daughter duo share smile newest addition de villiers family named yente de villiers welcomed beautiful baby girl world yente de villiers perfect addition family blessing grateful beyond measure genade onbeskryklik groot dankie de villiers captioned post edition ipl rcb bowed tournament losing sunrisers hyderabad srh eliminator righthanded batsman always lone man standing virat kohliled side showpiece event de villiers smashed runs strike rate rcb also registered five halfcenturies ipl'

In [None]:
jaccard_lis=[]
for j in range(len(sentence)):
    jaccard_lis.append(jaccard_similarity(Train_data1.loc[2,'Headline'].split(' '),sentence[j].split(' ')))

In [None]:
jaccard_lis

[0.10588235294117647]

In [None]:
for j in range(len(sentence)):
    print(sentence[j].split(' '))

['ani', 'pretoria', 'former', 'south', 'african', 'cricketer', 'ab', 'de', 'villiers', 'wife', 'danielle', 'de', 'villiers', 'welcomed', 'baby', 'girl', 'righthanded', 'batsman', 'announced', 'thursday', 'de', 'villiers', 'mainstay', 'royal', 'challengers', 'bangalore', 'rcb', 'batting', 'line', 'recently', 'concluded', 'ipl', 'shared', 'picture', 'instagram', 'revealed', 'welcomed', 'daughter', 'november', 'heartwarming', 'picture', 'shared', 'de', 'villers', 'seen', 'along', 'wife', 'daughter', 'duo', 'share', 'smile', 'newest', 'addition', 'de', 'villiers', 'family', 'named', 'yente', 'de', 'villiers', 'welcomed', 'beautiful', 'baby', 'girl', 'world', 'yente', 'de', 'villiers', 'perfect', 'addition', 'family', 'blessing', 'grateful', 'beyond', 'measure', 'genade', 'onbeskryklik', 'groot', 'dankie', 'de', 'villiers', 'captioned', 'post', 'edition', 'ipl', 'rcb', 'bowed', 'tournament', 'losing', 'sunrisers', 'hyderabad', 'srh', 'eliminator', 'righthanded', 'batsman', 'always', 'lone',

In [None]:
Train_data1.loc[1,'Headline'].split(' ')

['ab',
 'de',
 'villiers',
 'wife',
 'danielle',
 'welcomed',
 'newborn',
 'daughter',
 'yente',
 'family']

In [None]:
sentence[0]

'ani pretoria former south african cricketer ab de villiers wife danielle de villiers welcomed baby girl righthanded batsman announced thursday de villiers mainstay royal challengers bangalore rcb batting line recently concluded ipl shared picture instagram revealed welcomed daughter november heartwarming picture shared de villers seen along wife daughter duo share smile newest addition de villiers family named yente de villiers welcomed beautiful baby girl world yente de villiers perfect addition family blessing grateful beyond measure genade onbeskryklik groot dankie de villiers captioned post edition ipl rcb bowed tournament losing sunrisers hyderabad srh eliminator righthanded batsman always lone man standing virat kohliled side showpiece event de villiers smashed runs strike rate rcb also registered five halfcenturies ipl'

In [None]:
jaccard_similarity(Train_data1.loc[1,'Headline'].split(' '), sentence[0].split(' '))

0.10588235294117647

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shreyansh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import nltk
lines = 'lines is some string of words'
tokenized = nltk.word_tokenize(lines)
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if(pos[:2] == 'NN')]
print (nouns)

['lines', 'string', 'words']


In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shreyansh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
nltk.pos_tag(tokenized)

[('lines', 'NNS'),
 ('is', 'VBZ'),
 ('some', 'DT'),
 ('string', 'NN'),
 ('of', 'IN'),
 ('words', 'NNS')]

In [None]:
for (word, pos) in nltk.pos_tag(tokenized):
    if(pos[:2] == 'NN'):
        print(word)

lines
string
words


In [None]:
l1  = Train_data1["Headline"]
tokenized1 = nltk.word_tokenize(l1[0])
tokenized1
nouns1 = [word for (word, pos) in nltk.pos_tag(tokenized1) if(pos[:2] == 'NN')]
print(nouns1)

['ab', 'villiers', 'wife', 'danielle', 'daughter', 'yente', 'family']


In [None]:
headline_nouns = []
for l in l1:
    tokn1 = nltk.word_tokenize(l)
    nouns1 = [word for (word, pos) in nltk.pos_tag(tokn1) if(pos[:2] == 'NN')]
    headline_nouns.append(nouns1)
    

In [None]:
headline_nouns

[['ab', 'villiers', 'wife', 'danielle', 'daughter', 'yente', 'family'],
 ['ab', 'villiers', 'wife', 'danielle', 'daughter', 'yente', 'family'],
 ['ab', 'villiers', 'wife', 'danielle', 'daughter', 'yente', 'family'],
 ['ab', 'villiers', 'wife', 'danielle', 'daughter', 'yente', 'family'],
 ['ab', 'villiers', 'wife', 'danielle', 'daughter', 'yente', 'family'],
 ['ab', 'villiers', 'wife', 'danielle', 'daughter', 'yente', 'family'],
 ['ab', 'villiers', 'wife', 'danielle', 'daughter', 'yente', 'family'],
 ['covidshield', 'covaccine'],
 ['covidshield', 'covaccine'],
 ['covidshield', 'covaccine'],
 ['covidshield', 'covaccine'],
 ['covidshield', 'covaccine'],
 ['covidshield', 'covaccine'],
 ['covidshield', 'covaccine'],
 ['covidshield', 'covaccine'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['april'],
 ['padukone', 'alcohol', 'lockdown'],
 ['pm'

In [None]:
#" ".join(headline_nouns1[0])

In [None]:
headline_nouns1 = []
for i in headline_nouns:
    headline_nouns1.append(" ".join(set(i)))

In [None]:
Train_data2  = Train_data1.copy()

In [None]:
Train_data2["Headline_Nouns"] = headline_nouns1

In [None]:
Train_data2.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base,jaccard_similarity,Headline_Nouns
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related,168.224299,villiers family daughter yente danielle wife ab
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related,211.764706,villiers family daughter yente danielle wife ab
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related,211.764706,villiers family daughter yente danielle wife ab
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related,216.86747,villiers family daughter yente danielle wife ab
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related,137.40458,villiers family daughter yente danielle wife ab


In [None]:
# Creating A fucntion

def noun_extraction(df, col):
    l1_nouns = []
    l1  = df[col]
    for l in l1:
        tokn1 = nltk.word_tokenize(l)
        nouns1 = [word for (word, pos) in nltk.pos_tag(tokn1) if(pos[:2] == 'NN')]
        l1_nouns.append(nouns1)

    l1_nouns1 = []
    for i in l1_nouns:
        l1_nouns1.append(" ".join(set(i)))

    df[col + "_Nouns"] = l1_nouns1
    

In [None]:
noun_extraction(Train_data2, "articleBody")

In [None]:
Train_data2.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base,jaccard_similarity,Headline_Nouns,articleBody_Nouns
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related,168.224299,villiers family daughter yente danielle wife ab,royal son bash girl stream danielle sons addit...
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related,211.764706,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related,211.764706,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related,216.86747,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related,137.40458,villiers family daughter yente danielle wife ab,news pandemic premier fans son bash girl strea...


In [None]:
# Calculating Jaccrd Similarity for Two Columns With Only Nouns

def add_jaccard_similarity_noun(data):
    count=0
    for i in range(data.shape[0]):
        jaccard_lis=[];eps=0.001
        sentence = data.loc[i,'articleBody_Nouns'].split('.') #per sentence scorer
        for j in range(len(sentence)):
            jaccard_lis.append(jaccard_similarity(data.loc[i,'Headline_Nouns'].split(' '),sentence[j].split(' ')))
        max_jaccard_similarity = max(jaccard_lis)
        avg_jaccard_similarity = sum(jaccard_lis)/len(jaccard_lis)
        min_jaccard_similarity = min(jaccard_lis)
        data.loc[i,'jaccard_similarity_nouns'] = (max_jaccard_similarity+min_jaccard_similarity)/(max_jaccard_similarity-min_jaccard_similarity+eps)
        if i%1000==0:
            count+=1
            print("Processed {0} Headlines_nouns".format(count*1000))


In [None]:
add_jaccard_similarity_noun(Train_data2)

Processed 1000 Headlines_nouns


In [None]:
Train_data2.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base,jaccard_similarity,Headline_Nouns,articleBody_Nouns,jaccard_similarity_nouns
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related,168.224299,villiers family daughter yente danielle wife ab,royal son bash girl stream danielle sons addit...,285.714286
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related,211.764706,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...,315.789474
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related,211.764706,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...,307.692308
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related,216.86747,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...,324.324324
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related,137.40458,villiers family daughter yente danielle wife ab,news pandemic premier fans son bash girl strea...,190.47619


In [None]:
Train_data2[Train_data2["stance_base"]=="unrelated"].describe()

Unnamed: 0,stance_cat,jaccard_similarity,jaccard_similarity_nouns
count,0.0,0.0,0.0
mean,,,
std,,,
min,,,
25%,,,
50%,,,
75%,,,
max,,,


In [None]:
Train_data2.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base,jaccard_similarity,Headline_Nouns,articleBody_Nouns,jaccard_similarity_nouns
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related,168.224299,villiers family daughter yente danielle wife ab,royal son bash girl stream danielle sons addit...,285.714286
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related,211.764706,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...,315.789474
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related,211.764706,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...,307.692308
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related,216.86747,villiers family daughter yente danielle wife ab,duo villers girl addition groot ipl villiers e...,324.324324
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related,137.40458,villiers family daughter yente danielle wife ab,news pandemic premier fans son bash girl strea...,190.47619


In [None]:
Train_data2.columns

Index(['Headline', 'Stance', 'articleBody', 'stance_cat', 'stance_base',
       'jaccard_similarity', 'Headline_Nouns', 'articleBody_Nouns',
       'jaccard_similarity_nouns'],
      dtype='object')

In [None]:
#Train_data2.to_csv("real_test_base1.csv")

___

# <font color = red> Predicting 2 Classes

## <font color = blue>  Jaccard Similarity

In [None]:
Train_data2['stance_base'].value_counts()

related      131
Unrelated     81
Name: stance_base, dtype: int64

In [None]:
x = Train_data2['jaccard_similarity']  # jaacard similarity as predictor
y = Train_data2['stance_base']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1,random_state= 101 )


In [None]:
ytrain

42     Unrelated
112      related
121      related
66       related
183      related
         ...    
63     Unrelated
70     Unrelated
81       related
11       related
95       related
Name: stance_base, Length: 190, dtype: object

In [None]:
print('X Training shape',xtrain.shape)
print('Y Training shape',ytrain.shape)
xtrain = xtrain.values.reshape(-1,1)
xtest = xtest.values.reshape(-1,1)

X Training shape (190,)
Y Training shape (190,)


In [None]:
rg = RandomForestClassifier(n_estimators=100,n_jobs=-1)

In [None]:
rg.fit(xtrain,ytrain)
ypred = rg.predict(xtest)              
print('Accuracy score on two class agree and disagree ',accuracy_score(ypred,ytest))  # test accuracy

Accuracy score on two class agree and disagree  0.7272727272727273


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print(classification_report(ypred,ytest))   # evaluation
print(confusion_matrix(ypred,ytest))

              precision    recall  f1-score   support

   Unrelated       0.56      0.71      0.63         7
     related       0.85      0.73      0.79        15

    accuracy                           0.73        22
   macro avg       0.70      0.72      0.71        22
weighted avg       0.75      0.73      0.73        22

[[ 5  2]
 [ 4 11]]


## <font color = blue>  Jaccard Similarity Noun

In [None]:
Train_data2['stance_base'].value_counts()

related      131
Unrelated     81
Name: stance_base, dtype: int64

In [None]:
x = Train_data2.iloc[:,-1]  # jaccard similarity noun as predictor
y = Train_data2['stance_base']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1, random_state = 101)
rg = RandomForestClassifier(n_estimators=100,n_jobs=-1)

In [None]:
ytrain

42     Unrelated
112      related
121      related
66       related
183      related
         ...    
63     Unrelated
70     Unrelated
81       related
11       related
95       related
Name: stance_base, Length: 190, dtype: object

In [None]:
print('X Training shape',xtrain.shape)
print('Y Training shape',ytrain.shape)
xtrain = xtrain.values.reshape(-1,1)
xtest = xtest.values.reshape(-1,1)

X Training shape (190,)
Y Training shape (190,)


In [None]:
rg.fit(xtrain,ytrain)
ypred_noun = rg.predict(xtest)
print('Accuracy score on two class agree and disagree ',accuracy_score(ypred_noun,ytest))

Accuracy score on two class agree and disagree  0.7727272727272727


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print(classification_report(ypred_noun,ytest))
print(confusion_matrix(ypred_noun,ytest))

              precision    recall  f1-score   support

   Unrelated       0.56      0.83      0.67         6
     related       0.92      0.75      0.83        16

    accuracy                           0.77        22
   macro avg       0.74      0.79      0.75        22
weighted avg       0.82      0.77      0.78        22

[[ 5  1]
 [ 4 12]]


## <font color =  blue> Combined

In [None]:
Train_data2['stance_base'].value_counts()

related      131
Unrelated     81
Name: stance_base, dtype: int64

In [None]:
Train_data2["jaccard_similarity"].shape

(212,)

In [None]:
Train_data2["jaccard_similarity_nouns"].shape

(212,)

In [None]:
df = Train_data2[["jaccard_similarity", "jaccard_similarity_nouns", "stance_base"]]
df

Unnamed: 0,jaccard_similarity,jaccard_similarity_nouns,stance_base
0,168.224299,285.714286,related
1,211.764706,315.789474,related
2,211.764706,307.692308,related
3,216.867470,324.324324,related
4,137.404580,190.476190,related
...,...,...,...
207,194.444444,157.894737,Unrelated
208,53.231939,75.757576,Unrelated
209,86.956522,136.986301,Unrelated
210,48.387097,57.553957,Unrelated


In [None]:
jaccard_similarity_nouns

In [None]:
len(df)

212

In [None]:
df["stance_base"].shape

(212,)

In [None]:
x = df.iloc[:,:2]
y = df['stance_base']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1, random_state = 101)
rg = RandomForestClassifier(n_estimators=100,n_jobs=-1)

In [None]:
xtrain.shape

(190, 2)

In [None]:
ytrain

42     Unrelated
112      related
121      related
66       related
183      related
         ...    
63     Unrelated
70     Unrelated
81       related
11       related
95       related
Name: stance_base, Length: 190, dtype: object

In [None]:
xtest.shape

(22, 2)

In [None]:
print('X Training shape',xtrain.shape)
print('Y Training shape',ytrain.shape)
xtrain = xtrain.values.reshape(-1,1)
xtest = xtest.values.reshape(-1,1)

X Training shape (190, 2)
Y Training shape (190,)


In [None]:
rg.fit(xtrain,ytrain)
ypred_combined = rg.predict(xtest)
print('Accuracy score on two class agree and disagree ',accuracy_score(ypred_combined,ytest))

ValueError: Number of labels=190 does not match number of samples=380

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print(classification_report(ypred,ytest))
print(confusion_matrix(ypred,ytest))

              precision    recall  f1-score   support

   Unrelated       0.56      0.71      0.63         7
     related       0.85      0.73      0.79        15

    accuracy                           0.73        22
   macro avg       0.70      0.72      0.71        22
weighted avg       0.75      0.73      0.73        22

[[ 5  2]
 [ 4 11]]


In [None]:
Train_data2  =  pd.read_csv(r"C:\Users\Shreyansh\Capstone_files\final_train_data.csv")

In [None]:
Train_data2.head()

Unnamed: 0.1,Unnamed: 0,Headline,Body ID,articleBody,jaccard_similarity,Headline_Nouns,articleBody_Nouns,jaccard_similarity_nouns,Headline_POS,articleBody_POS,Stance,glove_similarity,kl_divergence,ngram_overlap,semantic_similarity,stance_base
0,0,police find mass graves least bodies near mexi...,712,danny boyle directing untitled film seth rogen...,0.0,town police mass graves bodies students clash,jobs gordon talks mark computers assistant off...,0.0,town police mexico least find mass disappeared...,actresses mark assistant offer walter film cre...,unrelated,0.656144,1.91531,3.36178,0.167548,unrelated
1,1,hundreds palestinians flee floods gaza israel ...,158,hundreds palestinians evacuated homes sunday m...,82.840237,dams opens hundreds floods palestinians,families years lack hazard people morning open...,129.87013,dams flee hundreds gaza opens israel floods pa...,prevented save families wake rain israeli drop...,agree,0.84872,0.567896,2.895936,0.409437,related
2,2,christian bale passes role steve jobs actor re...,137,moscow resident hospitalized wounds intimate n...,0.0,jobs bale part role passes,criminals actions adventures precision officia...,0.0,jobs right bale christian steve part role pass...,criminals actions spend convinced adventures e...,unrelated,0.705327,1.976065,3.36178,0.23382,unrelated
3,3,hbo apple talks apple tv streaming service lau...,1034,reuters canadian soldier shot canadian war mem...,0.0,tv apple hbo talks service april,police war amran towards teams soldiers hopkin...,0.0,tv apple hbo talks streaming service launching...,seen police fired war amran lockdown soldier c...,unrelated,0.687689,2.065698,2.895936,0.12345,unrelated
4,4,spider burrowed tourist stomach chest,1923,fear arachnophobes story bunbury spiderman mig...,28.169014,spider stomach chest,dig mark somebody humans spider mites bodies p...,41.237113,tourist stomach chest spider burrowed,seen dig mark somebody humans spider mites bod...,disagree,0.584935,0.379117,2.244121,0.510268,related


# LDA

In [None]:



def noun_extraction_pos(df, col):
    tags_to_keep = ['JJ', 'NN', 'NNP', 'NNPS', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    
    l1_nouns = []
    l1  = df[col]
    for l in l1:
        tokn1 = nltk.word_tokenize(l)
        nouns1 = [word for (word, pos) in nltk.pos_tag(tokn1) if(pos in tags_to_keep)]
        l1_nouns.append(nouns1)

    l1_nouns1 = []
    for i in l1_nouns:
        l1_nouns1.append(" ".join(set(i)))
    df[f"{col}_pos"] = l1_nouns1


In [None]:
noun_extraction_pos(Train_data2, "Headline")

In [None]:
Train_data2.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base,jaccard_similarity,Headline_pos
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related,168.224299,villiers welcomed newborn family daughter yent...
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related,211.764706,villiers welcomed newborn family daughter yent...
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related,211.764706,villiers welcomed newborn family daughter yent...
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related,216.86747,villiers welcomed newborn family daughter yent...
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related,137.40458,villiers welcomed newborn family daughter yent...


In [None]:
noun_extraction_pos(Train_data2, "articleBody")

In [None]:
Train_data2.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base,jaccard_similarity,Headline_pos,articleBody_pos
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related,168.224299,villiers welcomed newborn family daughter yent...,welcomed bangalore photo addition parents succ...
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related,211.764706,villiers welcomed newborn family daughter yent...,smile welcomed royal duo villers girl standing...
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related,211.764706,villiers welcomed newborn family daughter yent...,smile welcomed royal duo villers girl standing...
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related,216.86747,villiers welcomed newborn family daughter yent...,smile welcomed royal duo villers girl standing...
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related,137.40458,villiers welcomed newborn family daughter yent...,pulled welcomed fans postponed bangalore india...


In [None]:
Train_data2['jaccard_similarity_nouns'] = df['jaccard_similarity_nouns']

In [None]:
Train_data2.columns

Index(['Headline', 'Stance', 'articleBody', 'stance_cat', 'stance_base',
       'jaccard_similarity', 'Headline_pos', 'articleBody_pos',
       'jaccard_similarity_nouns'],
      dtype='object')

In [None]:
Train_data3 = Train_data2.copy()

In [None]:
Train_data2["Headline_Nouns"] = headline_nouns1   # consider Headline nouns

In [None]:
Train_data2.columns

Index(['Headline', 'Stance', 'articleBody', 'stance_cat', 'stance_base',
       'jaccard_similarity', 'Headline_pos', 'articleBody_pos',
       'jaccard_similarity_nouns', 'Headline_Nouns'],
      dtype='object')

In [None]:
def lemmatization(data,col):   # lemmatization of words
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    def lemmatize_text(text):
        return [lemmatizer.lemmatize(w , pos = 'v') for w in w_tokenizer.tokenize(text)]

    df = pd.DataFrame(data[col])
 
    data[col +'_lemmatized'] = df[col].apply(lemmatize_text)
    
    l = []
    for i in data[col +'_lemmatized']:
        l.append(" ".join(i))
    data[col +'_lemmatized'] = l

    

In [None]:
lemmatization(Train_data2 ,"articleBody_pos" )

In [None]:
lemmatization(Train_data2 ,"Headline_pos" )

In [None]:
Train_data2.head()

Unnamed: 0,Headline,Stance,articleBody,stance_cat,stance_base,jaccard_similarity,Headline_pos,articleBody_pos,jaccard_similarity_nouns,Headline_Nouns,articleBody_pos_lemmatized,Headline_pos_lemmatized
0,ab de villiers wife danielle welcomed newborn ...,Agree,south africa royal challengers bangalore batsm...,0,related,168.224299,villiers welcomed newborn family daughter yent...,welcomed bangalore photo addition parents succ...,285.714286,villiers family daughter yente danielle wife ab,welcome bangalore photo addition parent succes...,villiers welcome newborn family daughter yente...
1,ab de villiers wife danielle welcomed newborn ...,Agree,ani pretoria former south african cricketer ab...,0,related,211.764706,villiers welcomed newborn family daughter yent...,smile welcomed royal duo villers girl standing...,315.789474,villiers family daughter yente danielle wife ab,smile welcome royal duo villers girl stand ban...,villiers welcome newborn family daughter yente...
2,ab de villiers wife danielle welcomed newborn ...,Agree,pretoria former south african cricketer ab de ...,0,related,211.764706,villiers welcomed newborn family daughter yent...,smile welcomed royal duo villers girl standing...,307.692308,villiers family daughter yente danielle wife ab,smile welcome royal duo villers girl stand ban...,villiers welcome newborn family daughter yente...
3,ab de villiers wife danielle welcomed newborn ...,Agree,former south african cricketer ab de villiers ...,0,related,216.86747,villiers welcomed newborn family daughter yent...,smile welcomed royal duo villers girl standing...,324.324324,villiers family daughter yente danielle wife ab,smile welcome royal duo villers girl stand ban...,villiers welcome newborn family daughter yente...
4,ab de villiers wife danielle welcomed newborn ...,Agree,former south african skipper ab de villiers wi...,0,related,137.40458,villiers welcomed newborn family daughter yent...,pulled welcomed fans postponed bangalore india...,190.47619,villiers family daughter yente danielle wife ab,pull welcome fan postpone bangalore indian int...,villiers welcome newborn family daughter yente...


In [None]:
# DATASET_PATH = os.path.join(os.getcwd(),"C:\\Users\\Shreyansh\\Abhinav\data","")

# Train_data2 = pd.read_csv(DATASET_PATH+"Train_data2_combined.csv",encoding='ISO-8859-1')

In [None]:
Train_data2.columns

Index(['Headline', 'Stance', 'articleBody', 'stance_cat', 'stance_base',
       'jaccard_similarity', 'Headline_pos', 'articleBody_pos',
       'jaccard_similarity_nouns', 'Headline_Nouns',
       'articleBody_pos_lemmatized', 'Headline_pos_lemmatized'],
      dtype='object')

In [None]:
nf = Train_data2[["Headline_POS" , "articleBody_POS"]] 

In [None]:
filtered_texts = nf["Headline_POS"]

In [None]:
def LDA_Conversion_Matrix(column_name , no_of_topics):
    #modules to build the topic extracting models
    from gensim import corpora, models
    import gensim
    import pyLDAvis.gensim
    pyLDAvis.enable_notebook()
    import pandas as pd
    import matplotlib.pyplot as plt
    
    filtered_texts = list(column_name)
    #creating the dictionary
    dictionary = corpora.Dictionary([" ".join(filtered_texts).split()]) 
    #print('{} different terms in the corpus'.format(len(dictionary)))
    
    #creating the bag of words object
    
    
    bow_corpus = [dictionary.doc2bow(text.split()) for text in filtered_texts]
    tfidf_model = models.TfidfModel(bow_corpus) # creating the tf-idf model
    tfidf_corpus = tfidf_model[bow_corpus]
    
    total_topics = no_of_topics
    lda_model_tfidf = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=total_topics, 
                                  passes=1, random_state=47)
    lda_model_bow = models.LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=total_topics,
                                passes=1, random_state=47)
    
    
    data = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary)
    #print(data)
    
    
    new_basket = []   
    for j in filtered_texts:
        z = j
        vec_bow = dictionary.doc2bow(z.split())
        new_basket.append(vec_bow)
        
        
    vec_lda_topics_basket = []
    for i in new_basket:
        vec_lda_topics = lda_model_bow[i]
        vec_lda_topics_basket.append(vec_lda_topics)
    vec_lda_topics_basket
    
    
    
    #df = pd.DataFrame(vec_lda_topics_basket)
    #dff = df.stack().unstack(fill_value=(0, 0))
    
    return vec_lda_topics_basket


In [None]:
body_LDA = LDA_Conversion_Matrix( nf["articleBody_POS"] , 25)  # LDA score is calculated for (articles Parts of speech tagged ) and articles POS is formed into 25 clusters

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))


In [None]:
headline_LDA = LDA_Conversion_Matrix( nf["Headline_POS"] , 25)  # LDA score is calculated for (Headline Parts of speech tagged ) and Headline POS is formed into 25 clusters

In [None]:
len(body_LDA)

75385

In [None]:
len(headline_LDA)

75385

In [None]:
body_LDA[1]

[(1, 0.014937095),
 (5, 0.1084584),
 (6, 0.024180355),
 (7, 0.09436764),
 (8, 0.06832374),
 (13, 0.21895494),
 (16, 0.05352859),
 (17, 0.03672689),
 (20, 0.013868878),
 (21, 0.36266696)]

In [None]:
headline_LDA[1]

[(22, 0.893318)]

In [None]:
import gensim
from gensim import corpora
from gensim import models
from gensim.models import LdaModel
from gensim.models import TfidfModel
from gensim.models import CoherenceModel

In [None]:
masket = []
for i in range(len(headline_LDA)):
    simmilarity = gensim.matutils.cossim(headline_LDA[i], body_LDA[i])
    masket.append(simmilarity)
len(masket)

75385

In [None]:
masket

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.04774977550043799,
 0.0,
 0.0,
 0.0,
 0.0,
 0.028769211429950318,
 0.021454825665785257,
 0.0,
 0.13060725232795917,
 0.05880060755373019,
 0.0,
 0.0,
 0.0,
 0.3074746110427348,
 0.0,
 0.0,
 0.0,
 0.1445734933343478,
 0.4725068205964438,
 0.0,
 0.0,
 0.006564476197540084,
 0.31335646366249464,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.15163348233466642,
 0.0,
 0.2641518059804959,
 0.0,
 0.0,
 0.045453568964596514,
 0.3310114166387529,
 0.0912259586175136,
 0.013131234540366047,
 0.35956765163859533,
 0.0,
 0.0,
 0.05087036306988893,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.09398482580834541,
 0.06287793874294252,
 0.021326382231501743,
 0.0,
 0.15931861478477732,
 0.02183246228153673,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.958269645396889,
 0.0,
 0.0,
 0.0,
 0.01466607341002623,
 0.2314524996104179,
 0.0,
 0.0,
 0.0,
 0.027718631717102704,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.01742209873230496,
 0.0,
 0.0,
 0.8782162731032785,
 0.0,
 0.0,
 0.08555787894430515,
 

In [None]:
Train_data2["LDA_Score"] = pd.DataFrame(masket)
Train_data2.columns

Index(['Unnamed: 0', 'Headline', 'Body ID', 'articleBody',
       'jaccard_similarity', 'Headline_Nouns', 'articleBody_Nouns',
       'jaccard_similarity_nouns', 'Headline_POS', 'articleBody_POS', 'Stance',
       'glove_similarity', 'kl_divergence', 'ngram_overlap',
       'semantic_similarity', 'stance_base', 'LDA_Score'],
      dtype='object')

In [None]:
Train_data_new  = Train_data2[["Headline" ,"articleBody" ,"stance_base" ,"LDA_Score" , "jaccard_similarity", "jaccard_similarity_nouns"]]
Train_data_new.head(5)

Unnamed: 0,Headline,articleBody,stance_base,LDA_Score,jaccard_similarity,jaccard_similarity_nouns
0,police find mass graves least bodies near mexi...,danny boyle directing untitled film seth rogen...,unrelated,0.0,0.0,0.0
1,hundreds palestinians flee floods gaza israel ...,hundreds palestinians evacuated homes sunday m...,related,0.0,82.840237,129.87013
2,christian bale passes role steve jobs actor re...,moscow resident hospitalized wounds intimate n...,unrelated,0.0,0.0,0.0
3,hbo apple talks apple tv streaming service lau...,reuters canadian soldier shot canadian war mem...,unrelated,0.0,0.0,0.0
4,spider burrowed tourist stomach chest,fear arachnophobes story bunbury spiderman mig...,related,0.0,28.169014,41.237113


In [None]:
Train_data2.columns

Index(['Unnamed: 0', 'Headline', 'Body ID', 'articleBody',
       'jaccard_similarity', 'Headline_Nouns', 'articleBody_Nouns',
       'jaccard_similarity_nouns', 'Headline_POS', 'articleBody_POS', 'Stance',
       'glove_similarity', 'kl_divergence', 'ngram_overlap',
       'semantic_similarity', 'stance_base', 'LDA_Score'],
      dtype='object')

In [None]:
Train_data2.shape

(75385, 17)

In [None]:
#Train_data2.to_csv("final_train_data.csv")

In [None]:
x = Train_data2[['LDA_Score',"jaccard_similarity", "jaccard_similarity_nouns"]]
y = Train_data2['stance_base']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1,random_state= 101 )


In [None]:
ytrain

42     Unrelated
112      related
121      related
66       related
183      related
         ...    
63     Unrelated
70     Unrelated
81       related
11       related
95       related
Name: stance_base, Length: 190, dtype: object

In [None]:
print('X Training shape',xtrain.shape)
print('Y Training shape',ytrain.shape)
xtrain = xtrain.values.reshape(-1,1)
xtest = xtest.values.reshape(-1,1)

X Training shape (190, 3)
Y Training shape (190,)


In [None]:
rg = RandomForestClassifier(n_estimators=100,n_jobs=-1)

In [None]:
# rg.fit(xtrain,ytrain)
# ypred = rg.predict(xtest)
# print('Accuracy score on two class agree and disagree ',accuracy_score(ypred,ytest))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print(classification_report(ypred,ytest))
print(confusion_matrix(ypred,ytest))

              precision    recall  f1-score   support

   Unrelated       0.56      0.71      0.63         7
     related       0.85      0.73      0.79        15

    accuracy                           0.73        22
   macro avg       0.70      0.72      0.71        22
weighted avg       0.75      0.73      0.73        22

[[ 5  2]
 [ 4 11]]


In [None]:
#filtered_texts = pd.DataFrame(filtered_texts)

In [None]:
filtered_texts = list(filtered_texts)

In [None]:
filtered_texts

['town police mexico least find mass disappeared graves bodies students clash',
 'dams flee hundreds gaza opens israel floods palestinians',
 'jobs right bale christian steve part role passes felt nt actor',
 'tv apple hbo talks streaming service launching april',
 'tourist stomach chest spider burrowed',
 'goes confirms news total december experience days story viral earth fake darkness',
 'accused prison marathon walk bomber talk injured boston',
 'revealed terrorist jihadi identity john known isis',
 'revealed hoax arrested identity year last real',
 'aid murdered worker isis british confirmed',
 'pundit gateway',
 'woman wife albaghdadi lebanon detained iraq says',
 'schoolgirls ceasefire nigerian government deal haram home girls missing bring claims kidnapped boko',
 'kid high make school trading stocks',
 'york arrest new marijuana lead ticket',
 'add turned vandals job paint hoax bugatti owner rude',
 'denies boko ceasefire nigeria claim haram',
 'rip plant robert contract',
 'c

### Building the models to find correlated concepts

In [None]:
#modules to build the topic extracting models
from gensim import corpora, models
import gensim
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
#creating the dictionary
dictionary = corpora.Dictionary([" ".join(filtered_texts).split()]) 
print('{} different terms in the corpus'.format(len(dictionary)))
#creating the bag of words object
bow_corpus = [dictionary.doc2bow(text.split()) for text in filtered_texts]

3070 different terms in the corpus


In [None]:
len(bow_corpus)

49972

In [None]:
tfidf_model = models.TfidfModel(bow_corpus) # creating the tf-idf model
tfidf_corpus = tfidf_model[bow_corpus]

### Building the LDA models.

In [None]:
total_topics = 50
lda_model_tfidf = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=total_topics, 
                                  passes=1, random_state=47)
lda_model_bow = models.LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=total_topics,
                                passes=1, random_state=47)

In [None]:
data = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary)
data

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))


### Calculating the topics for documents 03, 12 and 17

In [None]:
#basket = []
#for i in filtered_texts:
    #basket.append(i)
new_basket = []   
for j in filtered_texts:
    z = j
    vec_bow = dictionary.doc2bow(z.split())
    new_basket.append(vec_bow)
    
new_basket[17]
    
    

[(600, 1), (1956, 1), (2238, 1), (2248, 1)]

In [None]:
y = 'didnt robert rip plant contract'

In [None]:
y = filtered_texts[17]

In [None]:
vec_bow_179 = dictionary.doc2bow(y.split())

In [None]:
vec_lda_topics_basket = []
for i in new_basket:
    vec_lda_topics = lda_model_bow[i]
    vec_lda_topics_basket.append(vec_lda_topics)
vec_lda_topics_basket    

[[(27, 0.3348677), (38, 0.58512986)],
 [(16, 0.78001714), (29, 0.113311775)],
 [(7, 0.5679284), (9, 0.3520707)],
 [(14, 0.89111036)],
 [(40, 0.83666575)],
 [(19, 0.9246135)],
 [(33, 0.8911098)],
 [(6, 0.12749125), (12, 0.16246559), (32, 0.2524995), (43, 0.3425408)],
 [(20, 0.5015746), (32, 0.37842375)],
 [(4, 0.15875272), (15, 0.55964375), (19, 0.1473157)],
 [(0, 0.02),
  (1, 0.02),
  (2, 0.02),
  (3, 0.02),
  (4, 0.02),
  (5, 0.02),
  (6, 0.02),
  (7, 0.02),
  (8, 0.02),
  (9, 0.02),
  (10, 0.02),
  (11, 0.02),
  (12, 0.02),
  (13, 0.02),
  (14, 0.02),
  (15, 0.02),
  (16, 0.02),
  (17, 0.02),
  (18, 0.02),
  (19, 0.02),
  (20, 0.02),
  (21, 0.02),
  (22, 0.02),
  (23, 0.02),
  (24, 0.02),
  (25, 0.02),
  (26, 0.02),
  (27, 0.02),
  (28, 0.02),
  (29, 0.02),
  (30, 0.02),
  (31, 0.02),
  (32, 0.02),
  (33, 0.02),
  (34, 0.02),
  (35, 0.02),
  (36, 0.02),
  (37, 0.02),
  (38, 0.02),
  (39, 0.02),
  (40, 0.02),
  (41, 0.02),
  (42, 0.02),
  (43, 0.02),
  (44, 0.02),
  (45, 0.02),
  (46,

In [None]:
vec_bow_17 = dictionary.doc2bow(filtered_texts[17].split())  # creating corpus
vec_bow_03 = dictionary.doc2bow(filtered_texts[3].split())
vec_bow_12 = dictionary.doc2bow(filtered_texts[12].split())

In [None]:
vec_bow_17

[(600, 1), (1956, 1), (2238, 1), (2248, 1)]

### Viewing the topics of the selected documents

In [None]:
vec_lda_topics_17 = lda_model_bow[vec_bow_17]
vec_lda_topics_03 = lda_model_bow[vec_bow_03]
vec_lda_topics_12 = lda_model_bow[vec_bow_12]
print ('document 03 topics: ', vec_lda_topics_03)
print ('document 12 topics: ', vec_lda_topics_12)
print ('document 17 topics: ', vec_lda_topics_17)

document 03 topics:  [(14, 0.89111036)]
document 12 topics:  [(39, 0.9299994)]
document 17 topics:  [(9, 0.8039964)]


In [None]:
df1 = pd.DataFrame(vec_lda_topics_03, columns=['topic', 'contrib'])
df1['doc'] = 'doc_03'
df2 = pd.DataFrame(vec_lda_topics_12, columns=['topic', 'contrib'])
df2['doc'] = 'doc_12'
df3 = pd.DataFrame(vec_lda_topics_17, columns=['topic', 'contrib'])
df3['doc'] = 'doc_17'

In [None]:
dff = dff.stack().unstack(fill_value=(0, 0))

In [None]:
#vec_lda_topics_basket

In [None]:
dff = pd.DataFrame(vec_lda_topics_basket)
dff.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,"(27, 0.3348677)","(38, 0.58512986)",,,,,,,,,...,,,,,,,,,,
1,"(16, 0.78001714)","(29, 0.113311775)",,,,,,,,,...,,,,,,,,,,
2,"(7, 0.5679284)","(9, 0.3520707)",,,,,,,,,...,,,,,,,,,,
3,"(14, 0.89111036)",,,,,,,,,,...,,,,,,,,,,
4,"(40, 0.83666575)",,,,,,,,,,...,,,,,,,,,,
5,"(19, 0.9246135)",,,,,,,,,,...,,,,,,,,,,
6,"(33, 0.8911098)",,,,,,,,,,...,,,,,,,,,,
7,"(6, 0.12749125)","(12, 0.16246559)","(32, 0.2524995)","(43, 0.3425408)",,,,,,,...,,,,,,,,,,
8,"(20, 0.5015746)","(32, 0.37842375)",,,,,,,,,...,,,,,,,,,,
9,"(4, 0.15875272)","(15, 0.55964375)","(19, 0.1473157)",,,,,,,,...,,,,,,,,,,


### Calculating the simmilarity between the documents
Documents 3 and 17 are closer to one another than 3 is to 12 or 12 is to 17.

In [None]:
dff.iloc[5]

0     (19, 0.9246135)
1              (0, 0)
2              (0, 0)
3              (0, 0)
4              (0, 0)
5              (0, 0)
6              (0, 0)
7              (0, 0)
8              (0, 0)
9              (0, 0)
10             (0, 0)
11             (0, 0)
12             (0, 0)
13             (0, 0)
14             (0, 0)
15             (0, 0)
16             (0, 0)
17             (0, 0)
18             (0, 0)
19             (0, 0)
20             (0, 0)
21             (0, 0)
22             (0, 0)
23             (0, 0)
24             (0, 0)
25             (0, 0)
26             (0, 0)
27             (0, 0)
28             (0, 0)
29             (0, 0)
30             (0, 0)
31             (0, 0)
32             (0, 0)
33             (0, 0)
34             (0, 0)
35             (0, 0)
36             (0, 0)
37             (0, 0)
38             (0, 0)
39             (0, 0)
40             (0, 0)
41             (0, 0)
42             (0, 0)
43             (0, 0)
44             (0, 0)
45        

In [None]:
list(dff.iloc[3])

[(14, 0.89111036),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0)]

In [None]:
simmilarity_03_12 = gensim.matutils.cossim(vec_lda_topics_basket[3], vec_lda_topics_basket[3])
#simmilarity_03_17 = gensim.matutils.cossim(vec_lda_topics_03, vec_lda_topics_17)
#simmilarity_12_17 = gensim.matutils.cossim(vec_lda_topics_12, vec_lda_topics_17)

In [None]:
print('similarity between docs 03 and 12', simmilarity_03_12)
#print('similarity between docs 03 and 17', simmilarity_03_17)
#print('similarity between docs 12 and 17', simmilarity_12_17)

similarity between docs 03 and 12 1.0


In [None]:
null_frame = dff.copy()

In [None]:
null_frame2 = pd.DataFrame(columns=dff.columns, index=dff.index)

In [None]:
vec_lda_topics_basket

[[(27, 0.3348677), (38, 0.58512986)],
 [(16, 0.78001714), (29, 0.113311775)],
 [(7, 0.5679284), (9, 0.3520707)],
 [(14, 0.89111036)],
 [(40, 0.83666575)],
 [(19, 0.9246135)],
 [(33, 0.8911098)],
 [(6, 0.12749125), (12, 0.16246559), (32, 0.2524995), (43, 0.3425408)],
 [(20, 0.5015746), (32, 0.37842375)],
 [(4, 0.15875272), (15, 0.55964375), (19, 0.1473157)],
 [(0, 0.02),
  (1, 0.02),
  (2, 0.02),
  (3, 0.02),
  (4, 0.02),
  (5, 0.02),
  (6, 0.02),
  (7, 0.02),
  (8, 0.02),
  (9, 0.02),
  (10, 0.02),
  (11, 0.02),
  (12, 0.02),
  (13, 0.02),
  (14, 0.02),
  (15, 0.02),
  (16, 0.02),
  (17, 0.02),
  (18, 0.02),
  (19, 0.02),
  (20, 0.02),
  (21, 0.02),
  (22, 0.02),
  (23, 0.02),
  (24, 0.02),
  (25, 0.02),
  (26, 0.02),
  (27, 0.02),
  (28, 0.02),
  (29, 0.02),
  (30, 0.02),
  (31, 0.02),
  (32, 0.02),
  (33, 0.02),
  (34, 0.02),
  (35, 0.02),
  (36, 0.02),
  (37, 0.02),
  (38, 0.02),
  (39, 0.02),
  (40, 0.02),
  (41, 0.02),
  (42, 0.02),
  (43, 0.02),
  (44, 0.02),
  (45, 0.02),
  (46,

In [None]:
vec_lda_topics_basket[0]

[(27, 0.3348677), (38, 0.58512986)]

In [None]:
basket_for_cosine = []
for j in range(len(vec_lda_topics_basket)):
    for i in vec_lda_topics_basket[j]:
        index = vec_lda_topics_basket.index(vec_lda_topics_basket[j])
        add = [index] + list(i)
        basket_for_cosine.append(add)
    
#basket

In [None]:
cosine_matrix = pd.DataFrame(basket_for_cosine, columns =['row_index', 'column_index', 'Score']) 
  

In [None]:
cosine_matrix.to_csv('LDA_frame1.csv') 

In [None]:
cosine_matrix.head()

Unnamed: 0,row_index,column_index,Score
0,0,27,0.3348676860332489
1,0,38,0.5851298570632935
2,1,16,0.7800171375274658
3,1,29,0.1133117750287056
4,2,7,0.5679283738136292


In [None]:
cosine_matrix['row_index']= cosine_matrix['row_index'].map(str)
cosine_matrix['column_index']= cosine_matrix['column_index'].map(str)
cosine_matrix['Score']= cosine_matrix['Score'].map(str)

In [None]:
a = cosine_matrix.copy()
b = a.drop_duplicates()
b.shape

In [None]:
b.head()

In [None]:
len(basket)

In [None]:
x = [1,2]
y = [3,4]
x + y

In [None]:
null_frame2

In [None]:
df.to_csv(index=False)

In [None]:
nf.to_csv(r"C:\Users\Shreyansh\Abhinav\feature_selection_matrix.csv", encoding='ISO-8859-1')