In [0]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [4]:
import pandas as pd
# Load the headline dataset. The rendered table shows the columns
# "lebel" (category label), "news" (headline text), "authors" and "len".
df = pd.read_csv("data1.csv")
df

Unnamed: 0,lebel,news,authors,len
0,IndiaNonPolitical,🪔,dhisum_dhisum,1
1,IndiaNonPolitical,😍,Kaptaaan,1
2,IndiaNonPolitical,“Tussi Na Jao!” How Do You Convince Indian Par...,Ajaatshatru34,14
3,Coronavirus,"“One Piece"" & “Digimon Adventure (2020)"" anime...",DemiFiendRSA,14
4,IndianCinema,“Masterpiece”…An underwhelming masala film bar...,crazieab,10
...,...,...,...,...
4549,IndiaNonPolitical,"""My parents do not even know what exam I have ...",gary2812,24
4550,IndianFood,"""Breath Fresheners"" at Indian restaurants - wh...",ishikiera,16
4551,IndianCinema,"""Bollywood"" the INDIAN cinema",akshayrana1998,4
4552,IndiaNonPolitical,"""Blockchain Technology Center of Excellence"" I...",Askrypto,8


In [5]:
import nltk
# Fetch NLTK's stop-word corpus, then keep the English list as a set:
# the stop-word removal cell below tests membership once per word.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Work on a copy so the raw headlines in `df` stay untouched; the
# recommender functions below display the original text from `df`.
df2 = df.copy()

In [7]:
df2  # preview the working copy before any preprocessing

Unnamed: 0,lebel,news,authors,len
0,IndiaNonPolitical,🪔,dhisum_dhisum,1
1,IndiaNonPolitical,😍,Kaptaaan,1
2,IndiaNonPolitical,“Tussi Na Jao!” How Do You Convince Indian Par...,Ajaatshatru34,14
3,Coronavirus,"“One Piece"" & “Digimon Adventure (2020)"" anime...",DemiFiendRSA,14
4,IndianCinema,“Masterpiece”…An underwhelming masala film bar...,crazieab,10
...,...,...,...,...
4549,IndiaNonPolitical,"""My parents do not even know what exam I have ...",gary2812,24
4550,IndianFood,"""Breath Fresheners"" at Indian restaurants - wh...",ishikiera,16
4551,IndianCinema,"""Bollywood"" the INDIAN cinema",akshayrana1998,4
4552,IndiaNonPolitical,"""Blockchain Technology Center of Excellence"" I...",Askrypto,8


In [8]:
# Search engines ignore stop words such as "a", "an", "the", "in",
# so we remove them from every headline (and lower-case the text).
def _remove_stop_words(headline):
    """Lower-case `headline` and drop every space-separated word found in `stop_words`.

    Each kept word is followed by a single space, so a non-empty result
    ends with a trailing space.
    """
    lowered = (raw_word.lower() for raw_word in headline.split(" "))
    return "".join(word + " " for word in lowered if word not in stop_words)


# Assign the whole column in one step. Element-wise chained assignment
# (df2["news"][i] = ...) raises SettingWithCopyWarning and may write to a
# temporary copy instead of df2. Reading from the untouched `df` keeps this
# cell idempotent: re-running it yields the same column.
df2["news"] = df["news"].map(_remove_stop_words)
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [9]:
df2  # the stop words have now been removed from the "news" column (text is also lower-cased)

Unnamed: 0,lebel,news,authors,len
0,IndiaNonPolitical,🪔,dhisum_dhisum,1
1,IndiaNonPolitical,😍,Kaptaaan,1
2,IndiaNonPolitical,“tussi na jao!” convince indian parents want m...,Ajaatshatru34,14
3,Coronavirus,"“one piece"" & “digimon adventure (2020)"" anime...",DemiFiendRSA,14
4,IndianCinema,“masterpiece”…an underwhelming masala film bar...,crazieab,10
...,...,...,...,...
4549,IndiaNonPolitical,"""my parents even know exam cleared happy since...",gary2812,24
4550,IndianFood,"""breath fresheners"" indian restaurants - uniqu...",ishikiera,16
4551,IndianCinema,"""bollywood"" indian cinema",akshayrana1998,4
4552,IndiaNonPolitical,"""blockchain technology center excellence"" inau...",Askrypto,8


In [10]:
import nltk
# Download the WordNet corpus; presumably needed by the WordNetLemmatizer used in the next cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [11]:
# Text normalisation: reduce every token to a base form so that inflected
# variants of a word share one feature.
# - Stemming maps related words to a common root, even if that root is not a real word.
# - Lemmatization maps an inflected word to its dictionary form (lemma).
# Here every token is lemmatized as a verb (pos="v").

import nltk
nltk.download('punkt')  # tokenizer models used by word_tokenize
lemmatizer = WordNetLemmatizer()
for row in range(len(df2["news"])):
    lemmas = [lemmatizer.lemmatize(token, pos="v")
              for token in word_tokenize(df2["news"][row])]
    # Tokens are re-joined with single spaces; surrounding whitespace is stripped.
    df2.at[row, "news"] = " ".join(lemmas).strip()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
df2    # in the 4th entry of the data we can see how "wasted" changed to "waste"

Unnamed: 0,lebel,news,authors,len
0,IndiaNonPolitical,🪔,dhisum_dhisum,1
1,IndiaNonPolitical,😍,Kaptaaan,1
2,IndiaNonPolitical,“ tussi na jao ! ” convince indian parent want...,Ajaatshatru34,14
3,Coronavirus,“ one piece '' & “ digimon adventure ( 2020 ) ...,DemiFiendRSA,14
4,IndianCinema,“ masterpiece ” …an underwhelming masala film ...,crazieab,10
...,...,...,...,...
4549,IndiaNonPolitical,`` my parent even know exam clear happy since ...,gary2812,24
4550,IndianFood,`` breath fresheners '' indian restaurants - u...,ishikiera,16
4551,IndianCinema,`` bollywood '' indian cinema,akshayrana1998,4
4552,IndiaNonPolitical,`` blockchain technology center excellence '' ...,Askrypto,8


In [0]:
# Bag-of-words representation: learn the vocabulary of the preprocessed
# headlines and turn each headline into a vector of term counts.
news_vectorizer =  CountVectorizer()
news_features =news_vectorizer.fit_transform(df2["news"])  # one row per headline, one column per vocabulary term

In [14]:
news_features.get_shape()  # (number of headlines, number of vocabulary terms)

(4554, 8592)

In [0]:
import numpy as np

In [16]:
def bag_of_words_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to a queried headline in bag-of-words space.

    Distances come from ``pairwise_distances`` (Euclidean by default) between
    ``news_features[row_index]`` and every row of ``news_features``.

    Parameters
    ----------
    row_index : int
        Position of the queried headline in ``news_features`` / ``df``.
    num_similar_items : int
        Number of nearest rows to consider, counting the queried headline
        itself, so ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``headline`` and ``Euclidean similarity with the
        queried article``, indexed from 1, nearest first. Despite its name,
        the last column holds a distance: smaller means more similar.
    """
    couple_dist = pairwise_distances(news_features, news_features[row_index]).ravel()
    # Resolve negative positions and fail early on an out-of-range index.
    query_pos = range(len(couple_dist))[row_index]
    # A stable sort keeps tied rows in dataset order, so results are reproducible.
    ranked = np.argsort(couple_dist, kind="stable")
    # Remove the query by position rather than assuming it sorts first:
    # duplicate headlines also have distance 0 and could take the first slot.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    df0 = pd.DataFrame(
        {'label': df['lebel'].iloc[neighbours].values,
         'headline': df['news'].iloc[neighbours].values,
         'Euclidean similarity with the queried article': couple_dist[neighbours]},
        index=pd.RangeIndex(1, len(neighbours) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',df['news'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df0


bag_of_words_based_model(1770, 11)  # Change the row index for any other queried article
    

headline :  Modi Sarkar ‘loses control’ over fuel prices again!



Unnamed: 0,label,headline,Euclidean similarity with the queried article
1,IndiaNonPolitical,🪔,2.645751
2,IndiaNonPolitical,😍,2.645751
3,IndianCinema,2-2-2-2-2-2,2.645751
4,IndianFood,Ingredients,2.828427
5,indianeconomy,The Looters!,2.828427
6,IndiaNonPolitical,Elite,2.828427
7,IndiaNonPolitical,Can relate,2.828427
8,IndiaNonPolitical,अखबार,2.828427
9,indiansports,Sports,2.828427
10,indianeconomy,Did Modi Govt Step Back On Its Stand of Fuel P...,2.828427


In [0]:
#The function above recommends 10 similar articles based on the queried article. It accepts two arguments:
#the index of the already-read article and the total number of articles to retrieve (including the queried one).
#Since the bag-of-words method gives importance to the most frequent words in our corpus, we will next use the TF-IDF method.

#TF-IDF is a method that gives more weight to less frequent words. It is calculated from the term frequency (TF) of a word in a document
#and its inverse document frequency (IDF):
#TF(i,j) = (# times word i appears in document j) / (# words in document j)

#IDF(i,D) = log_e((# documents in the corpus D) / (# documents containing word i))

#weight(i,j) = TF(i,j) x IDF(i,D)
#So if a word occurs many times in a document but only rarely in all other documents,
#then its TF-IDF value will be high.



In [0]:
# TF-IDF representation of the preprocessed headlines.
# min_df=1 (the library default) keeps every term that occurs in at least one
# headline. That is the same vocabulary an integer min_df of 0 would give, but
# recent scikit-learn releases reject an integer min_df below 1.
tfidf_headline_vectorizer = TfidfVectorizer(min_df=1)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(df2['news'])

In [18]:
def tfidf_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to a queried headline in TF-IDF space.

    Distances come from ``pairwise_distances`` (Euclidean by default) between
    ``tfidf_headline_features[row_index]`` and every row of
    ``tfidf_headline_features``.

    Parameters
    ----------
    row_index : int
        Position of the queried headline in ``tfidf_headline_features`` / ``df``.
    num_similar_items : int
        Number of nearest rows to consider, counting the queried headline
        itself, so ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``headline`` and ``Euclidean similarity with the
        queried article``, indexed from 1, nearest first. Despite its name,
        the last column holds a distance: smaller means more similar.
    """
    couple_dist = pairwise_distances(tfidf_headline_features, tfidf_headline_features[row_index]).ravel()
    # Resolve negative positions and fail early on an out-of-range index.
    query_pos = range(len(couple_dist))[row_index]
    # A stable sort keeps tied rows in dataset order, so results are reproducible.
    ranked = np.argsort(couple_dist, kind="stable")
    # Remove the query by position rather than assuming it sorts first:
    # duplicate headlines also have distance 0 and could take the first slot.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    df0 = pd.DataFrame(
        {'label': df['lebel'].iloc[neighbours].values,
         'headline': df['news'].iloc[neighbours].values,
         'Euclidean similarity with the queried article': couple_dist[neighbours]},
        index=pd.RangeIndex(1, len(neighbours) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',df['news'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df0


tfidf_based_model(1770, 11)

headline :  Modi Sarkar ‘loses control’ over fuel prices again!



Unnamed: 0,label,headline,Euclidean similarity with the queried article
1,IndiaNonPolitical,🪔,1.0
2,IndiaNonPolitical,😍,1.0
3,IndianCinema,2-2-2-2-2-2,1.0
4,indianeconomy,Did Modi Govt Step Back On Its Stand of Fuel P...,1.109841
5,indianeconomy,Are Your Noticing The Daily-Price Revision Of ...,1.214224
6,IndiaNonPolitical,Late Night Thread to share your 'never again' ...,1.244199
7,IndiaNonPolitical,Late Night Thread to talk about friends you ma...,1.283268
8,rajneeti,Welcome Home Mr. Modi: (All my issues with Mod...,1.286871
9,Coronavirus,Germany starts opening up as coronavirus outbr...,1.290661
10,Coronavirus,Navajo Nation has lost more to coronavirus tha...,1.302002


In [0]:
#here we can see how our results are greatly improved
#still, the tf-idf model doesn't capture the semantics of a news headline; for this we will use word2vec

In [20]:
# Download the pretrained GoogleNews word2vec vectors (about 1.5 GB) into /root/input/.
# -P sets the target directory; -c resumes a partially downloaded file.
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-04-25 18:08:47--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.0.174
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.0.174|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘/root/input/GoogleNews-vectors-negative300.bin.gz’


2020-04-25 18:09:04 (95.5 MB/s) - ‘/root/input/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [21]:
# Install gensim, which provides the KeyedVectors loader used below.
# NOTE(review): the version is not pinned; cells below that rely on gensim-version-specific
# attributes may behave differently depending on what gets installed.
!pip install gensim
from gensim.models import KeyedVectors



In [0]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

In [23]:
# Path of the archive downloaded by the wget cell above.
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz' # from above
# Load the pretrained vectors; binary=True because the file is in the binary word2vec format.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# List of every word known to the pretrained model.
# gensim 4 replaced KeyedVectors.vocab with key_to_index; support both so the
# cell works with whichever gensim version the unpinned install provides.
vocabulary = list(word2vec.key_to_index if hasattr(word2vec, "key_to_index") else word2vec.vocab)


In [0]:
# Represent every headline by the average of its word2vec word vectors.
w2v_headline = []
for headline in df2['news']:
    tokens = headline.split()
    headline_vec = np.zeros(word2vec.vector_size, dtype="float32")
    for token in tokens:
        # Membership is tested on the KeyedVectors object (a hash lookup);
        # scanning the multi-million-entry `vocabulary` list for every token
        # is prohibitively slow.
        if token in word2vec:
            headline_vec = np.add(headline_vec, word2vec[token])
    # The sum is divided by the total token count, so out-of-vocabulary tokens
    # still count towards the average. A headline with no tokens keeps the
    # zero vector instead of becoming NaN through a 0/0 division.
    if tokens:
        headline_vec = np.divide(headline_vec, len(tokens))
    w2v_headline.append(headline_vec)
vector_representation2 = pd.DataFrame(w2v_headline)

# 2-D array (one row per headline) used by the distance computation below.
w2v_headline = np.array(w2v_headline)

In [0]:

# Persist the averaged word2vec features and the preprocessed headlines.
# The name `vector_representation` is never defined in this notebook (it fails
# on a fresh kernel), so each file is written from the frame its name refers to.
vector_representation2.to_csv(r'/media/vector_representation2.csv',index=False)
df2.to_csv(r'/media/df2.csv',index=False)

In [35]:
vocabulary[:100]  # preview only: displaying the full list would flood the notebook output

['</s>',
 'in',
 'for',
 'that',
 'is',
 'on',
 '##',
 'The',
 'with',
 'said',
 'was',
 'the',
 'at',
 'not',
 'as',
 'it',
 'be',
 'from',
 'by',
 'are',
 'I',
 'have',
 'he',
 'will',
 'has',
 '####',
 'his',
 'an',
 'this',
 'or',
 'their',
 'who',
 'they',
 'but',
 '$',
 'had',
 'year',
 'were',
 'we',
 'more',
 '###',
 'up',
 'been',
 'you',
 'its',
 'one',
 'about',
 'would',
 'which',
 'out',
 'can',
 'It',
 'all',
 'also',
 'two',
 'after',
 'first',
 'He',
 'do',
 'time',
 'than',
 'when',
 'We',
 'over',
 'last',
 'new',
 'other',
 'her',
 'people',
 'into',
 'In',
 'our',
 'there',
 'A',
 'she',
 'could',
 'just',
 'years',
 'some',
 'U.S.',
 'three',
 'million',
 'them',
 'what',
 'But',
 'so',
 'no',
 'like',
 'if',
 'only',
 'percent',
 'get',
 'did',
 'him',
 'game',
 'back',
 'because',
 'now',
 '#.#',
 'before',
 'company',
 'any',
 'team',
 'against',
 'off',
 'This',
 'most',
 'made',
 'through',
 'make',
 'second',
 'state',
 'well',
 'day',
 'season',
 'says',
 'w

In [36]:
def avg_w2v_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to a queried headline in averaged word2vec space.

    Distances come from ``pairwise_distances`` (Euclidean by default) between
    ``w2v_headline[row_index]`` and every row of ``w2v_headline``.

    Parameters
    ----------
    row_index : int
        Position of the queried headline in ``w2v_headline`` / ``df``.
    num_similar_items : int
        Number of nearest rows to consider, counting the queried headline
        itself, so ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``headline`` and ``Euclidean similarity with the
        queried article``, indexed from 1, nearest first. Despite its name,
        the last column holds a distance: smaller means more similar.
    """
    couple_dist = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1)).ravel()
    # Resolve negative positions and fail early on an out-of-range index.
    query_pos = range(len(couple_dist))[row_index]
    # A stable sort keeps tied rows in dataset order, so results are reproducible.
    ranked = np.argsort(couple_dist, kind="stable")
    # Remove the query by position rather than assuming it sorts first:
    # duplicate headlines also have distance 0 and could take the first slot.
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    df0 = pd.DataFrame(
        {'label': df['lebel'].iloc[neighbours].values,
         'headline': df['news'].iloc[neighbours].values,
         'Euclidean similarity with the queried article': couple_dist[neighbours]},
        index=pd.RangeIndex(1, len(neighbours) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',df['news'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df0


avg_w2v_based_model(1770, 11)

headline :  Modi Sarkar ‘loses control’ over fuel prices again!



Unnamed: 0,label,headline,Euclidean similarity with the queried article
1,rajneeti,Welcome Home Mr. Modi: (All my issues with Mod...,0.756725
2,indianeconomy,India's fuel consumption to be higher in 18 mo...,0.758907
3,IndianCinema,"Just saw Lie [Telugu], can’t believe it’s the ...",0.770231
4,indianeconomy,EXPLAINED : India Slips 10 Places On WEF's Glo...,0.786239
5,IndiaNonPolitical,"Adi Shankaracharya, India's greatest philosoph...",0.792677
6,indianeconomy,Did Modi Govt Step Back On Its Stand of Fuel P...,0.794368
7,indianeconomy,Big push for Make in India: Ahead of PM Modi's...,0.795014
8,IndianCinema,"Netflix adds Q's Gandu, Tasher Desh, and Ludo;...",0.803299
9,IndiaNonPolitical,"New way to generate electricity, Ambani Bhai a...",0.804452
10,indianeconomy,"Time-tested tool to win elections, Modi lets I...",0.809651


In [0]:
#as soon as we start capturing the semantics of the news headlines, we can see how our model's accuracy improves

In [0]:
# Wrap the `vocabulary` list in a single-column DataFrame so it can be written to CSV.
wordscointainedinourdataset = pd.DataFrame(vocabulary)

In [0]:
wordscointainedinourdataset.to_csv(r'/media/all_words.csv',index=False) # cache the word list in a CSV file, because rebuilding it
#would take time every time we run our web app.
#NOTE(review): `vocabulary` is built from the pretrained word2vec model, so this file holds the model's
#full vocabulary rather than only the words encountered in our dataset - confirm this is intended.