<a href="https://colab.research.google.com/github/sarvesh237/NewsRecommenderIDC401/blob/master/NewsRecommenderAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [1]:
import pandas as pd
import numpy as np

#for lemmatization
import nltk
nltk.download("wordnet")

#for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# **Importing the collected data**

In [2]:
df_1 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/agrima_news_data.csv")
df_2 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/sarvesh_news_data.csv")
df_3 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/satender_news_data.csv")
df_4 = pd.read_csv("https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/vishal_news_data.csv")

# **Preprocessing the csv files.**

In [3]:
del df_1['Unnamed: 0']
del df_2['index']
del df_2['Unnamed: 0']
df_2.rename(columns = {'content':'Content'}, inplace = True)
del df_3['Unnamed: 0']
del df_3['Title']
del df_4['Unnamed: 0']

**Remove capitalization, numbers, non-english characters, drop empty rows and duplicates.**

In [4]:
news_corpus = pd.concat([df_1,df_2,df_3,df_4],ignore_index=True)
news_corpus = news_corpus.dropna() #dropping NaN
news_corpus = news_corpus[news_corpus.Content != ''] #dropping empty rows
news_corpus = news_corpus.drop_duplicates()
news_corpus["Content"] = news_corpus['Content'].str.replace('[^\w\s]',' ')
news_corpus = news_corpus[news_corpus['Content'].map(lambda x: x.isascii())] #remove non-english #find a better way
news_corpus['Content'] = news_corpus['Content'].str.lower() #convert to lowercase
news_corpus.replace('\d+', '', regex=True, inplace=True) #remove numbers

**Lemmatization**

In [5]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

news_corpus['Content'] = news_corpus.Content.apply(lemmatize_text)
news_corpus['Content'] = news_corpus['Content'].apply(', '.join) #convert list of words to a sentence
news_corpus.replace(',', '', regex=True, inplace=True) # remove commas
news_corpus

Unnamed: 0,Content
0,medium report about swedish bus manufacturer s...
1,access to covid vaccine cooperation on technol...
3,after severe criticism over not holding consul...
5,former congress president rahul gandhi on thur...
6,the enforcement directorate ha attached three ...
...,...
8483,over mughal era gold coin dating back to the e...
8484,china is planning to spend big in tibet a it n...
8485,the supreme court tuesday came out with a solu...
8486,indian american maju varghese who previously s...


# TF-IDF

In [6]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news_corpus.Content)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
#denselist = dense.tolist() #very expensive
df_vecs = pd.DataFrame(dense, columns=feature_names)
df_vecs

Unnamed: 0,___,aa,aabad,aadarsh,aadat,aadhaar,aadhar,aadhi,aadmi,aage,aah,aai,aaj,aajtak,aakash,aaksha,aam,aamir,aamk,aandolan,aandolanjivi,aane,aap,aapada,aapko,aapsu,aaravv,aarogya,aarohan,aarti,aasha,aashay,aashirwad,aasiya,aastha,aasu,aate,aatma,aatmanidbhar,aatmanirbhar,...,zilla,zillion,zimbabwe,zindabad,zindagi,zindagii,zinta,zip,ziyad,zmn,zoa,zojila,zolgensma,zomato,zombie,zonal,zone,zongqi,zoo,zoological,zoology,zoom,zoonotic,zoramthanga,zothankhuma,zoya,zptcs,zr,ztdrktlic,zte,zuali,zubair,zubin,zurbuchen,zurich,zuxxmlt,zwift,zyada,zycov,zydus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Cosine Similarity of documents**

In [8]:
pd.DataFrame(cosine_similarity(df_vecs))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593
0,1.000000,0.147324,0.139148,0.167852,0.090437,0.135713,0.136603,0.115569,0.159825,0.140653,0.085897,0.119441,0.074303,0.070751,0.096569,0.130708,0.126901,0.136447,0.111059,0.123579,0.183333,0.141878,0.161525,0.064698,0.179656,0.161806,0.095371,0.072338,0.092553,0.053254,0.150682,0.148900,0.146211,0.101240,0.102884,0.127528,0.172328,0.071197,0.132023,0.089694,...,0.019934,0.073042,0.050499,0.038389,0.056027,0.039649,0.040285,0.056811,0.052796,0.025022,0.072447,0.043371,0.041663,0.045965,0.024706,0.048294,0.085247,0.087563,0.095345,0.029460,0.042020,0.060456,0.027984,0.043121,0.045102,0.035471,0.059232,0.052966,0.059666,0.050956,0.047629,0.024459,0.087771,0.041589,0.069685,0.042418,0.091664,0.198145,0.073226,0.102734
1,0.147324,1.000000,0.184764,0.171659,0.133204,0.184621,0.207903,0.149453,0.207833,0.215219,0.114536,0.192144,0.114205,0.108438,0.123804,0.195049,0.281674,0.289697,0.167423,0.179612,0.251284,0.203846,0.213760,0.093721,0.251438,0.197520,0.141337,0.110737,0.114123,0.068943,0.191409,0.220227,0.198286,0.133751,0.179296,0.189156,0.238167,0.125724,0.143193,0.139838,...,0.049126,0.111551,0.084363,0.054711,0.117426,0.064915,0.061646,0.062963,0.086857,0.038380,0.071483,0.074914,0.070868,0.071084,0.043747,0.080815,0.122110,0.121788,0.120743,0.055823,0.061336,0.071526,0.037474,0.069128,0.077216,0.056302,0.074981,0.088166,0.086184,0.066999,0.079156,0.043185,0.136159,0.080652,0.134353,0.056035,0.139870,0.232831,0.128962,0.136051
2,0.139148,0.184764,1.000000,0.133305,0.120193,0.148798,0.161287,0.123372,0.159982,0.137795,0.083366,0.154378,0.087950,0.093422,0.089362,0.159291,0.181753,0.169996,0.138403,0.127714,0.189800,0.148137,0.155420,0.061008,0.188690,0.151885,0.110680,0.086942,0.083723,0.063415,0.148490,0.161385,0.143869,0.114068,0.132316,0.150988,0.173902,0.074944,0.119062,0.090774,...,0.026448,0.093380,0.055788,0.036708,0.066525,0.044992,0.041430,0.039376,0.067479,0.022235,0.095107,0.058411,0.043080,0.041121,0.033644,0.052533,0.089724,0.092936,0.084803,0.042555,0.054381,0.052030,0.046194,0.047194,0.057416,0.026627,0.062019,0.060802,0.059623,0.047339,0.060077,0.034150,0.111250,0.054072,0.114115,0.049810,0.098241,0.189905,0.071337,0.102552
3,0.167852,0.171659,0.133305,1.000000,0.084448,0.129935,0.114671,0.098613,0.197699,0.112960,0.078340,0.124111,0.059502,0.070784,0.129186,0.109626,0.133317,0.118244,0.091379,0.102448,0.187581,0.117864,0.133217,0.073244,0.139583,0.125829,0.081651,0.055189,0.091720,0.062801,0.132444,0.125816,0.120716,0.074755,0.124222,0.116350,0.136775,0.082367,0.092704,0.081593,...,0.034479,0.055697,0.036922,0.032294,0.103566,0.047512,0.033705,0.036686,0.051028,0.022754,0.055365,0.045340,0.033627,0.043444,0.024643,0.054455,0.052602,0.081401,0.060920,0.055124,0.039283,0.042255,0.041224,0.038688,0.042745,0.061851,0.042506,0.055952,0.041752,0.034423,0.041727,0.013173,0.073017,0.041991,0.073902,0.040212,0.068788,0.129865,0.062480,0.072706
4,0.090437,0.133204,0.120193,0.084448,1.000000,0.146401,0.121047,0.106380,0.138330,0.103118,0.058405,0.113907,0.085945,0.078902,0.075965,0.135913,0.120100,0.112084,0.111249,0.108325,0.150139,0.120982,0.170274,0.059638,0.154122,0.138460,0.093364,0.066147,0.069499,0.046825,0.135429,0.143552,0.135394,0.133059,0.101017,0.162087,0.151779,0.080479,0.092481,0.163384,...,0.019961,0.073463,0.040033,0.028629,0.050275,0.055929,0.053563,0.038827,0.050990,0.014529,0.054461,0.041390,0.033382,0.037954,0.036371,0.055846,0.084648,0.072595,0.083035,0.023739,0.042362,0.049276,0.019274,0.046016,0.039737,0.025772,0.046379,0.040358,0.049928,0.035512,0.045242,0.015950,0.096444,0.055016,0.076543,0.053981,0.069473,0.154634,0.063585,0.085205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.042418,0.056035,0.049810,0.040212,0.053981,0.064809,0.065680,0.050153,0.070223,0.049113,0.052659,0.054262,0.034683,0.033263,0.032559,0.056007,0.052913,0.068139,0.042442,0.049476,0.075291,0.074701,0.071339,0.038817,0.075649,0.075258,0.038292,0.038718,0.059374,0.029645,0.063249,0.056905,0.101899,0.033445,0.052131,0.065298,0.069698,0.055205,0.039239,0.089876,...,0.017856,0.039633,0.022566,0.043002,0.031733,0.023175,0.024772,0.024751,0.027485,0.020671,0.032881,0.020282,0.022572,0.028743,0.019044,0.036469,0.052879,0.032727,0.046827,0.008066,0.041172,0.032076,0.018908,0.021182,0.016314,0.031817,0.019230,0.019589,0.049857,0.055880,0.021200,0.020538,0.037661,0.042159,0.033664,1.000000,0.030468,0.072254,0.052648,0.051157
4590,0.091664,0.139870,0.098241,0.068788,0.069473,0.105683,0.093929,0.068942,0.106309,0.091916,0.062771,0.107963,0.054001,0.057852,0.046031,0.090498,0.120288,0.134196,0.076263,0.103853,0.114990,0.086268,0.118903,0.050849,0.116373,0.093094,0.075461,0.061551,0.075663,0.047560,0.093032,0.105624,0.105015,0.061862,0.118522,0.087635,0.115280,0.075117,0.067385,0.066126,...,0.026858,0.066897,0.046837,0.033231,0.053154,0.025087,0.035107,0.031449,0.037949,0.014857,0.052428,0.041165,0.028480,0.034725,0.027934,0.037689,0.059636,0.060972,0.061691,0.027524,0.046304,0.046242,0.020119,0.035910,0.050421,0.027038,0.034963,0.050536,0.051408,0.036711,0.039707,0.023760,0.074479,0.034140,0.079946,0.030468,1.000000,0.121703,0.070160,0.061452
4591,0.198145,0.232831,0.189905,0.129865,0.154634,0.244561,0.210777,0.172340,0.234884,0.191947,0.105977,0.195533,0.121996,0.126627,0.123269,0.261351,0.192213,0.203348,0.193681,0.192674,0.266855,0.217022,0.269048,0.098509,0.254401,0.217062,0.162843,0.151875,0.146363,0.079039,0.279152,0.253085,0.199955,0.199075,0.173379,0.256392,0.245404,0.121781,0.180380,0.137950,...,0.043581,0.133194,0.076894,0.063010,0.089937,0.058842,0.075139,0.063078,0.096683,0.037737,0.073052,0.074701,0.064316,0.068910,0.036479,0.092495,0.134864,0.128340,0.131170,0.055250,0.072132,0.080964,0.041656,0.077958,0.059895,0.048431,0.091416,0.096290,0.102168,0.073589,0.085498,0.042912,0.186872,0.065449,0.200082,0.072254,0.121703,1.000000,0.099816,0.156179
4592,0.073226,0.128962,0.071337,0.062480,0.063585,0.070832,0.086027,0.061474,0.097060,0.070898,0.038610,0.079425,0.070940,0.044949,0.090869,0.097384,0.077246,0.081090,0.069058,0.085073,0.108262,0.083908,0.109210,0.029611,0.139434,0.103745,0.069564,0.052293,0.043012,0.032904,0.114399,0.107598,0.095183,0.058758,0.068124,0.088449,0.105054,0.058074,0.060543,0.059158,...,0.019873,0.078829,0.031935,0.028325,0.064045,0.023565,0.028241,0.032056,0.052687,0.012455,0.034990,0.046214,0.021895,0.035386,0.019101,0.036781,0.063199,0.048781,0.054323,0.031229,0.070504,0.035240,0.027659,0.025995,0.027095,0.040746,0.030953,0.027544,0.038391,0.036329,0.050164,0.022957,0.061848,0.037088,0.049334,0.052648,0.070160,0.099816,1.000000,0.087339
