<a href="https://colab.research.google.com/github/sarvesh237/NewsRecommenderIDC401/blob/master/NewsRecommenderAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [None]:
import pandas as pd
import numpy as np

#for lemmatization
import nltk
nltk.download("wordnet")

#for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

#gaussian mixture
from sklearn.mixture import GaussianMixture

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Importing the collected data**

In [None]:
# Each team member's scraped articles live in a separate CSV in the project repo.
# The URLs are listed once and read in order, one DataFrame per file.
csv_urls = [
    "https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/agrima_news_data.csv",
    "https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/sarvesh_news_data.csv",
    "https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/satender_news_data.csv",
    "https://github.com/sarvesh237/NewsRecommenderIDC401/raw/master/vishal_news_data.csv",
]
df_1, df_2, df_3, df_4 = [pd.read_csv(url) for url in csv_urls]

# **Preprocessing the CSV files.**

In [None]:
# Drop the columns that are not needed downstream ('Unnamed: 0' everywhere,
# plus 'index' in df_2 and 'Title' in df_3) and rename df_2's 'content'
# column to 'Content' so all four frames share the same text column name.
df_1 = df_1.drop(columns=['Unnamed: 0'])
df_2 = (
    df_2
    .drop(columns=['index', 'Unnamed: 0'])
    .rename(columns={'content': 'Content'})
)
df_3 = df_3.drop(columns=['Unnamed: 0', 'Title'])
df_4 = df_4.drop(columns=['Unnamed: 0'])

**Remove punctuation, capitalization, and numbers; drop rows containing non-ASCII (non-English) characters, empty rows, and duplicates.**

In [None]:
# Combine the four per-collector frames into a single corpus and clean the text.
news_corpus = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)
news_corpus = news_corpus.dropna()  # drop rows with missing values
news_corpus = news_corpus[news_corpus['Content'] != '']  # drop empty articles
news_corpus = news_corpus.drop_duplicates()

# Replace every character that is neither a word character nor whitespace with a space.
# regex=True is required: since pandas 2.0 Series.str.replace defaults to a *literal*
# match, which would silently leave all punctuation in place. The raw string avoids
# Python's invalid-escape warning for "\w" / "\s".
news_corpus['Content'] = news_corpus['Content'].str.replace(r'[^\w\s]', ' ', regex=True)

# Keep only rows whose text is pure ASCII (crude non-English filter).
# .copy() makes the filtered frame independent so the assignments below do not
# trigger SettingWithCopyWarning.
is_ascii = news_corpus['Content'].map(lambda text: text.isascii())
news_corpus = news_corpus[is_ascii].copy()

news_corpus['Content'] = news_corpus['Content'].str.lower()  # convert to lowercase
news_corpus = news_corpus.replace(r'\d+', '', regex=True)  # remove numbers

# The filters above leave gaps in the index (0, 1, 3, 5, ...). Renumber the rows
# 0..n-1 so that label-based alignment with positionally indexed objects built
# from this column (e.g. DataFrames created from the TF-IDF/SVD arrays) is correct.
news_corpus = news_corpus.reset_index(drop=True)

**Lemmatization**

In [None]:
# Tokenizer and lemmatizer are created once and reused for every article.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Split ``text`` on whitespace and return the list of lemmatized tokens."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


# Lemmatize every article, then glue the tokens back into a single string.
token_lists = news_corpus['Content'].map(lemmatize_text)
news_corpus['Content'] = token_lists.map(', '.join)  # list of words -> one string
news_corpus = news_corpus.replace(',', '', regex=True)  # strip the joining commas
news_corpus

Unnamed: 0,Content
0,medium report about swedish bus manufacturer s...
1,access to covid vaccine cooperation on technol...
3,after severe criticism over not holding consul...
5,former congress president rahul gandhi on thur...
6,the enforcement directorate ha attached three ...
...,...
8483,over mughal era gold coin dating back to the e...
8484,china is planning to spend big in tibet a it n...
8485,the supreme court tuesday came out with a solu...
8486,indian american maju varghese who previously s...


# TF-IDF

In [None]:
# Fit TF-IDF on the cleaned articles. `vectors` is the sparse document-term matrix
# (one row per article, one column per vocabulary term).
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news_corpus.Content)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back for older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# toarray() yields a plain ndarray; todense() returns the discouraged np.matrix type,
# which recent scikit-learn estimators refuse as input.
# NOTE: this materialises the full docs x vocabulary matrix in memory.
dense = vectors.toarray()
df_vecs = pd.DataFrame(dense, columns=feature_names)
df_vecs

Unnamed: 0,___,aa,aabad,aadarsh,aadat,aadhaar,aadhar,aadhi,aadmi,aage,aah,aai,aaj,aajtak,aakash,aaksha,aam,aamir,aamk,aandolan,aandolanjivi,aane,aap,aapada,aapko,aapsu,aaravv,aarogya,aarohan,aarti,aasha,aashay,aashirwad,aasiya,aastha,aasu,aate,aatma,aatmanidbhar,aatmanirbhar,...,zilla,zillion,zimbabwe,zindabad,zindagi,zindagii,zinta,zip,ziyad,zmn,zoa,zojila,zolgensma,zomato,zombie,zonal,zone,zongqi,zoo,zoological,zoology,zoom,zoonotic,zoramthanga,zothankhuma,zoya,zptcs,zr,ztdrktlic,zte,zuali,zubair,zubin,zurbuchen,zurich,zuxxmlt,zwift,zyada,zycov,zydus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Cosine Similarity of documents**

In [None]:
# Pairwise cosine similarity between all documents (n_docs x n_docs).
# The sparse TF-IDF matrix is passed directly: cosine_similarity accepts sparse
# input and returns a dense array by default, so the result is the same as with
# the dense DataFrame but without converting the huge dense matrix a second time.
cos_sim = pd.DataFrame(cosine_similarity(vectors))

**Dimensionality Reduction using TruncatedSVD**

In [None]:
from sklearn.decomposition import TruncatedSVD as tsvd # TruncatedSVD

# Exploratory decomposition with 50 latent factors, used only to inspect how fast
# the singular values decay. TruncatedSVD's default solver is randomized, so
# random_state is fixed to make the numbers reproducible across runs.
sv_dec = tsvd(n_components = 50, random_state = 42)  # components(no. of latent factors) < n ; n : no. of docs.
ls_analysis = sv_dec.fit_transform(vectors.T)  # term-by-document matrix -> terms x 50

sv_dec.singular_values_ # singular values of the decomposition (in decreasing order)

array([23.845926  ,  7.72855992,  6.19650048,  6.0402855 ,  5.69620242,
        5.10092136,  5.02596021,  4.56020622,  4.36979673,  4.19021189,
        4.10501923,  3.9713136 ,  3.86764193,  3.82358631,  3.76936822,
        3.71941179,  3.65870482,  3.55604941,  3.53027157,  3.46103935,
        3.40020566,  3.37232255,  3.29557363,  3.24682417,  3.20876414,
        3.18881019,  3.13760082,  3.08919705,  3.08294451,  3.05649451,
        2.99846875,  2.96932092,  2.95135255,  2.89955533,  2.84277165,
        2.83026494,  2.81577672,  2.80253247,  2.76154598,  2.73305338,
        2.71702103,  2.68126532,  2.67729425,  2.62662137,  2.61322548,
        2.56655838,  2.54617147,  2.54098196,  2.51817098,  2.51025437])

In [None]:
# Share of the retained components' total "energy" carried by each component.
# Variance scales with the *square* of a singular value (the eigenvalues of
# X^T X are s**2), so the squared values are normalised rather than the raw ones.
# The shares are relative to the fitted components only and sum to 1.
squared_singular_values = sv_dec.singular_values_ ** 2
variance = squared_singular_values / squared_singular_values.sum()
variance

array([0.12110806, 0.03925161, 0.03147062, 0.03067724, 0.02892972,
       0.02590643, 0.02552571, 0.02316026, 0.02219321, 0.02128114,
       0.02084846, 0.0201694 , 0.01964288, 0.01941913, 0.01914377,
       0.01889005, 0.01858173, 0.01806037, 0.01792945, 0.01757784,
       0.01726888, 0.01712726, 0.01673747, 0.01648989, 0.01629659,
       0.01619525, 0.01593516, 0.01568933, 0.01565758, 0.01552324,
       0.01522854, 0.01508051, 0.01498925, 0.01472619, 0.01443779,
       0.01437428, 0.01430069, 0.01423343, 0.01402527, 0.01388056,
       0.01379914, 0.01361754, 0.01359737, 0.01334002, 0.01327198,
       0.01303497, 0.01293143, 0.01290507, 0.01278922, 0.01274901])

In [None]:
# Final reduction: project every document onto 25 latent factors.
# random_state pins TruncatedSVD's randomized solver so `las` is identical on
# every Restart & Run All.
sv_dec = tsvd(n_components = 25, random_state = 42) # components with accepted variance
las = sv_dec.fit_transform(vectors)  # one row per document, one column per latent factor

print(las,las.shape)

[[ 0.31824256 -0.04775671 -0.02696336 ... -0.01134346 -0.00664683
   0.03182952]
 [ 0.48084741 -0.02173952  0.04279292 ... -0.07913612  0.02443928
   0.03063587]
 [ 0.35765976 -0.03484471 -0.03269956 ... -0.01545872 -0.08946901
   0.23707331]
 ...
 [ 0.48546903 -0.06347321 -0.09713542 ...  0.00996699 -0.04344497
  -0.00835248]
 [ 0.20792629 -0.03555445 -0.01013738 ...  0.01521086 -0.02536889
  -0.03673931]
 [ 0.30945151 -0.05680526  0.1017254  ... -0.03049174 -0.00094738
  -0.01009277]] (4594, 25)


In [None]:
# Representation of each doc in terms of latent topics after dimensionality reduction
col = ["topic_{}".format(i) for i in range(las.shape[1])]

topic_df = pd.DataFrame(las, columns = col)

# Attach the article text by POSITION, not by index label. news_corpus keeps the
# gappy index left by the earlier row filtering (0, 1, 3, 5, ...), while topic_df
# is numbered 0..n-1; assigning the Series directly aligns on labels, which yields
# NaN for the missing labels and pairs the remaining texts with the wrong topic rows.
# to_numpy() drops the index, and the assignment raises if the lengths differ.
topic_df["Docs"] = news_corpus["Content"].to_numpy()

# Show the text column first, followed by the topic weights.
l = ["Docs"] + col

display(topic_df[l])


Unnamed: 0,Docs,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,medium report about swedish bus manufacturer s...,0.318243,-0.047757,-0.026963,0.029785,-0.050186,-0.021544,0.014351,0.028829,0.033227,-0.010598,0.032632,-0.014416,-0.015910,-0.027256,-0.022089,0.028747,-0.052540,-0.014068,-0.017795,0.018179,-0.036943,-0.003575,-0.011343,-0.006647,0.031830
1,access to covid vaccine cooperation on technol...,0.480847,-0.021740,0.042793,0.127056,-0.048243,-0.109129,-0.013917,-0.001361,-0.013216,-0.073437,0.073325,-0.039723,-0.046461,0.098654,0.014748,-0.097240,-0.049448,-0.030823,0.047112,0.012964,0.004154,0.002037,-0.079136,0.024439,0.030636
2,,0.357660,-0.034845,-0.032700,0.043747,-0.028094,-0.040252,0.060718,0.004928,-0.017431,-0.001154,0.012160,-0.014998,-0.027921,0.030406,0.013864,0.061594,-0.033733,-0.055028,-0.029967,-0.009042,-0.002928,0.025556,-0.015459,-0.089469,0.237073
3,after severe criticism over not holding consul...,0.282938,-0.012479,0.012869,0.021490,-0.036654,-0.041669,-0.003599,0.047104,0.044301,-0.026082,0.051033,-0.036596,0.024945,0.095621,-0.079491,-0.008158,-0.033992,0.007400,-0.021276,-0.005089,-0.044011,0.020767,-0.004803,-0.062571,0.090573
4,,0.297353,-0.042550,-0.093085,-0.033139,-0.087247,0.034809,0.014512,-0.065095,0.010613,-0.002285,0.031443,-0.040712,0.038464,-0.044652,-0.098990,0.026255,-0.084424,-0.000170,-0.008129,-0.077946,-0.064585,-0.008610,0.020743,0.028020,-0.008150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4589,new delhi india february ani the delhi high co...,0.147760,-0.015669,-0.038010,-0.025006,-0.057806,0.042946,-0.086339,-0.023846,0.036056,-0.008417,0.026478,-0.005322,0.041252,-0.021018,0.005164,0.029506,-0.034642,-0.006627,-0.010078,0.001947,0.001615,-0.006779,-0.022476,-0.024291,-0.014518
4590,darjeeling west bengal india february ani unio...,0.248173,0.006348,-0.002785,-0.001993,-0.020299,-0.089167,-0.023737,-0.016161,-0.028619,-0.022896,-0.011031,-0.037516,-0.046360,-0.015089,0.004454,-0.027411,-0.023251,-0.030859,0.033698,0.016540,-0.018446,-0.019180,-0.033124,0.003004,-0.001837
4591,new delhi india february ani the second phase ...,0.485469,-0.063473,-0.097135,0.011161,-0.079622,0.002891,0.163660,-0.022623,-0.014435,0.012394,0.011179,-0.021901,-0.014549,-0.018421,-0.009762,0.010709,0.004182,0.012183,0.025130,0.045171,0.003420,-0.015591,0.009967,-0.043445,-0.008352
4592,anantnag jammu and kashmir india february ani ...,0.207926,-0.035554,-0.010137,0.014522,-0.028177,-0.035569,-0.001296,-0.064796,0.036417,0.004680,0.052034,0.005750,-0.032616,0.078083,0.049317,0.011721,-0.033267,-0.022928,0.026534,0.030957,-0.037169,-0.017654,0.015211,-0.025369,-0.036739
