## Recommender

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the spreadsheet
# read in the next cell (which lives under /content/drive/MyDrive) is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the spreadsheet of cleaned articles from the mounted drive
source_path = '/content/drive/MyDrive/Punch Project/The Punch Cleaned File BackUp.xlsx'
data = pd.read_excel(source_path)

# Vectorizer configured with max_features=2000
tfidf = TfidfVectorizer(max_features=2000)

# Fit the vectorizer on the 'CLEANED DATA' column and transform it into
# a matrix with one TF-IDF row per row of `data`
documents = data['CLEANED DATA']
tfidf_matrix = tfidf.fit_transform(documents)

# Show the matrix as a dense array
print(tfidf_matrix.toarray())


[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.11067622 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


Cosine Similarity

In [4]:
# Pairwise cosine similarity between every pair of articles.
# Called with a single matrix, cosine_similarity compares its rows against themselves,
# so entry [i, j] is the similarity between article i and article j.
cosine_sim = cosine_similarity(tfidf_matrix)

print(cosine_sim)

[[1.         0.25958525 0.11983868 ... 0.06032417 0.06085939 0.02298239]
 [0.25958525 1.         0.105112   ... 0.06218471 0.05805345 0.05536638]
 [0.11983868 0.105112   1.         ... 0.09820773 0.08033982 0.05735282]
 ...
 [0.06032417 0.06218471 0.09820773 ... 1.         0.05524081 0.05649724]
 [0.06085939 0.05805345 0.08033982 ... 0.05524081 1.         0.11071747]
 [0.02298239 0.05536638 0.05735282 ... 0.05649724 0.11071747 1.        ]]


In [5]:
# Map article links to their indices(index)
# Lookup Series: the index is the article URL and the value is that row's label in `data.index`.
indices = pd.Series(data.index, index=data['URL'])
# Bare last expression so the notebook displays the mapping.
indices

URL
https://punchng.com/presidential-inauguration-obi-didnt-call-for-boycott-postponement-lp/                                 0
https://punchng.com/lets-reclaim-pdp-lost-glory-atiku-tasks-party-members/                                                1
https://punchng.com/tinubull-be-fair-to-all-ex-lawmaker/                                                                  2
https://punchng.com/just-in-atiku-obaseki-attend-pdps-reception-for-new-returning-governors/                              3
https://punchng.com/im-the-best-candidate-for-senate-president-osita-izunaso-insists/                                     4
                                                                                                                       ... 
https://healthwise.punchng.com/lessons-from-covid-19-should-stimulate-fg-stakeholders-to-fund-niprd-dg/                 580
https://healthwise.punchng.com/nutrition-experts-task-govt-on-food-fortification-enforcement/                           581
http

In [6]:
# Map article links to their indices(index)
# NOTE: this rebuilds exactly the same Series as the previous cell
# (index = article URL, value = that row's label in `data.index`).
indices = pd.Series(data.index, index=data['URL'])

def get_recommendation(link, cosine_sim, indices, top_n=10):
  """Return the links of the articles most similar to ``link``.

  Parameters
  ----------
  link : str
      URL of the article to find recommendations for. It must appear in
      ``indices.index``.
  cosine_sim : 2-D array-like
      Square similarity matrix; ``cosine_sim[i][j]`` is the similarity between
      the articles at row positions ``i`` and ``j``.
  indices : pandas.Series
      Series whose index holds the article URLs, in the same row order as
      ``cosine_sim``, and whose values are the corresponding row labels.
  top_n : int, default 10
      Maximum number of recommendations to return.

  Returns
  -------
  pandas.Series
      URLs of the most similar articles, best match first. The Series is
      indexed by the row labels stored in ``indices`` and named after
      ``indices.index``.

  Raises
  ------
  KeyError
      If ``link`` is not present in ``indices``.
  ValueError
      If ``top_n`` is negative.

  Notes
  -----
  The selected ``(row position, similarity)`` pairs are printed before the
  result is returned.
  """
  if top_n < 0:
    raise ValueError("top_n must be non-negative")

  # Look the link up by *position*: rows of cosine_sim are positional, so using the
  # position stays correct even when the stored row labels are not 0..n-1.
  # get_indexer_for also copes with a URL that occurs more than once
  # (it returns every matching position); the first occurrence is used.
  positions = indices.index.get_indexer_for([link])
  if positions[0] == -1:
    raise KeyError(link)
  idx = int(positions[0])

  # Create a list of tuples where the first element is the row position of an article and
  # the second element is its cosine similarity with the query article.
  # The query article is excluded explicitly instead of assuming it sorts first:
  # an exact duplicate (similarity 1) in an earlier row, or an all-zero query vector,
  # would otherwise push the query itself into the recommendations.
  sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

  # Sort by the similarity (second tuple element), highest first.
  # sorted() is stable, so ties keep their original row order.
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  # Keep only the best matches
  sim_scores = sim_scores[:top_n]
  print(sim_scores)

  # Get the article row positions
  article_indices = [i[0] for i in sim_scores]

  # Map the row positions back to links using the `indices` argument itself,
  # rather than reaching for a global DataFrame.
  selected = indices.iloc[article_indices]
  top_10 = pd.Series(selected.index.to_numpy(), index=selected.to_numpy(), name=selected.index.name)

  return top_10

In [7]:
# Example query: look up recommendations for this article link.
url = 'https://punchng.com/presidential-inauguration-obi-didnt-call-for-boycott-postponement-lp/'
# Bare last expression, so the returned Series is shown as the cell output.
get_recommendation(url, cosine_sim, indices)

[(5, 0.704063136767388), (18, 0.4998289112652835), (86, 0.48325695576521577), (10, 0.47454218759559896), (79, 0.41493159203594066), (50, 0.3820443131503972), (65, 0.3557915350612729), (53, 0.34602956820197267), (26, 0.3006930720509954), (64, 0.2948958252809115)]


5     https://punchng.com/lp-faction-wants-tinubu-sw...
18    https://punchng.com/breaking-court-adjourns-lp...
86    https://punchng.com/tinubus-victory-legal-batt...
10    https://punchng.com/obi-lp-back-atiku-on-reque...
79    https://punchng.com/prosecute-interim-govt-adv...
50    https://punchng.com/post-election-crises-and-r...
65    https://punchng.com/may-29-handover-again-mili...
53    https://punchng.com/part-time-legislature-not-...
26    https://punchng.com/presidential-poll-tribunal...
64    https://punchng.com/onaiyekan-spoke-truth-to-p...
Name: URL, dtype: object