In [None]:
# default_exp models.tfidf

# TF-IDF Recommender
> Implementation of a TF-IDF content-based recommender model.

TF-IDF (Term Frequency — Inverse Document Frequency) measures how important a word is to one document relative to the whole collection of documents (the corpus). TF summarizes how often a given word appears within a document. IDF down-weights words that appear in many documents. Combining the two gives each word a weight that is high when the word is frequent in a document but rare across the corpus, and the recommender uses these weights to compare items by the similarity of their text.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
#export
class TFIDFRecommender:
    """
    Content-based recommender built on TF-IDF text similarity.

    TF-IDF (Term Frequency — Inverse Document Frequency) scores how important a
    word is to one document relative to the whole collection: TF counts how often
    the word appears within the document, IDF down-weights words that appear in
    many documents. Each item's text is turned into a TF-IDF vector and items are
    compared through the pairwise linear kernel of those vectors.

    Parameters
    ----------
    ngram_range, min_df, analyzer, stop_words
        Stored unchanged and forwarded to ``TfidfVectorizer`` in :meth:`fit`.
        Note that ``min_df=50`` asks for terms present in at least 50 documents,
        so small corpora need a smaller value.

    Attributes set by :meth:`fit`
    -----------------------------
    data : cleaned two-column frame (id, text) with a 0..n-1 index.
    ids, text : the id and text columns of ``data``.
    cosine_sim : n x n similarity matrix, row/column ``i`` is row ``i`` of ``data``.
    indices : dict mapping item id (str) -> row position in ``cosine_sim``.
    """
    def __init__(self, ngram_range=(1,2), min_df=50, analyzer='word', stop_words='english'):
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.analyzer = analyzer
        self.stop_words = stop_words

    def clean_data(self):
        """
        Reduce ``self.data`` to the id and text columns, drop rows with a missing
        id or text, cast both columns to ``str`` and renumber the rows 0..n-1.

        The renumbering matters: the similarity matrix is addressed by row
        position, so the frame's index must equal the position after rows have
        been dropped.
        """
        self.data = (self.data[[self.id_col, self.text_col]]
                     .dropna()
                     .astype('str')
                     .reset_index(drop=True))

    def fit(self, data, id_col='id', text_col='text'):
        """
        Vectorise the text of every item and pre-compute all pairwise similarities.

        Parameters
        ----------
        data : pandas.DataFrame containing at least ``id_col`` and ``text_col``.
            The caller's frame is not modified.
        id_col : name of the column holding item identifiers.
        text_col : name of the column holding the item text.
        """
        self.data = data
        self.id_col = id_col
        self.text_col = text_col
        self.clean_data()
        self.ids = self.data[self.id_col]
        self.text = self.data[self.text_col]
        tf = TfidfVectorizer(analyzer=self.analyzer,
                             ngram_range=self.ngram_range,
                             min_df=self.min_df,
                             stop_words=self.stop_words)
        tfidf_matrix = tf.fit_transform(self.text)
        # TfidfVectorizer L2-normalises each row by default, so the plain dot
        # product computed by linear_kernel is the cosine similarity.
        self.cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
        # Map each id to its row *position* in cosine_sim (not an index label).
        # When an id occurs more than once the last occurrence wins.
        self.indices = {item_id: pos for pos, item_id in enumerate(self.ids)}
    
    def _recommend(self, id):
        """
        Rank every fitted item (the query item included) by similarity to ``id``.

        Returns a DataFrame with columns ``[id_col, 'similarity', 'text']``,
        sorted by descending similarity and indexed by row position in the
        fitted data. Raises ``KeyError`` if ``id`` was not seen during fit.
        """
        idx = self.indices[id]
        scores = self.cosine_sim[idx]
        # sorted() is stable, so ties keep their original row order
        order = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
        return pd.DataFrame({self.id_col: self.ids.iloc[order].tolist(),
                             'similarity': [scores[pos] for pos in order],
                             'text': self.text.iloc[order].tolist()},
                            index=order)
        
    def recommend(self, id, topk=10):
        """
        Return the ``topk`` items most similar to ``id``, excluding ``id`` itself.

        Parameters
        ----------
        id : item identifier; converted with ``str()`` because ids are stored
            as strings after cleaning.
        topk : maximum number of recommendations to return.

        Returns
        -------
        pandas.DataFrame indexed by rank (1 = most similar) with columns
        ``id_col``, ``'similarity'``, ``'text'``, ``'total_text'`` (number of
        whitespace-separated tokens in the recommended item's text) and
        ``'%_common_text'`` (distinct tokens shared with the query item, as a
        percentage of ``total_text``; 0.0 when the text is empty).

        Raises
        ------
        KeyError if ``id`` was not seen during fit.
        """
        id = str(id)
        if id not in self.indices:
            raise KeyError(f"id {id!r} was not seen during fit")

        # tokens of the query item, taken from the same row whose similarities are used
        target_tokens = set(self.text.iloc[self.indices[id]].split())

        rec_df = self._recommend(id).dropna()
        # Remove the query item explicitly rather than assuming it is ranked
        # first: ties (duplicate texts) or an all-zero vector can put another
        # item ahead of it.
        rec_df = rec_df[rec_df[self.id_col] != id]
        rec_df = rec_df.iloc[:max(topk, 0)]

        # text statistics are only needed for the rows that are returned
        token_lists = [text.split() for text in rec_df['text']]
        rec_df = rec_df.assign(**{
            'total_text': [len(tokens) for tokens in token_lists],
            '%_common_text': [
                (len(target_tokens.intersection(tokens)) / len(tokens)) * 100 if tokens else 0.0
                for tokens in token_lists
            ],
        })
        # label rows by rank, starting at 1
        rec_df.index = pd.RangeIndex(1, len(rec_df) + 1)
        return rec_df

Example

In [None]:
# Download the Goodreads books file (books_combined.csv) into the working directory.
!wget -q --show-progress https://github.com/RecoHut-Datasets/goodreads/raw/v3/books_combined.csv



In [None]:
# Load only the two columns used below: the ISBN (item id) and the
# space-separated tag names (item text).
data = pd.read_csv('books_combined.csv', usecols=['isbn','tag_name'])
data.head()

Unnamed: 0,isbn,tag_name
0,439023483,to-read fantasy favorites currently-reading yo...
1,439554934,to-read fantasy favorites currently-reading yo...
2,316015849,to-read fantasy favorites currently-reading yo...
3,61120081,to-read favorites currently-reading young-adul...
4,743273567,to-read favorites currently-reading young-adul...


In [None]:
# Fit with the default vectorizer settings, using ISBNs as item ids and the tag strings as text.
model = TFIDFRecommender()
model.fit(data, id_col='isbn', text_col='tag_name')

In [None]:
# Five books most similar to ISBN 316015849; an integer id works because recommend() converts it with str().
model.recommend(id=316015849, topk=5)

Unnamed: 0,isbn,similarity,text,total_text,%_common_text
1,316160199,0.836174,to-read fantasy favorites currently-reading yo...,100,86.0
2,316160202,0.832712,to-read fantasy favorites currently-reading yo...,100,86.0
3,739352350,0.756397,to-read fantasy favorites currently-reading yo...,100,76.0
4,006114097X,0.61228,to-read fantasy favorites currently-reading yo...,100,62.0
5,316043133,0.598861,to-read fantasy favorites currently-reading yo...,100,59.0


In [None]:
# Download the Hacker News file (HackerNews.csv) into the working directory.
!wget -q --show-progress https://github.com/RecoHut-Datasets/hackernews/raw/v1/HackerNews.csv



In [None]:
# Load every column; only 'id' and 'text' are passed to the recommender below.
df = pd.read_csv('HackerNews.csv')
df.head()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,Ask HN: How Apple encrypt iCloud photos?,,"Here, at HN, we all care about privacy.<p>Appl...",,ex3ndr,5,1511298567,2017-11-21 21:09:27+00:00,story,15751571,,0.0,,
1,Ask HN: Best consumer EEG hardware and software?,,What&#x27;s the best consumer level EEG hardwa...,,hanniabu,5,1486834334,2017-02-11 17:32:14+00:00,story,13623602,,3.0,,
2,Ask HN: Devs: How did you go from writing just...,,Short background: I’ve been a developer for 3 ...,,krptos,5,1483362724,2017-01-02 13:12:04+00:00,story,13300955,,4.0,,
3,Ask HN: Are exceptional programmers highly opi...,,It seems to me that many respected programmers...,,afco,5,1486402505,2017-02-06 17:35:05+00:00,story,13581818,,3.0,,
4,Ask HN: Does your organization use SecDevOps?,,What approach do you take for data and infrast...,,mngutterman,5,1487086924,2017-02-14 15:42:04+00:00,story,13644257,,5.0,,


In [None]:
# Second example: a fresh recommender fitted on the Hacker News post bodies.
# NOTE: this rebinds `model`, replacing the Goodreads model fitted earlier.
model = TFIDFRecommender()
model.fit(df, id_col='id', text_col='text')

In [None]:
# Five posts whose text is most similar to post 13581818.
model.recommend(id='13581818', topk=5)

Unnamed: 0,id,similarity,text,total_text,%_common_text
1,15954634,0.521641,This is as divisive a movie as I&#x27;ve ever ...,26,11.538462
2,13567816,0.451331,I&#x27;m working a side project that would pro...,47,12.765957
3,13755673,0.41378,I&#x27;m getting<p>{\n &quot;errorCode&quot; ...,16,12.5
4,15280107,0.408286,Just wondering on what people think about this...,52,13.461538
5,13937289,0.391885,"Perhaps a trivial question, but I think it&#x2...",82,13.414634


In [None]:
#hide
# Record author, timestamp, machine details and package versions so the run can be reproduced.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-29 18:22:00

recohut: 0.0.12

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas    : 1.1.5
IPython   : 5.5.0
numpy     : 1.19.5
PIL       : 7.1.2
matplotlib: 3.2.2

