# TF-IDF based Recommender System

### Recommender System based on tf-idf as vector representation of documents

## 1. Importing Libraries

Importing the libraries used for reading the input file, calculating tf-idf vectors, and calculating cosine similarity

In [None]:
import pandas as pd
import pickle as pk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.stem.snowball import SnowballStemmer
import nltk
import numpy
# Shared English Snowball stemmer; tokenizer_func uses it to stem every token.
stemmer = SnowballStemmer("english")

## 2. Reading input dataset

Reading the csv file to get the Article id, Title and News Content

In [None]:
# Path of the CSV dataset passed to pd.read_csv (relative to the working directory).
INPUT_FILE_PATH = 'IBTdataset1.csv'

In [2]:
# Load the raw dataset into a DataFrame.
inputfile = pd.read_csv(INPUT_FILE_PATH)

In [6]:
# Keep only the id, title and body columns, then discard every row that has
# a missing value in any of them.
selected_columns = ['Article_Id','Title','Content']
inputfile1 = inputfile.loc[:, selected_columns].dropna()

Unnamed: 0,Article_Id,Title,Author,Date,Content,URL
0,0,14 dead after bus falls into canal in Telangan...,Devyani Sultania,"August 22, 2016 12:34 IST",At least 14 people died and 17 others were inj...,http://www.ibtimes.co.in/14-dead-after-bus-fal...
1,1,Pratibha Tiwari molested on busy road Saath ...,Suparno Sarkar,"August 22, 2016 19:47 IST",TV actress Pratibha Tiwari who is best known ...,
2,2,US South Korea begin joint military drill ami...,Namrata Tripathi,"August 22, 2016 18:10 IST",The United States and South Korea began a join...,http://www.ibtimes.co.in/us-south-korea-begin-...
3,3,Illegal construction in Bengaluru Will my hou...,S V Krishnamachari,"August 22, 2016 17:39 IST",The relentless drive by Bengaluru s Bangalore...,http://www.ibtimes.co.in/illegal-construction-...
4,4,Punjab Gau Rakshak Dal chief held for assaulti...,Pranshu Rathee,"August 22, 2016 17:34 IST",Punjab Gau Raksha Dal chief Satish Kumar and h...,http://www.ibtimes.co.in/punjab-gau-rakshak-da...
5,5,Phillipines drug war 1 800 drug-related death...,Pranshu Rathee,"August 22, 2016 14:51 IST",Philippines police on Monday said that the num...,
6,6,Infosys shares likely to fall on Tuesday after...,S V Krishnamachari,"August 15, 2016 16:03 IST",Infosys shares could fall sharply on Tuesday w...,http://www.ibtimes.co.in/infosys-shares-likely...
7,7,Dialogue crucial in finding permanent solution...,Pranshu Rathee,"August 22, 2016 21:11 IST",Prime Minister Narendra Modi has expressed dee...,
8,8,School bus overturns in Jammu killing 1 and in...,Devyani Sultania,"August 10, 2016 11:51 IST",A school bus in the town of Bishnah in Jammu s...,http://www.ibtimes.co.in/school-bus-overturns-...
9,9,Rajasthan Villagers rescue 50 kids after scho...,Devyani Sultania,"August 8, 2016 14:45 IST",A school bus carrying around 50 children fell ...,http://www.ibtimes.co.in/rajasthan-villagers-r...


In [None]:
# Article bodies as a plain Python list, in the row order of inputfile1.
content_column = inputfile1.loc[:, 'Content']
artlist = content_column.tolist()

In [8]:
#Saving the list of articles as pickle file
# The context manager closes the file handle, so the pickle is fully flushed
# to disk before a later cell reads it back.
with open("artlist.pkl", "wb") as artlist_file:
    pk.dump(artlist, artlist_file)

## 3. Data preprocessing

Data preprocessing includes cleaning the data (removing punctuation and stopwords), tokenising, stemming, and vectorising the articles

In [13]:
def tokenizer_func(doc):
    """Clean, tokenize and stem a single document.

    Parameters
    ----------
    doc : str
        Raw text of one article.

    Returns
    -------
    list
        The stemmed tokens of ``doc``, in order of appearance.
    """
    # Removing punctuation: every character that is not a word character,
    # whitespace or a hyphen is replaced by a space. The pattern is a raw
    # string because '\w' and '\s' are not valid Python string escapes.
    cleaned = re.sub(r'[^\w_\s-]', ' ', doc)
    # Tokenising
    tokens = nltk.word_tokenize(cleaned)
    # Stemming with the module-level Snowball stemmer
    return [stemmer.stem(token) for token in tokens]

In [14]:
def vectorizing(documents):
    """Fit a tf-idf model on ``documents`` and return the transformed matrix.

    Tokens come from ``tokenizer_func``; English stop words are removed and
    terms that appear in fewer than two documents are ignored (``min_df=2``).

    NOTE(review): the stop-word list is presumably compared against the
    already-stemmed tokens, so some stop words may slip through -- confirm
    against the scikit-learn documentation.
    """
    tfidf_model = TfidfVectorizer(
        tokenizer=tokenizer_func,
        stop_words='english',
        min_df=2,
    )
    return tfidf_model.fit_transform(documents)

In [15]:
def user_articles(article_ids,doc):
    """Return the documents plus one extra "user profile" document.

    The profile is the text of all read articles joined with single spaces
    and is placed last in the returned list.

    Parameters
    ----------
    article_ids : iterable
        Positions (ints, or values convertible with ``int``) in ``doc`` of
        the articles the user has read.
    doc : list of str
        All article texts.

    Returns
    -------
    list of str
        A new list: every element of ``doc`` followed by the profile text.
        ``doc`` itself is left untouched, so calling this repeatedly with
        the same inputs always gives the same result.

    Raises
    ------
    IndexError
        If a position is outside ``doc``.
    """
    profile_text = ' '.join(doc[int(i)] for i in article_ids)
    # Build a fresh list instead of appending to the caller's list; an
    # in-place append would add another profile on every call.
    return list(doc) + [profile_text]

## 4. Similarity match

Calculating the cosine similarity between the combined text of the articles already read and every article, then keeping the most similar unread articles

In [16]:
def similarity(articles_id,tf_idf):
    """Return the positions of the five articles most similar to the user profile.

    Parameters
    ----------
    articles_id : iterable
        Positions (ints, or values convertible with ``int``) of the articles
        the user has already read; these are never recommended.
    tf_idf : matrix
        Tf-idf matrix whose LAST row is the user profile and whose other
        rows are the individual articles.

    Returns
    -------
    list
        Up to five row positions of ``tf_idf``, most similar first.
    """
    profile_position = tf_idf.shape[0] - 1
    # cosine_similarity returns a (1, n_rows) array; take its only row.
    cs_matrix = cosine_similarity(tf_idf[-1], tf_idf)
    scores = cs_matrix[0]
    # Row positions ordered from highest to lowest similarity.
    ranked_positions = scores.argsort()[::-1]
    # Normalise to ints so ids passed as strings are filtered as well.
    read_positions = set(int(i) for i in articles_id)
    # Skip the profile row itself (it always matches itself with score 1.0)
    # and every article that has already been read.
    final_recommended_articles_id = [
        art_id for art_id in ranked_positions
        if art_id != profile_position and art_id not in read_positions
    ][:5]
    return final_recommended_articles_id

In [18]:
#Reading the documents
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# comes from a trusted source.
# Read-only mode is enough here, and the context manager closes the handle.
with open('artlist.pkl', 'rb') as artlist_file:
    documents = pk.load(artlist_file)
# Positions (in documents) of the articles the user has already read
articles_ids = [2,3]

In [None]:
# Build the corpus used for matching: the article texts followed by one extra
# document made of the read articles joined together (the user profile).
new_art = user_articles(articles_ids,documents)
# Tf-idf matrix of that corpus; the profile document is its last row.
tf_idf = vectorizing(new_art)
# Row positions of the articles most similar to the profile, excluding the
# articles listed in articles_ids.
recommendations = similarity(articles_ids,tf_idf)

## 5. Results

In [None]:
#Recommended Articles and their title
# articles_ids and recommendations hold row POSITIONS in the article list
# built from inputfile1, so titles are looked up by position (iloc) rather
# than by Article_Id value. This also keeps the recommendations in ranking
# order. Positions outside inputfile1 are skipped.
n_articles = len(inputfile1)
read_positions = [int(i) for i in articles_ids if int(i) < n_articles]
recommended_positions = [int(i) for i in recommendations if int(i) < n_articles]
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print('Articles Read')
print(inputfile1.iloc[read_positions]['Title'])
print('\n')
print('Recommender ')
print(inputfile1.iloc[recommended_positions]['Title'])