In [1]:
import pandas as pd 
import numpy as np

### Task as Follows:
You have to build a Recommendation System, recommending the items based on the following.
1. Content Based Filtering
2. Collaborative Filtering

End result should be a system that.:

1. Recommend posts for the given user.
2. Recommend similar posts for the given post

### Dataset Details

There are 3 files.

- users.csv Users dataset containing user's details like name, id, gender etc.
- posts.csv Post dataset containing posts details like title category etc.
- views.csv Views dataset contains the mapping which user views which post(s)

Users:
- _id: a unique alphanumeric id of the user (string)
- name: Name of user (string)
- gender: Gender of user (male | female)
- academics: Education of the use (undergraduate | graduate)

Posts:
- _id: a unique alphanumeric id of the post (string)
- title: Title of the post (string)
- category: Category of the post (string)
- post_type: Type of the post (blog | artwork | skill | project)

Views:
- user_id : a unique alphanumeric id of the user (string)
- post_id : a unique alphanumeric id of the post (string)
- time stamp: timestamp of when user viewed the post (ISO time format)

In [2]:
users = pd.read_csv('users.csv')
posts = pd.read_csv('posts.csv')
views = pd.read_csv('views.csv')

In [3]:
users.describe()

Unnamed: 0,_id,name,gender,academics
count,118,118,118,118
unique,118,118,3,3
top,5e5dfbbefbc8805f69e02c91,Nimish Sharma,male,undergraduate
freq,1,1,72,68


In [4]:
posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   _id         493 non-null    object
 1   title       493 non-null    object
 2   category    465 non-null    object
 3    post_type  493 non-null    object
dtypes: object(4)
memory usage: 15.5+ KB


In [5]:
posts.describe()

Unnamed: 0,_id,title,category,post_type
count,493,493,465,493
unique,493,477,231,4
top,5e4d359cf5561b1994c8e424,PENCIL RENDERING,Photography,artwork
freq,1,3,81,241


In [6]:
posts['category'] = posts['category'].fillna('')

In [7]:
posts[' post_type'].unique()

array(['blog', 'artwork', 'project', 'skill'], dtype=object)

In [8]:
views.describe()

Unnamed: 0,user_id,post_id,timestamp
count,1449,1449,1449
unique,118,495,1449
top,5d60098a653a331687083238,5ec1fd0974f7660d73aa0fd5,2020-05-18T09:34:26.487Z
freq,230,18,1


In [9]:
users.head()

Unnamed: 0,_id,name,gender,academics
0,5d60098a653a331687083238,Nivesh Singh Chauhan,male,undergraduate
1,5d610ae1653a331687083239,Gaurav Sharma,male,graduate
2,5d618359fc5fcf3bdd9a0910,Akshay Mishra,male,undergraduate
3,5d6d2bb87fa40e1417a49315,Saksham Mathur,male,undergraduate
4,5d7c994d5720533e15c3b1e9,Varun Chowhan,male,undergraduate


In [10]:
posts.head()

Unnamed: 0,_id,title,category,post_type
0,5d62abaa65218653a132c956,hello there,Plant Biotechnology,blog
1,5d6d39567fa40e1417a4931c,Ml and AI,Artificial Intelligence|Machine Learning|Infor...,blog
2,5d7d23315720533e15c3b1ee,What is an Operating System ?,Operating Systems,blog
3,5d7d405e5720533e15c3b1f3,Lord Shiva,Drawings,artwork
4,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog


In [11]:
views.head()

Unnamed: 0,user_id,post_id,timestamp
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z
1,5ed3748576027d35905ccaab,5ed4cbadbd514d602c1531a6,2020-06-01T09:39:20.021Z
2,5ed0defa76027d35905cc2de,5eac305f10426255a7aa9dd3,2020-06-01T08:12:42.682Z
3,5ed0defa76027d35905cc2de,5ed1ff0276027d35905cc60d,2020-06-01T08:10:23.880Z
4,5ed0defa76027d35905cc2de,5ed3820f76027d35905ccac8,2020-06-01T08:08:54.124Z


## Content Based Filtering

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
tf = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2))
title_features = tf.fit_transform(posts['title'])

In [14]:
tf2 = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2))
category_features = tf2.fit_transform(posts['category'])

In [15]:
type_features = pd.get_dummies(posts[' post_type'])

In [16]:
type_features

Unnamed: 0,artwork,blog,project,skill
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,1,0,0,0
4,0,1,0,0
...,...,...,...,...
488,0,1,0,0
489,0,1,0,0
490,0,0,1,0
491,0,0,0,1


In [17]:
category_features

<493x693 sparse matrix of type '<class 'numpy.float64'>'
	with 2157 stored elements in Compressed Sparse Row format>

In [18]:
from scipy.sparse import hstack, csr_matrix

In [19]:
final_features = csr_matrix(hstack([title_features, category_features, type_features]))

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
matrix = cosine_similarity(final_features, final_features)

In [22]:
matrix.shape

(493, 493)

In [26]:
#Function to find recommendation using title of a post
def give_recomm_posts(title, matrix=matrix, no_recommdation=5):
    idx = posts[posts['title']==title].index[0]
    
    sim_scores = list(enumerate(matrix[idx]))
    
    sorted_sim_scores = sorted(sim_scores, key = lambda x : x[1], reverse = True)
    
    recomm = pd.DataFrame([(posts['title'].iloc[i[0]], i[1]) for i in sorted_sim_scores[1:no_recommdation+1]], columns=['posts','similarity_score'])
    
    return recomm

In [27]:
give_recomm_posts('Lord Shiva', no_recommdation=10)

Unnamed: 0,posts,similarity_score
0,Lord shiva,1.0
1,Shiva Portrait,0.766209
2,Trident,0.666667
3,Kid Bu,0.666667
4,Shadow Sketch,0.666667
5,The Chosen,0.666667
6,Spartan,0.666667
7,ROMAN REIGNS,0.666667
8,OM,0.666667
9,MK Swords,0.666667


## Collaborative Filtering

In [28]:
# Function to find recommendation using userID
def give_recomm_user(userID, no_recommdation=5):
    user_watchlist_id = views[views['user_id']==userID]['post_id']
    
    user_recommend = pd.DataFrame(columns = ['posts', 'similarity_score'])
    
    # Adding similar posts for a given post
    for post_id in user_watchlist_id:
        title = posts[posts['_id'] == post_id]['title'].values[0]
        posts1 = give_recomm_posts(title)
        user_recommend = user_recommend.append(posts1, ignore_index=True)
    
    # Removing the posts which the user has watched alredy
    for post_id in user_watchlist_id:
        user_recommend = user_recommend[user_recommend['posts']!=posts[posts['_id'] == post_id]['title'].values[0]]
        
    # Sorting by similarity_score in decsending order
    user_recommend = user_recommend.sort_values(ascending = False, by = 'similarity_score').drop_duplicates()
    
    return user_recommend.iloc[:no_recommdation]
        

In [29]:
give_recomm_user('5df49b32cc709107827fb3c7', no_recommdation=20)

Unnamed: 0,posts,similarity_score
42,The Making of Daaku,0.928773
79,Be yourself !!,0.816497
33,Alone,0.816497
27,STILL,0.816497
78,Too much?,0.816497
3,Recommend Systems Machine Learning,0.783859
4,Learning...,0.776922
65,Dusk sky.,0.768462
52,Smart City,0.666667
43,The Power of Indian Audience.,0.666667


In [30]:
# Find recommendation using title of a post
title = "POST_TITLE"
give_recomm_posts(title)

In [None]:
# Find recommendation using user_id
userID = "USER_ID"
give_recomm_user(userID)