<a href="https://colab.research.google.com/github/secoxx/IE423/blob/main/task_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialize

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Folder on the mounted Google Drive that holds the joke CSV files.
# Change it here if the data lives somewhere else.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data/jokes'

# Joke texts: one row per joke (columns JokeId, JokeText)
dfJks = pd.read_csv(f'{DATA_DIR}/JokeText.csv')

# Ratings matrix: one row per joke, one column per user; blank cells are unrated
dfRtg1 = pd.read_csv(f'{DATA_DIR}/UserRatings1.csv')

In [6]:
# Preview the first rows of the joke text table
dfJks.head()

Unnamed: 0,JokeId,JokeText
0,0,"A man visits the doctor. The doctor says ""I ha..."
1,1,This couple had an excellent relationship goin...
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...
3,3,Q. What's the difference between a man and a t...
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...


In [7]:
# Display the wide ratings table (one row per joke, one column per user)
dfRtg1

Unnamed: 0,JokeId,User1,User2,User3,User4,User5,User6,User7,User8,User9,...,User36701,User36702,User36703,User36704,User36705,User36706,User36707,User36708,User36709,User36710
0,0,5.10,-8.79,-3.50,7.14,-8.79,9.22,-4.03,3.11,-3.64,...,,,,,,,,,2.91,
1,1,4.90,-0.87,-2.91,-3.88,-0.58,9.37,-1.55,0.92,-3.35,...,,,,-5.63,,-6.07,,-1.60,-4.56,
2,2,1.75,1.99,-2.18,-3.06,-0.58,-3.93,-3.64,7.52,-6.46,...,,,,,,4.08,,,8.98,
3,3,-4.17,-4.61,-0.10,0.05,8.98,9.27,-6.99,0.49,-3.40,...,,,,,,,,,,
4,4,5.15,5.39,7.52,6.26,7.67,3.45,5.44,-0.58,1.26,...,2.28,-0.49,5.1,-0.29,-3.54,-1.36,7.48,-5.78,0.73,2.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,6.31,-1.02,3.98,3.93,9.13,1.94,0.44,1.21,6.94,...,,,,,,,,,,
96,96,-4.95,-0.97,-6.46,-2.57,9.17,1.99,-0.78,5.34,5.83,...,,,,,5.87,,,,,
97,97,-0.19,4.13,-6.89,1.07,9.17,3.45,-1.02,1.94,5.53,...,,,,,,,,,,
98,98,3.25,-1.84,-2.33,2.33,9.08,9.17,1.70,3.06,6.55,...,,,,,,3.64,,,,


## Build Recommendations

### 1. Content Based Filtering

#### Prepare data

In [10]:
# Look at the joke table again; its JokeText column is the model input below
dfJks.head()

Unnamed: 0,JokeId,JokeText
0,0,"A man visits the doctor. The doctor says ""I ha..."
1,1,This couple had an excellent relationship goin...
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...
3,3,Q. What's the difference between a man and a t...
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...


In [8]:
# (rows, columns) of the joke table
dfJks.shape

(100, 2)

#### Build Model

In [13]:
# Vectorise the joke texts: one row per joke, one column per TF-IDF weighted term

from sklearn.feature_extraction.text import TfidfVectorizer

mdlTfvMvs = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),   # single words and two-word phrases
    stop_words='english',
)
tfidf_matrix = mdlTfvMvs.fit_transform(dfJks['JokeText'])
tfidf_matrix.shape

(100, 3774)

In [14]:
# Pairwise cosine similarity between the TF-IDF vectors of every pair of jokes

from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim.shape

(100, 100)

#### Predict

In [17]:
# Prepare recommendation function
def get_recommendations(JokeId, top_n=30):
    """Return the jokes whose text is most similar to the given joke.

    Parameters
    ----------
    JokeId : int
        Row position of the query joke. It is used directly as an index into
        ``cosine_sim`` and ``dfJks`` (assumes JokeId equals the row position,
        as in the displayed data -- confirm if the table is ever reordered).
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``dfJks`` for the recommended jokes, most similar first.
    """
    # Drop the query joke by position rather than assuming it sorts first:
    # an exact duplicate (or any tie in similarity) could otherwise leave the
    # query joke in the results and discard a genuine match instead.
    sim_scores = [
        (idx, score)
        for idx, score in enumerate(cosine_sim[JokeId])
        if idx != JokeId
    ]
    # sorted() is stable, so jokes with equal scores keep their original order
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)
    joke_indices = [idx for idx, _ in sim_scores[:top_n]]
    return dfJks.iloc[joke_indices]

In [18]:
get_recommendations(0).head(10)

Unnamed: 0,JokeId,JokeText
86,86,"A man, recently completing a routine physical ..."
67,67,A man piloting a hot air balloon discovers he ...
87,87,A Czechoslovakian man felt his eyesight was gr...
75,75,There once was a man and a woman that both go...
31,31,A man arrives at the gates of heaven. St. Pete...
38,38,What is the difference between men and women:\...
55,55,A man and Cindy Crawford get stranded on a des...
80,80,An Asian man goes into a New York CityBank to ...
32,32,What do you call an American in the finals of ...
3,3,Q. What's the difference between a man and a t...


Content-based recommendation does not take the user or their preferences into account; recommendations are based purely on the text of the jokes. In the example above, joke 0 was the input. Its subject is a man, and almost all of the recommended jokes also feature a man, so the recommendations look reasonable.

### 2. Collaborative Filtering

#### Prepare data

In [22]:
# (rows, columns) of the wide ratings table
dfRtg1.shape

(100, 36711)

In [23]:
# Reshape wide -> long: one row per (joke, user) pair with its rating
dfRtg1p = dfRtg1.melt(
    id_vars=['JokeId'],
    var_name='User',
    value_name='Rating',
)

In [24]:
# Display the long-format ratings (still includes unrated joke/user pairs)
dfRtg1p

Unnamed: 0,JokeId,User,Rating
0,0,User1,5.10
1,1,User1,4.90
2,2,User1,1.75
3,3,User1,-4.17
4,4,User1,5.15
...,...,...,...
3670995,95,User36710,
3670996,96,User36710,
3670997,97,User36710,
3670998,98,User36710,


In [33]:
# Keep only the joke/user pairs that actually carry a rating
has_rating = dfRtg1p['Rating'].notna()
dfRtg1p = dfRtg1p[has_rating]

In [34]:
# Display the long-format ratings after removing rows without a rating
dfRtg1p

Unnamed: 0,JokeId,User,Rating
0,0,User1,5.10
1,1,User1,4.90
2,2,User1,1.75
3,3,User1,-4.17
4,4,User1,5.15
...,...,...,...
3670967,67,User36710,3.59
3670968,68,User36710,5.39
3670969,69,User36710,4.71
3670980,80,User36710,0.97


#### Build Model

In [35]:
# Prepare data into Surprise library format

!pip3 install scikit-surprise #or !conda install -c conda-forge scikit-surprise
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(0,5))
X = Dataset.load_from_df(dfRtg1p, reader)
X_train, X_test = train_test_split(X, test_size=.25)



In [36]:
# Define SVD model

from surprise import SVD

# Created with the library's default hyperparameters (none are passed here)
mdlSvdRtg = SVD()

In [37]:
# Train the SVD model on the training split, then predict the held-out ratings.
# fit() returns the fitted algorithm, so the two steps can be chained.
test_pred = mdlSvdRtg.fit(X_train).test(X_test)

In [38]:
# Evaluate SVD accuracy: root-mean-squared error of the test-set predictions

from surprise import accuracy

accuracy.rmse(test_pred)

RMSE: 4.6128


4.61282994097576

In [39]:
# Hyperparameter tuning with a cross-validated grid search

from surprise.model_selection import GridSearchCV

# Search space: 3 epoch counts x 2 learning rates x 2 regularisation strengths
param_grid = {
    'n_epochs': [5, 10, 15],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(X)

# Print the best RMSE score, then the parameter combination that produced it
for report in (gs.best_score, gs.best_params):
    print(report['rmse'])

KeyboardInterrupt: 

In [40]:
# 5-fold cross-validation of the SVD model, reporting RMSE and MAE per fold

from surprise.model_selection import cross_validate

cross_validate(
    mdlSvdRtg,
    X,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

KeyboardInterrupt: 

#### Predict

In [45]:
# Show every rating recorded for one user
dfRtg1p.loc[dfRtg1p['User'] == 'User36710']

Unnamed: 0,JokeId,User,Rating
3670904,4,User36710,2.62
3670905,5,User36710,3.3
3670906,6,User36710,0.53
3670907,7,User36710,-2.62
3670909,9,User36710,3.06
3670910,10,User36710,0.49
3670911,11,User36710,6.02
3670912,12,User36710,-8.88
3670913,13,User36710,3.3
3670914,14,User36710,-0.49


In [47]:
# Estimate a single rating; the ids are passed by keyword to make their roles explicit.
# NOTE(review): the ids must match those used in the training data -- user labels
# in dfRtg1p look like 'User36710', so confirm that uid=1 is the intended user.
mdlSvdRtg.predict(uid=1, iid=82)

Prediction(uid=1, iid=82, r_ui=None, est=0.2778209036863737, details={'was_impossible': False})