In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data = pd.read_csv('/content/jokes-data.csv')

#Preprocessing And Training

In [None]:
data.head()

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375


In [None]:
#finding the missing values
data.isna().sum()

id         0
user_id    0
joke_id    0
Rating     0
dtype: int64

In [None]:
#finding the total number of rows
len(data)

1092059

In [None]:
#find duplicate rows
data.duplicated().sum()

0

In [None]:
#colum names
data.columns

Index(['id', 'user_id', 'joke_id', 'Rating'], dtype='object')

In [None]:
#The ratings in the 'Rating' column are modified using a lambda function
data['Rating'] = data['Rating'].apply(lambda x: 10^-15 if x == 0 else x)
#The values in the 'user_id' and 'joke_id' columns are decremented by 1.
data['user_id'] = data['user_id'].apply(lambda x: x - 1)
data['joke_id'] = data['joke_id'].apply(lambda x: x - 1)

In [None]:
#keep the first 10000 as the test data
test_data = data.head(10000).copy()

In [None]:
#set the ratings that we want to predict to 0
data.iloc[0:10000, 2] = 0

In [None]:
#construct the Rating Matrix, rows will be the vectors of ratings for each user, set missing ratings to 0
rating_matrix = data.pivot_table(index='user_id', columns='joke_id', values='Rating')
rating_matrix.fillna(0, inplace=True)

In [None]:
display(rating_matrix.head())

joke_id,0,1,2,3,4,5,6,7,8,9,...,129,130,131,132,133,134,135,136,137,138
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.4535,-9.281,0.0,-6.781,0.875,-9.656,-9.031,-7.469,-8.719,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-9.688,9.938,9.531,9.938,0.406,0.0,9.656,0.0,0.0,-9.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,-7.219,-2.031,-9.938,0.0,0.0,-9.812,-9.781,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.5935,0.0,-5.906,0.0,0.0,3.875,6.219,0.0,6.094,5.406,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.031,0.0,0.0,7.5,-7.219,0.0,0.0,0.0,3.656,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
rating_matrix.shape

(40863, 139)

#Making predictions with the system

In [None]:
def UA(A, u):
    return user_means[u] #The UA function takes the rating matrix 'A' and a user index 'u' and returns the mean rating for that user.

def JA(A, j): #The JA function takes the rating matrix 'A' and a joke index 'j' and returns the mean rating for that joke.
    return joke_means[j]

In [None]:
import time
import math
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Calculating predictions based on user means
start_time = time.time()
user_means = rating_matrix[rating_matrix != 0].mean(axis=1)
predictions = []
for i in range(10000):
    row = test_data.iloc[i]
    prediction = UA(rating_matrix, row['user_id'])
    predictions.append(prediction)
print("User Average RMSE: ", math.sqrt(mse(predictions, test_data['Rating'])))
print("%s seconds for UA ---" % (time.time() - start_time))

User Average RMSE:  4.252190863944161
2.067582130432129 seconds for UA ---


In [None]:
## Calculating predictions based on joke means
start_time = time.time()
joke_means = rating_matrix[rating_matrix!=0].mean(axis=0)
predictions = []
for i in range(10000):
    row = test_data.iloc[i]
    prediction = JA(rating_matrix, row['joke_id'])
    predictions.append(prediction)
print("Joke Average RMSE: ", math.sqrt(mse(predictions, test_data['Rating'])))
print("%s seconds for JA ---" % (time.time() - start_time))

Joke Average RMSE:  4.9829432435858525
1.2043373584747314 seconds for JA ---


#Collaborative Filtering

In [None]:
#convert the rating matrix into a sparse matrix in csr format
from scipy.sparse import csr_matrix
R = csr_matrix(rating_matrix)

In [None]:
def UCF(A, k, u, j):
    # Get vector of user u and find indices of users that rated item j
    users = np.nonzero(A[:, j])[0]
    u_vector = A[u, :]

    # Compute similarities between users that rated item j and user u
    from sklearn.metrics.pairwise import cosine_similarity as cs
    vectors = A[users]
    similarities = cs(vectors, u_vector).flatten()

    return similarities

In [None]:
similarities = UCF(R, 3, 0, 3)  # Compute similarities for user 0 and item 3
print("Similarities:", similarities)

Similarities: [1.         0.07593692 0.14580496 ... 0.09787077 0.11382798 0.25875385]


In [None]:
def UCF(A, k, u, j):
    # Assuming 'vectors' and other necessary variables are defined somewhere in your code
    scores = vectors[:, j].data
    similarities = cs(vectors, u_vector).flatten() #Calculate similarities here based on your algorithm
    combined = np.column_stack((similarities, scores))
    sorted_indices = np.argsort(combined[:, 0])[::-1]  # Sort indices in descending order of similarities
    sorted_combined = combined[sorted_indices]
    sorted_similarities = sorted_combined[:, 0]
    sorted_scores = sorted_combined[:, 1]

    # Limit similar users to at most k
    sorted_similarities = sorted_similarities[0:k]
    sorted_scores = sorted_scores[0:k]

    # Calculate and return prediction
    s = np.absolute(sorted_similarities).sum()
    if s == 0:
        return user_means[u] # Assuming 'user_means' is defined
    else:
        return np.dot(sorted_similarities.T, sorted_scores) / s

In [None]:
#similar users should be at most k
def UCF(A, k, u, j,combined,lsts,l1,l2,s):
    combined = combined[0 : k]
    lsts = list(zip(*combined))
    l1 = np.array(lsts[0])
    l2 = np.array(lsts[1])
    s = np.absolute(l1).sum()

In [None]:
def predict_score(l1, l2, s, user_means, u):
    if s == 0:
        return user_means[u]
    else:
        return np.dot(l1.T, l2) / s

In [65]:
ks = [10,25,50,75,100]
error = []

for k in ks:
    predictions = []
    start_time = time.time()
    for i in range(10000):
        row = test_data.iloc[i]  # Assuming test_data is a pandas DataFrame
        predictions.append (UCF(R, k, row['user_id'], row['joke_id']))
    print("--- %s seconds for %s similar users ---" % (time.time() - start_time, k))
    er = math.sqrt(mse(predictions, test_data['Rating']))
    error.append(er)
    print("Mean Squared Error: %s" % (er))
    print('---------------------------')

TypeError: UCF() missing 5 required positional arguments: 'combined', 'lsts', 'l1', 'l2', and 's'

In [None]:
UCF