In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine,euclidean

In [63]:
data = pd.read_csv('course_ratings.csv')
data.shape

(233306, 3)

In [70]:
ratings_sparse_df = data.pivot(index='user', columns='item', values='rating').fillna(0).reset_index().rename_axis(index=None, columns=None)
ratings_sparse_df.head()

Unnamed: 0,user,AI0111EN,BC0101EN,BC0201EN,BC0202EN,BD0101EN,BD0111EN,BD0115EN,BD0121EN,BD0123EN,...,SW0201EN,TA0105,TA0105EN,TA0106EN,TMP0101EN,TMP0105EN,TMP0106,TMP107,WA0101EN,WA0103EN
0,2,0.0,4.0,0.0,0.0,5.0,4.0,0.0,5.0,3.0,...,0.0,5.0,0.0,4.0,0.0,3.0,3.0,0.0,5.0,0.0
1,4,0.0,0.0,0.0,0.0,5.0,3.0,4.0,5.0,3.0,...,0.0,4.0,0.0,0.0,0.0,3.0,3.0,0.0,3.0,3.0
2,5,3.0,5.0,5.0,0.0,4.0,0.0,0.0,0.0,3.0,...,0.0,0.0,4.0,4.0,4.0,4.0,4.0,5.0,0.0,3.0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
trainset, testset = train_test_split(data, test_size=0.2)

In [66]:
# calculate users similarity matrix for users in trainset

In [67]:
# Create user id to index and index to id mappings
def get_users_dicts(rating_df):
    # Group the DataFrame by course, and get the maximum value for each group
    grouped_df = rating_df.groupby(['user']).max().reset_index(drop=False)
    # Create a dictionary mapping indices to course IDs
    idx_id_dict = grouped_df[['user']].to_dict()['user']
    # Create a dictionary mapping course IDs to indices
    id_idx_dict = {v: k for k, v in idx_id_dict.items()}
    # Clean up temporary DataFrame
    del grouped_df
    return idx_id_dict, id_idx_dict

In [69]:
idx_id_dict, id_idx_dict = get_users_dicts(trainset)
idx_id_dict

{0: 2,
 1: 4,
 2: 5,
 3: 7,
 4: 8,
 5: 9,
 6: 12,
 7: 16,
 8: 17,
 9: 19,
 10: 20,
 11: 21,
 12: 22,
 13: 23,
 14: 25,
 15: 26,
 16: 27,
 17: 28,
 18: 29,
 19: 30,
 20: 34,
 21: 35,
 22: 36,
 23: 38,
 24: 39,
 25: 40,
 26: 41,
 27: 42,
 28: 43,
 29: 44,
 30: 45,
 31: 46,
 32: 47,
 33: 48,
 34: 51,
 35: 52,
 36: 53,
 37: 54,
 38: 55,
 39: 56,
 40: 57,
 41: 58,
 42: 59,
 43: 60,
 44: 61,
 45: 62,
 46: 63,
 47: 64,
 48: 65,
 49: 66,
 50: 67,
 51: 68,
 52: 78,
 53: 79,
 54: 80,
 55: 93,
 56: 102,
 57: 103,
 58: 104,
 59: 106,
 60: 107,
 61: 108,
 62: 110,
 63: 111,
 64: 112,
 65: 113,
 66: 114,
 67: 116,
 68: 117,
 69: 118,
 70: 119,
 71: 120,
 72: 122,
 73: 123,
 74: 131,
 75: 132,
 76: 133,
 77: 134,
 78: 135,
 79: 136,
 80: 137,
 81: 138,
 82: 146,
 83: 147,
 84: 154,
 85: 452,
 86: 589,
 87: 704,
 88: 768,
 89: 866,
 90: 1321,
 91: 1383,
 92: 1938,
 93: 2191,
 94: 2674,
 95: 3087,
 96: 3264,
 97: 3450,
 98: 4930,
 99: 5053,
 100: 5555,
 101: 6002,
 102: 6092,
 103: 6449,
 104: 6847,
 1

In [71]:
# calculate users similarity matrix

user_idxs = idx_id_dict.keys()
sim_matrix = np.empty(shape=(1000, 1000)) #np.empty(shape=(len(user_idxs),len(user_idxs)))

for id1 in range(0, 1000): #user_idxs:
    user1 = rating_sparse_df.iloc[id1,1:]
    for id2 in range(0, 1000): #user_idxs:
        user2 = rating_sparse_df.iloc[id2,1:]  
        sim_matrix[id1,id2] = round(1 - cosine(user1.values, user2.values), 2)


In [61]:
# find k-nearest kneighboor for each user
# user_idxs = []
# user_five_nearest = []
# for user_idx in range(0,sim_matrix.shape[0]):
#     user_sim = sim_matrix[user_idx,:].argsort()
#     user_sim = user_sim[::-1]
#     user_sim
#     five_nearest = user_sim[1:6]
#     five_nearest
#     user_idxs.append(user_idx)
#     user_five_nearest.append(five_nearest)
    

In [176]:
# get five neighboor for one user
def get_five_neighboor(user_idx):
    user_sim = sim_matrix[user_idx,:].argsort()
    user_sim = user_sim[::-1]
    neighboors = user_sim[1:6]
    return neighboors

In [177]:
# get knn_sim for five nearest neighboor for one user
def get_knn_sim(user_idx, neighboor):
    knn_sim = np.zeros(shape=neighboor.shape)
    i = 0
    for n_index in neighboor:
        knn_sim[i] = sim_matrix[user_idx, n_index]
        i+=1
    return knn_sim   

In [178]:
# get course rating from five nearest neighboor for one user
def get_knn_ratings(course_id, neighboor):
    knn_ratings = np.zeros(shape=neighboor.shape)
    i = 0
    for n_index in neighboor:
        knn_ratings[i] = ratings_sparse_df.loc[n_index, course_id]
        i+=1
    return knn_ratings 

In [179]:
test_user_idxs = [100, 200, 300, 400, 500, 600, 700, 800, 900]
# courses ratings
courses = []
ratings = []
users = []

for test_user_idx in test_user_idxs:
    # find user neighboors
    five_neighboor = get_five_neighboor(test_user_idx)
    # find knn similarity
    knn_sims = get_knn_sim(test_user_idx, five_neighboor)
    # get user id
    user_id = idx_id_dict[test_user_idx]
   
    # get courses user not rating
    user_ratings_courses = ratings_sparse_df[ratings_sparse_df['user'] == user_id]
    user_ratings_courses
    
    # for each urating course
    for col in user_ratings_courses:
        if col != 'user':
            if user_ratings_courses.iloc[0][col] == 0:
                knn_ratings = get_knn_ratings(col, five_neighboor)
                r_test_user = round(np.dot(knn_sims, knn_ratings)/ sum(knn_sims), 2)
                if r_test_user>2:
                    users.append(user_id)
                    courses.append(col)
                    ratings.append(r_test_user)

In [180]:
user_course_ratings_df = pd.DataFrame(list(zip(users, courses, ratings)), columns=['User', 'Course' , 'Ratings' ])
user_course_ratings_df

Unnamed: 0,User,Course,Ratings
0,5555,BD0111EN,3.4
1,37936,ML0122ENv1,4.2
2,73660,DA0101EN,4.37
3,73660,PY0101EN,4.22
4,73660,ST0101EN,2.32
5,98471,BD0101EN,2.42
6,98471,BD0115EN,4.19
7,98471,BD0131EN,2.86
8,98471,BD0141EN,2.4
9,98471,DA0101EN,3.42
