In [146]:
''' Step 1: Load the training data'''

def load_data(training_path, testing_path):
    """Read the rating CSVs with the csv module and split off a validation set.

    A leading header row is skipped only when its first field is not an
    integer, so header-less files no longer lose their first record.
    Completely empty rows are ignored.

    Args:
        training_path: CSV whose rows start with userid, itemid, rating.
            Any further columns are ignored.
        testing_path: CSV whose rows start with userid, itemid. Any further
            columns are ignored.

    Returns:
        A (train_data, val_data, test_data) tuple. train_data and val_data
        hold [userid, itemid, rating] lists and are an 80/20 split of the
        shuffled training rows; test_data holds [userid, itemid] lists in
        file order.

    Raises:
        ValueError: if a data row has too few columns or a field that cannot
            be converted to a number.
    """
    train_data = []
    for row in _iter_csv_rows(training_path):
        userid, itemid, rating, *_ = row
        train_data.append([int(userid), int(itemid), float(rating)])

    test_data = []
    for row in _iter_csv_rows(testing_path):
        userid, itemid, *_ = row
        test_data.append([int(userid), int(itemid)])

    # Split the data into training and validation sets (80% / 20%) without
    # using external libraries. The shuffle uses the global `random` state.
    random.shuffle(train_data)
    split_index = int(0.8 * len(train_data))
    train_data, val_data = train_data[:split_index], train_data[split_index:]

    return train_data, val_data, test_data


def _iter_csv_rows(path):
    """Yield the non-empty rows of a CSV file, dropping a non-numeric header."""
    # newline='' lets the csv module handle line endings itself.
    with open(path, 'r', newline='') as file:
        first_row = True
        for row in csv.reader(file):
            if not row:
                continue
            if first_row:
                first_row = False
                try:
                    int(row[0])
                except ValueError:
                    # The first field is not a user id: treat it as a header.
                    continue
            yield row

In [None]:
''' Alternative Step 1 without importing the csv library (CITE stackoverflow solution oops)'''
def load_data_withoutcsv(training_path, testing_path):
    """Load the rating files by splitting lines on commas (no csv module).

    Both files are read inside `with` blocks so they are closed promptly.
    Blank lines are ignored, and a first line whose first field is not an
    integer is treated as a header and skipped.

    Args:
        training_path: text file with one "userid,itemid,rating[,...]" record
            per line.
        testing_path: text file with one "userid,itemid[,...]" record per
            line.

    Returns:
        A (train_data, val_data, test_data) tuple. train_data and val_data
        hold [userid, itemid, rating] lists and are an 80/20 split of the
        shuffled training rows; test_data holds [userid, itemid] lists in
        file order.

    Raises:
        IndexError: if a data line has too few comma-separated fields.
        ValueError: if a field cannot be converted to a number.
    """
    train_data = [
        [int(words[0]), int(words[1]), float(words[2])]
        for words in _iter_split_lines(training_path)
    ]
    test_data = [
        [int(words[0]), int(words[1])]
        for words in _iter_split_lines(testing_path)
    ]

    # 80/20 train/validation split; the shuffle uses the global `random` state.
    random.shuffle(train_data)
    split_index = int(0.8 * len(train_data))
    train_data, val_data = train_data[:split_index], train_data[split_index:]

    return train_data, val_data, test_data


def _iter_split_lines(path):
    """Yield comma-split fields for each non-blank line, dropping a header."""
    with open(path, 'r') as file:
        first_line = True
        for line in file:
            line = line.strip()
            if not line:
                continue
            words = line.split(',')
            if first_line:
                first_line = False
                try:
                    int(words[0])
                except ValueError:
                    # The first field is not a user id: treat it as a header.
                    continue
            yield words


In [148]:
''' Step 2: Build the User-Item Rating Matrix'''
def build_user_item_matrix(train_data):
    """Turn (userid, itemid, rating) triples into a dense rating matrix.

    Args:
        train_data: iterable of [userid, itemid, rating] records.

    Returns:
        (matrix, user_map, item_map) where matrix[r, c] is the rating given
        by the user mapped to row r for the item mapped to column c (0 where
        no rating was seen), and the two maps translate ids to row/column
        indices in ascending id order.
    """
    user_ids = sorted({record[0] for record in train_data})
    item_ids = sorted({record[1] for record in train_data})

    # Ids are assigned consecutive indices following their sorted order.
    user_map = dict(zip(user_ids, range(len(user_ids))))
    item_map = dict(zip(item_ids, range(len(item_ids))))

    matrix = np.zeros((len(user_ids), len(item_ids)))
    for user, item, rating in train_data:
        row = user_map[user]
        col = item_map[item]
        # A repeated (user, item) pair keeps the rating seen last.
        matrix[row, col] = rating

    return matrix, user_map, item_map

In [149]:
''' Step 3: Calculate the Similarity between users'''
''' User-based Collaborative Filtering using Pearson Correlation'''
def userpearson_similarity(matrix):
    """Compute the Pearson correlation between every pair of users.

    The correlation for a pair is taken over the items both users rated
    (entries > 0), using each user's mean over those shared items. Pairs with
    fewer than two shared items, or with zero variance on them, get 0.

    The result is symmetric: each score is stored at both [u1, u2] and
    [u2, u1], so reading a whole row gives that user's similarity to every
    other user. The diagonal is left at 0.

    Args:
        matrix: 2-D numpy array of ratings, one row per user, 0 = unrated.

    Returns:
        A (num_users, num_users) numpy array of similarities.
    """
    num_users, num_items = matrix.shape
    similarity = np.zeros((num_users, num_users))
    # The rated mask does not change inside the loops, so build it once.
    rated = matrix > 0

    for u1 in range(num_users):
        for u2 in range(u1 + 1, num_users):
            # Find common items rated by both users:
            common_items = np.where(rated[u1] & rated[u2])[0]
            # Make sure there are at least 2 common items:
            if len(common_items) > 1:
                ratings_u1 = matrix[u1, common_items]
                ratings_u2 = matrix[u2, common_items]

                centered_u1 = ratings_u1 - np.mean(ratings_u1)
                centered_u2 = ratings_u2 - np.mean(ratings_u2)

                # Compute Pearson Correlation:
                numerator = np.sum(centered_u1 * centered_u2)
                denominator = np.sqrt(np.sum(centered_u1**2) * np.sum(centered_u2**2))
                score = numerator / denominator if denominator != 0 else 0

                # Mirror the score so row lookups work for either user.
                similarity[u1, u2] = score
                similarity[u2, u1] = score

    return similarity

''' Item-based Collaborative filter using Pearson Similarity'''
def itempearson_similarity(matrix):
    """Compute a mean-centred similarity between every pair of items.

    For each pair of items, the ratings of the users who rated both (entries
    > 0) are centred by subtracting each user's mean rating over all items
    that user rated, then correlated. Pairs with fewer than two shared users,
    or with a zero denominator, get 0.

    The result is symmetric: each score is stored at both [i1, i2] and
    [i2, i1]. The diagonal is left at 0.

    Args:
        matrix: 2-D numpy array of ratings, one row per user and one column
            per item, 0 = unrated.

    Returns:
        A (num_items, num_items) numpy array of similarities.
    """
    num_users, num_items = matrix.shape
    similarity = np.zeros((num_items, num_items))

    # Per-user mean over rated entries. `out` gives users without any rating a
    # defined mean of 0 instead of whatever memory `where` would leave behind.
    rated_counts = (matrix != 0).sum(axis=1)
    user_means = np.true_divide(
        matrix.sum(axis=1),
        rated_counts,
        out=np.zeros(num_users),
        where=rated_counts != 0,
    )

    # The rated mask does not change inside the loops, so build it once.
    rated = matrix > 0

    for i1 in range(num_items):
        for i2 in range(i1 + 1, num_items):
            common_users = np.where(rated[:, i1] & rated[:, i2])[0]

            if len(common_users) > 1:
                ratings_i1 = matrix[common_users, i1] - user_means[common_users]
                ratings_i2 = matrix[common_users, i2] - user_means[common_users]

                numerator = np.sum(ratings_i1 * ratings_i2)
                denominator = np.sqrt(np.sum(ratings_i1**2) * np.sum(ratings_i2**2))
                score = numerator / denominator if denominator != 0 else 0

                # Mirror the score so row lookups work for either item.
                similarity[i1, i2] = score
                similarity[i2, i1] = score

    return similarity

''' User-Based Cosine Similarity'''
def cosine_similarity(matrix):
    """Return the cosine similarity between every pair of rows of `matrix`.

    Each row is scaled to unit Euclidean length and the rows are then
    multiplied against each other. All-zero rows are divided by 1 instead of
    0, so their similarity to everything (themselves included) is 0.

    Args:
        matrix: 2-D array with one row per user.

    Returns:
        A square, symmetric array with one row and column per input row.
    """
    row_lengths = np.linalg.norm(matrix, axis = 1)
    # Swap zero lengths for 1 to keep the division well defined.
    safe_lengths = np.where(row_lengths == 0, 1, row_lengths)
    unit_rows = matrix / safe_lengths.reshape(-1, 1)

    return unit_rows @ unit_rows.T



    

In [150]:
''' Step 4: Predict Ratings using Similarity from Step 3'''
def predict_ratings(userid, itemid, matrix, similarity, user_map, item_map, k=5, similarity_threshold = 0.1, default_rating=3.0):
    """Predict one rating as a similarity-weighted mean of neighbours' ratings.

    Among the users who rated the item (matrix entry > 0), the ones whose
    similarity to the target user exceeds `similarity_threshold` are ranked by
    similarity and the top `k` are kept. The prediction is
    sum(sim * rating) / sum(|sim|) over those neighbours.

    Args:
        userid: id of the user to predict for.
        itemid: id of the item to predict.
        matrix: 2-D numpy rating array (users x items), 0 = unrated.
        similarity: 2-D numpy array; row i holds user i's similarity to every
            user.
        user_map: dict from user id to row index in `matrix`.
        item_map: dict from item id to column index in `matrix`.
        k: maximum number of neighbours to use.
        similarity_threshold: neighbours must be strictly more similar than
            this value.
        default_rating: value returned when no prediction can be formed
            (unknown user or item, nobody rated the item, or no neighbour
            passes the threshold).

    Returns:
        The predicted rating, or `default_rating`.
    """
    # Neighbourhood Selection:
    if userid not in user_map or itemid not in item_map:
        return default_rating

    user_index = user_map[userid]
    item_index = item_map[itemid]

    # Similarity of the target user to every other user:
    user_similarity_scores = similarity[user_index]

    # Find users who have rated the item:
    rated_item_users = np.where(matrix[:, item_index] > 0)[0]
    if len(rated_item_users) == 0:
        return default_rating

    # Sort those users by descending similarity, drop the ones at or below the
    # threshold, and keep at most k of them.
    order = np.argsort(user_similarity_scores[rated_item_users])[::-1]
    sorted_users = rated_item_users[order]
    passes_threshold = user_similarity_scores[sorted_users] > similarity_threshold
    top_k_users = sorted_users[passes_threshold][:k]

    weights = user_similarity_scores[top_k_users]
    # Normalise by the absolute weights so that negative similarities (only
    # possible with a negative threshold) cannot cancel the denominator out.
    denominator = np.sum(np.abs(weights))
    if denominator == 0:
        return default_rating

    # Compute weighted sum of ratings:
    numerator = np.sum(weights * matrix[top_k_users, item_index])
    return numerator / denominator
                      



In [151]:
''' Step 5: Generate the Predictions'''
def gen_preds(test_dataset, matrix, similarity, user_map, item_map, output ):
    """Predict a rating for every test pair and write the results as CSV.

    Each prediction comes from predict_ratings with its default settings, is
    clamped to the range [0.5, 5.0] and rounded to 3 decimals.

    Args:
        test_dataset: iterable of (userid, itemid) pairs.
        matrix: user-item rating matrix passed through to predict_ratings.
        similarity: similarity matrix passed through to predict_ratings.
        user_map: dict from user id to matrix row index.
        item_map: dict from item id to matrix column index.
        output: path of the CSV file to create (overwritten if it exists).

    Returns:
        None. The file gets a header row 'userid,itemid,predicted_rating'
        followed by one row per test pair, in input order.
    """
    preds = []
    for user, item in test_dataset:
        pred = predict_ratings(user, item, matrix, similarity, user_map, item_map)
        # Keep the prediction inside the 0.5 - 5.0 range.
        pred = min(max(pred, 0.5), 5.0)
        preds.append([user, item, round(pred, 3)])

    # newline='' stops the csv writer's own line endings from being translated
    # again, which otherwise leaves an empty line after every row on Windows.
    with open(output, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['userid', 'itemid', 'predicted_rating'])
        writer.writerows(preds)

In [152]:
''' Helper functions for evaluation'''
def MAE(y_pred, y_true):
    """Return the mean absolute error between predictions and true values."""
    errors = np.array(y_pred) - np.array(y_true)
    return np.abs(errors).mean()

def RMSE(y_pred, y_true):
    """Return the root mean squared error between predictions and true values."""
    errors = np.array(y_pred) - np.array(y_true)
    mean_squared_error = np.mean(np.square(errors))
    return np.sqrt(mean_squared_error)


    

In [153]:
''' Helper Functions to Improve Performance'''
# 1. Significance Weighting 
def significance_weight(similarity_matrix, common_items, threshold = 50):
    """Shrink similarities that are backed by few co-rated items.

    Each similarity is multiplied by min(1, common_items / threshold), so
    pairs with at least `threshold` shared items keep their full similarity
    and pairs with fewer are scaled down proportionally.

    Args:
        similarity_matrix: array of similarity scores.
        common_items: number of co-rated items for each pair; a scalar or an
            array that broadcasts against `similarity_matrix`.
        threshold: shared-item count at which a similarity is fully trusted.

    Returns:
        The weighted similarities.

    Raises:
        ValueError: if `threshold` is not positive.
    """
    if threshold <= 0:
        raise ValueError("threshold must be positive")
    # Divide first, then cap at 1: the cap applies to the ratio, not the count.
    weight_factor = np.minimum(1, np.asarray(common_items) / threshold)
    return similarity_matrix * weight_factor

# 2. Case Amplification to enhance strong neighbours
def case_amp(similarity_matrix, alpha = 2.5):
    """Raise each similarity's magnitude to the power `alpha`, keeping its sign."""
    direction = np.sign(similarity_matrix)
    magnitude = np.abs(similarity_matrix)
    return direction * np.power(magnitude, alpha)



In [154]:
''' Step 6: Run the Code'''
import csv
import numpy as np
import random
# Input files and the destination for the final predictions.
train_dataset =  'C:/Users/semel/Downloads/socialcomp/train_100k_withratings.csv'
test_dataset =  'C:/Users/semel/Downloads/socialcomp/test_100k_withoutratings.csv'
output = 'output.csv'

# Load both files; 20% of the training rows are held back for validation.
train_data, validation_data, test_data = load_data_withoutcsv(train_dataset, test_dataset)
print ('Datasets found, data loaded')

# Build the model from the remaining 80% of the training rows.
matrix, user_map, item_map = build_user_item_matrix(train_data)
print ('User-item matrix created')

similarity = cosine_similarity(matrix)
print ('Similarity computed')
# Optional post-processing of the similarities, currently disabled:
#similarity = significance_weight(similarity, 50)
#similarity = case_amp(similarity, 2.5)

# The test set has no ground-truth ratings, so accuracy is measured on the
# held-out validation rows instead.
val_true = [rating for _user, _item, rating in validation_data]
val_pred = [
    predict_ratings(user, item, matrix, similarity, user_map, item_map)
    for user, item, _rating in validation_data
]

validation_MAE = MAE(val_pred, val_true)
validation_RMSE = RMSE(val_pred, val_true)
print ('Validation MAE:', round(validation_MAE, 3))
print ('Validation RMSE:', round(validation_RMSE, 3))



Datasets found, data loaded
User-item matrix created
Similarity computed
Validation MAE: 0.851
Validation RMSE: 1.076


In [155]:
''' Step 7: Once the model is validated, train on the entire dataset and generate predictions
matrix, user_map, item_map = build_user_item_matrix(train_data + validation_data)
similarity = pearson_coefficient(matrix)
gen_preds(test_data, matrix, similarity, user_map, item_map, output)
print (f"Predictions generated and saved to: {output}") '''

' Step 7: Once the model is validated, train on the entire dataset and generate predictions\nmatrix, user_map, item_map = build_user_item_matrix(train_data + validation_data)\nsimilarity = pearson_coefficient(matrix)\ngen_preds(test_data, matrix, similarity, user_map, item_map, output)\nprint (f"Predictions generated and saved to: {output}") '