In [4]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-macosx_11_0_arm64.whl size=1103985 sha256=cfdea6f01bb239850d9d92b703308346f1c0886cf9f907d20cce9e8d2811c9fb
  Stored in directory: /Users/samihakkarainen/Library/Caches/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [16]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Load data
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')
users_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movies_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')

# Prepare Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

# Train SVD model
algo = SVD()
algo.fit(trainset)

# Function to get SVD features
def get_features(row, algo, trainset):
    try:
        uid = trainset.to_inner_uid(row['user_id'])
        iid = trainset.to_inner_iid(row['item_id'])
        u_features = algo.pu[uid]
        i_features = algo.qi[iid]
    except:
        u_features = [0]*algo.n_factors
        i_features = [0]*algo.n_factors

    user_meta = [row['age'], row['is_male']]
    item_meta = [row['release_year']]
    features = np.concatenate([u_features, i_features, user_meta, item_meta])
    return features

# Merge datasets
full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')
full_df['features'] = full_df.apply(lambda row: get_features(row, algo, trainset), axis=1)

# Create labels for classification
full_df['label'] = (full_df['rating'] >= 4.5).astype(int)

# Prepare classification dataset
X = np.stack(full_df['features'].values)
y = full_df['label'].values

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.2, random_state=42)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict probabilities and calculate AUC
y_proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
print(f"AUC Score: {auc}")

# Evaluate SVD model
svd_eval = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("SVD Model Evaluation:", svd_eval)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [8]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from sklearn.linear_model import LogisticRegression

# Load datasets
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')
users_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movies_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Merge datasets for features generation
full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')
test_ratings_df = test_ratings_df.merge(users_df, on='user_id', how='left').merge(movies_df, on='item_id', how='left')

# Prepare Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_df[['user_id', 'item_id', 'rating']], reader)
trainset = data.build_full_trainset()

# Train SVD model
algo = SVD()
algo.fit(trainset)

# Define a function to extract features
def get_features(info, algo, trainset):
    n_factors = algo.n_factors
    default_features = np.zeros(n_factors)
    u_features = default_features.copy()
    i_features = default_features.copy()

    # Check and retrieve user features if known
    if trainset.knows_user(info['user_id']):
        inner_uid = trainset.to_inner_uid(info['user_id'])
        u_features = algo.pu[inner_uid]

    # Check and retrieve item features if known
    if trainset.knows_item(info['item_id']):
        inner_iid = trainset.to_inner_iid(info['item_id'])
        i_features = algo.qi[inner_iid]

    # Incorporate additional metadata if available
    user_meta = [info.get('age', 0), info.get('is_male', 0)]
    item_meta = [info.get('release_year', 0)]
    features = np.concatenate([u_features, i_features, user_meta, item_meta])
    return features






# Generate features for the training set
full_df['features'] = full_df.apply(lambda row: get_features(row, algo, trainset), axis=1)
X = np.stack(full_df['features'].values)
y = (full_df['rating'] >= 4.5).astype(int)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)


# Generate features for the test set
test_ratings_df['features'] = test_ratings_df.apply(lambda row: get_features(row, algo, trainset), axis=1)
X_test = np.stack(test_ratings_df['features'].values)

# Use the logistic regression model to predict probabilities
y_proba = clf.predict_proba(X_test)[:, 1]
test_ratings_df['predicted_rating'] = y_proba

# Save the predictions to a text file
test_ratings_df[['user_id', 'item_id', 'predicted_rating']].to_csv('predicted_leaderboard_ratings.txt', index=False, sep='\t')
print("Predictions saved to 'predicted_leaderboard_ratings.txt'.")



ValueError: Item 1572 is not part of the trainset.