In [112]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [113]:
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')
users_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movies_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')


print(ratings_df.shape)
print(users_df.shape)
print(movies_df.shape)

full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')
test_ratings_df = test_ratings_df.merge(users_df, on='user_id', how='left').merge(movies_df, on='item_id', how='left')


# Check the new shape of the merged dataframe to ensure completeness
print(full_df.shape)

(89992, 3)
(943, 4)
(1681, 4)
(89992, 9)


In [114]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_df[['user_id', 'item_id', 'rating']], reader)
trainset = data.build_full_trainset()


In [118]:
# Train SVD model
algo = SVD()
algo.fit(trainset)

# Number of latent factors
n_factors = algo.n_factors

# Dimensions of the user factors matrix
n_users = algo.trainset.n_users
user_factors_shape = (n_users, n_factors)

# Dimensions of the item factors matrix
n_items = algo.trainset.n_items
item_factors_shape = (n_items, n_factors)

print("Shape of user factors matrix:", user_factors_shape)
print("Shape of item factors matrix:", item_factors_shape)



Shape of user factors matrix: (943, 100)
Shape of item factors matrix: (1662, 100)


In [120]:
def get_features(info, algo, trainset):
    default_features = np.zeros(algo.n_factors)
    u_features = default_features.copy()
    i_features = default_features.copy()

    if trainset.knows_user(info['user_id']):
        inner_uid = trainset.to_inner_uid(info['user_id'])
        u_features = algo.pu[inner_uid]
    if trainset.knows_item(info['item_id']):
        inner_iid = trainset.to_inner_iid(info['item_id'])
        i_features = algo.qi[inner_iid]

    user_meta = [info['age'], info['is_male']]
    item_meta = [info['release_year']]
    features = np.concatenate([u_features, i_features, user_meta, item_meta])
    return features


In [129]:
full_df['features'] = full_df.apply(lambda row: get_features(row, algo, trainset), axis=1)
X = np.stack(full_df['features'].values)
y = full_df['label'].values
clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)





KeyError: 'label'

In [None]:
# Merge datasets
full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')

# Assuming get_features is defined correctly
full_df['features'] = full_df.apply(lambda row: get_features(row, algo, trainset), axis=1)

# Check for any NaNs in features just in case
if full_df['features'].isnull().any():
    raise ValueError("NaN values found in features. Check get_features implementation.")

# Create labels for classification
full_df['label'] = (full_df['rating'] >= 4.5).astype(int)

# Prepare classification dataset
X = np.stack(full_df['features'].values)
y = full_df['label'].values

# Split data for training and testing with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Optionally, check model convergence
if not clf.n_iter_ < clf.max_iter:
    print("Logistic regression did not converge. Consider increasing max_iter or adjusting other parameters.")


In [None]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [None]:
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')
users_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movies_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')


print(ratings_df.shape)
print(users_df.shape)
print(movies_df.shape)

full_df = ratings_df.merge(users_df, left_on='user_id', right_on='user_id', how='inner')
full_df = full_df.merge(movies_df, left_on='item_id', right_on='item_id', how='inner')

# Check the new shape of the merged dataframe to ensure completeness
print(full_df.shape)

(89992, 3)
(943, 4)
(1681, 4)
(89992, 9)


In [None]:
# Prepare Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_df[['user_id', 'item_id', 'rating']], reader)
trainset = data.build_full_trainset()

In [None]:
# Train SVD model
algo = SVD()
algo.fit(trainset)

# Number of latent factors
n_factors = algo.n_factors

# Dimensions of the user factors matrix
n_users = algo.trainset.n_users
user_factors_shape = (n_users, n_factors)

# Dimensions of the item factors matrix
n_items = algo.trainset.n_items
item_factors_shape = (n_items, n_factors)

print("Shape of user factors matrix:", user_factors_shape)
print("Shape of item factors matrix:", item_factors_shape)



Shape of user factors matrix: (943, 100)
Shape of item factors matrix: (1662, 100)


In [None]:
def get_features(info, algo, trainset):
    default_features = np.zeros(algo.n_factors)
    u_features = default_features.copy()
    i_features = default_features.copy()

    if trainset.knows_user(info['user_id']):
        inner_uid = trainset.to_inner_uid(info['user_id'])
        u_features = algo.pu[inner_uid]
    if trainset.knows_item(info['item_id']):
        inner_iid = trainset.to_inner_iid(info['item_id'])
        i_features = algo.qi[inner_iid]

    user_meta = [info['age'], info['is_male']]
    item_meta = [info['release_year']]
    features = np.concatenate([u_features, i_features, user_meta, item_meta])
    return features


In [130]:
# Merge datasets
full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')

# Assuming get_features is defined correctly
full_df['features'] = full_df.apply(lambda row: get_features(row, algo, trainset), axis=1)

# Check for any NaNs in features just in case
if full_df['features'].isnull().any():
    raise ValueError("NaN values found in features. Check get_features implementation.")

# Create labels for classification
full_df['label'] = (full_df['rating'] >= 4.5).astype(int)

# Prepare classification dataset
X = np.stack(full_df['features'].values)
y = full_df['label'].values

# Split data for training and testing with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Optionally, check model convergence
if not clf.n_iter_ < clf.max_iter:
    print("Logistic regression did not converge. Consider increasing max_iter or adjusting other parameters.")


In [None]:
# Predict probabilities and calculate AUC
y_proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
print(f"AUC Score: {auc}")

# Classification report and confusion matrix
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Evaluate SVD model
svd_eval = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("SVD Model Evaluation:", svd_eval)

AUC Score: 0.6874242506426829
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.88     14186
           1       0.54      0.07      0.12      3813

    accuracy                           0.79     17999
   macro avg       0.67      0.53      0.50     17999
weighted avg       0.74      0.79      0.72     17999

Confusion Matrix:
[[13959   227]
 [ 3545   268]]
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9460  0.9400  0.9419  0.9419  0.9412  0.9422  0.0020  
MAE (testset)     0.7443  0.7418  0.7448  0.7420  0.7443  0.7434  0.0013  
Fit time          0.58    0.56    0.51    0.53    0.55    0.55    0.02    
Test time         0.05    0.05    0.05    0.23    0.05    0.08    0.07    
SVD Model Evaluation: {'test_rmse': array([0.94598329, 0.94000225, 0.94188795, 0.94191631, 0.94122541]), 'test_mae': array([0.74426403, 0.7417

In [None]:
def recommend_items(user_id, item_pool, model, N=10):
    known_items = []
    items_features = []

    # Collect features of known items
    for item in item_pool:
        if trainset.knows_item(item):
            known_items.append(item)
            info = {
                'user_id': user_id,
                'item_id': item,
                'age': users_df.loc[users_df['user_id'] == user_id, 'age'].iloc[0],
                'is_male': users_df.loc[users_df['user_id'] == user_id, 'is_male'].iloc[0],
                'release_year': movies_df.loc[movies_df['item_id'] == item, 'release_year'].iloc[0]
            }
            features = get_features(info, algo, trainset)
            items_features.append(features)
        else:
            print(f"Item {item} is not known to the training set and will be skipped.")

    # Handle cases where no known items are found
    if not items_features:
        print("Fallback to popular items.")
        # Select top N popular items
        popular_items = ratings_df['item_id'].value_counts().head(N).index.tolist()
        return popular_items, [np.nan]*N  # No probabilities available for these items

    items_features = np.array(items_features)
    probabilities = model.predict_proba(items_features)[:, 1]
    top_indices = np.argsort(probabilities)[-N:]

    recommended_items = [known_items[i] for i in top_indices if i < len(known_items)]
    filtered_probabilities = [probabilities[i] for i in top_indices if i < len(known_items)]

    return recommended_items, filtered_probabilities


In [None]:
# Example usage of recommendations
item_pool = movies_df['item_id'].unique()
try:
    recommended_items, scores = recommend_items(4, item_pool, clf, N=5)
    print("Recommended Items:", recommended_items)
    print("Scores:", scores)
except Exception as e:
    print(f"An error occurred while making recommendations: {str(e)}")

An error occurred while making recommendations: Item 709 is not part of the trainset.


In [None]:
# Read the file using pandas
test_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')
# Convert DataFrame to list of tuples
pairs = list(test_df.itertuples(index=False, name=None))

In [None]:
# Predict probabilities and calculate AUC
y_proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
print(f"AUC Score: {auc}")

# Classification report and confusion matrix
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Evaluate SVD model
svd_eval = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("SVD Model Evaluation:", svd_eval)

AUC Score: 0.6874242506426829
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.88     14186
           1       0.54      0.07      0.12      3813

    accuracy                           0.79     17999
   macro avg       0.67      0.53      0.50     17999
weighted avg       0.74      0.79      0.72     17999

Confusion Matrix:
[[13959   227]
 [ 3545   268]]
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9460  0.9400  0.9419  0.9419  0.9412  0.9422  0.0020  
MAE (testset)     0.7443  0.7418  0.7448  0.7420  0.7443  0.7434  0.0013  
Fit time          0.58    0.56    0.51    0.53    0.55    0.55    0.02    
Test time         0.05    0.05    0.05    0.23    0.05    0.08    0.07    
SVD Model Evaluation: {'test_rmse': array([0.94598329, 0.94000225, 0.94188795, 0.94191631, 0.94122541]), 'test_mae': array([0.74426403, 0.7417

In [None]:
def recommend_items(user_id, item_pool, model, N=10):
    known_items = []
    items_features = []

    # Collect features of known items
    for item in item_pool:
        if trainset.knows_item(item):
            known_items.append(item)
            info = { 
                'user_id': user_id,
                'item_id': item,
                'age': users_df.loc[users_df['user_id'] == user_id, 'age'].iloc[0],
                'is_male': users_df.loc[users_df['user_id'] == user_id, 'is_male'].iloc[0],
                'release_year': movies_df.loc[movies_df['item_id'] == item, 'release_year'].iloc[0]
            }
            features = get_features(info, algo, trainset)
            items_features.append(features)
        else:
            print(f"Item {item} is not known to the training set and will be skipped.")

    # Handle cases where no known items are found
    if not items_features:
        print("Fallback to popular items.")
        # Select top N popular items
        popular_items = ratings_df['item_id'].value_counts().head(N).index.tolist()
        return popular_items, [np.nan]*N  # No probabilities available for these items

    items_features = np.array(items_features)
    probabilities = model.predict_proba(items_features)[:, 1]
    top_indices = np.argsort(probabilities)[-N:]

    recommended_items = [known_items[i] for i in top_indices if i < len(known_items)]
    filtered_probabilities = [probabilities[i] for i in top_indices if i < len(known_items)]

    return recommended_items, filtered_probabilities


In [None]:
# Example usage of recommendations
item_pool = movies_df['item_id'].unique()
try:
    recommended_items, scores = recommend_items(4, item_pool, clf, N=5)
    print("Recommended Items:", recommended_items)
    print("Scores:", scores)
except Exception as e:
    print(f"An error occurred while making recommendations: {str(e)}")

An error occurred while making recommendations: Item 709 is not part of the trainset.


In [127]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Load datasets
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')
users_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movies_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Merge datasets for features generation
full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')
test_ratings_df = test_ratings_df.merge(users_df, on='user_id', how='left').merge(movies_df, on='item_id', how='left')

# Prepare Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_df[['user_id', 'item_id', 'rating']], reader)
trainset = data.build_full_trainset()

# Train SVD model
algo = SVD()
algo.fit(trainset)

# Define a function to extract features
def get_features(info, algo, trainset):
    default_features = np.zeros(algo.n_factors)
    u_features = default_features.copy()
    i_features = default_features.copy()

    if trainset.knows_user(info['user_id']):
        inner_uid = trainset.to_inner_uid(info['user_id'])
        u_features = algo.pu[inner_uid]
    if trainset.knows_item(info['item_id']):
        inner_iid = trainset.to_inner_iid(info['item_id'])
        i_features = algo.qi[inner_iid]

    user_meta = [info['age'], info['is_male']]
    item_meta = [info['release_year']]
    features = np.concatenate([u_features, i_features, user_meta, item_meta])
    return features

# Generate features for the training set
full_df['features'] = full_df.apply(lambda row: get_features(row, algo, trainset), axis=1)
X = np.stack(full_df['features'].values)
y = (full_df['rating'] >= 4.5).astype(int)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)

# Generate features for the test set
test_ratings_df['features'] = test_ratings_df.apply(lambda row: get_features(row, algo, trainset), axis=1)
X_test = np.stack(test_ratings_df['features'].values)

# Predict with logistic regression
y_proba = clf.predict_proba(X_test)[:, 1]
test_ratings_df['predicted_rating'] = y_proba

# Output predictions
print(test_ratings_df[['user_id', 'item_id', 'predicted_rating']])

# Evaluate SVD model using cross-validation
svd_eval = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("SVD Model Evaluation:", svd_eval)

# Recommendation function (remains the same)
def recommend_items(user_id, item_pool, model, N=10):
    known_items = []
    items_features = []

    # Collect features of known items
    for item in item_pool:
        if trainset.knows_item(item):
            known_items.append(item)
            info = { 
                'user_id': user_id,
                'item_id': item,
                'age': users_df.loc[users_df['user_id'] == user_id, 'age'].iloc[0],
                'is_male': users_df.loc[users_df['user_id'] == user_id, 'is_male'].iloc[0],
                'release_year': movies_df.loc[movies_df['item_id'] == item, 'release_year'].iloc[0]
            }
            features = get_features(info, algo, trainset)
            items_features.append(features)
        else:
            print(f"Item {item} is not known to the training set and will be skipped.")

    # Handle cases where no known items are found
    if not items_features:
        print("Fallback to popular items.")
        # Select top N popular items
        popular_items = ratings_df['item_id'].value_counts().head(N).index.tolist()
        return popular_items, [np.nan]*N  # No probabilities available for these items

    items_features = np.array(items_features)
    probabilities = model.predict_proba(items_features)[:, 1]
    top_indices = np.argsort(probabilities)[-N:]
    recommended_items = [known_items[i] for i in top_indices if i < len(known_items)]
    filtered_probabilities = [probabilities[i] for i in top_indices if i < len(known_items)]

    return recommended_items, filtered_probabilities

# Example usage of recommendations
item_pool = movies_df['item_id'].unique()
try:
    recommended_items, scores = recommend_items(4, item_pool, clf, N=5)
    print("Recommended Items:", recommended_items)
    print("Scores:", scores)
except Exception as e:
    print(f"An error occurred while making recommendations: {str(e)}")


ValueError: Item 1572 is not part of the trainset.

In [None]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [None]:
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')
users_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movies_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')


print(ratings_df.shape)
print(users_df.shape)
print(movies_df.shape)

full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')
test_ratings_df = test_ratings_df.merge(users_df, on='user_id', how='left').merge(movies_df, on='item_id', how='left')


# Check the new shape of the merged dataframe to ensure completeness
print(full_df.shape)

(89992, 3)
(943, 4)
(1681, 4)
(89992, 9)


In [None]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_df[['user_id', 'item_id', 'rating']], reader)
trainset = data.build_full_trainset()


In [None]:
# Train SVD model
algo = SVD()
algo.fit(trainset)

# Number of latent factors
n_factors = algo.n_factors

# Dimensions of the user factors matrix
n_users = algo.trainset.n_users
user_factors_shape = (n_users, n_factors)

# Dimensions of the item factors matrix
n_items = algo.trainset.n_items
item_factors_shape = (n_items, n_factors)

print("Shape of user factors matrix:", user_factors_shape)
print("Shape of item factors matrix:", item_factors_shape)



Shape of user factors matrix: (943, 100)
Shape of item factors matrix: (1662, 100)


In [None]:
def get_features(info, algo, trainset):
    default_features = np.zeros(algo.n_factors)
    u_features = default_features.copy()
    i_features = default_features.copy()

    if trainset.knows_user(info['user_id']):
        inner_uid = trainset.to_inner_uid(info['user_id'])
        u_features = algo.pu[inner_uid]
    if trainset.knows_item(info['item_id']):
        inner_iid = trainset.to_inner_iid(info['item_id'])
        i_features = algo.qi[inner_iid]

    user_meta = [info['age'], info['is_male']]
    item_meta = [info['release_year']]
    features = np.concatenate([u_features, i_features, user_meta, item_meta])
    return features


In [None]:
# Merge datasets
full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')

# Assuming get_features is defined correctly
full_df['features'] = full_df.apply(lambda row: get_features(row, algo, trainset), axis=1)

# Check for any NaNs in features just in case
if full_df['features'].isnull().any():
    raise ValueError("NaN values found in features. Check get_features implementation.")

# Create labels for classification
full_df['label'] = (full_df['rating'] >= 4.5).astype(int)

# Prepare classification dataset
X = np.stack(full_df['features'].values)
y = full_df['label'].values

# Split data for training and testing with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Optionally, check model convergence
if not clf.n_iter_ < clf.max_iter:
    print("Logistic regression did not converge. Consider increasing max_iter or adjusting other parameters.")


In [None]:
# Predict probabilities and calculate AUC
y_proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
print(f"AUC Score: {auc}")

# Classification report and confusion matrix
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Evaluate SVD model
svd_eval = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("SVD Model Evaluation:", svd_eval)

AUC Score: 0.6874242506426829
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.88     14186
           1       0.54      0.07      0.12      3813

    accuracy                           0.79     17999
   macro avg       0.67      0.53      0.50     17999
weighted avg       0.74      0.79      0.72     17999

Confusion Matrix:
[[13959   227]
 [ 3545   268]]
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9460  0.9400  0.9419  0.9419  0.9412  0.9422  0.0020  
MAE (testset)     0.7443  0.7418  0.7448  0.7420  0.7443  0.7434  0.0013  
Fit time          0.58    0.56    0.51    0.53    0.55    0.55    0.02    
Test time         0.05    0.05    0.05    0.23    0.05    0.08    0.07    
SVD Model Evaluation: {'test_rmse': array([0.94598329, 0.94000225, 0.94188795, 0.94191631, 0.94122541]), 'test_mae': array([0.74426403, 0.7417

In [None]:
def recommend_items(user_id, item_pool, model, N=10):
    known_items = []
    items_features = []

    # Collect features of known items
    for item in item_pool:
        if trainset.knows_item(item):
            known_items.append(item)
            info = { 
                'user_id': user_id,
                'item_id': item,
                'age': users_df.loc[users_df['user_id'] == user_id, 'age'].iloc[0],
                'is_male': users_df.loc[users_df['user_id'] == user_id, 'is_male'].iloc[0],
                'release_year': movies_df.loc[movies_df['item_id'] == item, 'release_year'].iloc[0]
            }
            features = get_features(info, algo, trainset)
            items_features.append(features)
        else:
            print(f"Item {item} is not known to the training set and will be skipped.")

    # Handle cases where no known items are found
    if not items_features:
        print("Fallback to popular items.")
        # Select top N popular items
        popular_items = ratings_df['item_id'].value_counts().head(N).index.tolist()
        return popular_items, [np.nan]*N  # No probabilities available for these items

    items_features = np.array(items_features)
    probabilities = model.predict_proba(items_features)[:, 1]
    top_indices = np.argsort(probabilities)[-N:]

    recommended_items = [known_items[i] for i in top_indices if i < len(known_items)]
    filtered_probabilities = [probabilities[i] for i in top_indices if i < len(known_items)]

    return recommended_items, filtered_probabilities


In [None]:
# Example usage of recommendations
item_pool = movies_df['item_id'].unique()
try:
    recommended_items, scores = recommend_items(4, item_pool, clf, N=5)
    print("Recommended Items:", recommended_items)
    print("Scores:", scores)
except Exception as e:
    print(f"An error occurred while making recommendations: {str(e)}")

An error occurred while making recommendations: Item 709 is not part of the trainset.


In [None]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD

# Load datasets
ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_all_development_set.csv')
users_df = pd.read_csv('../data_movie_lens_100k/user_info.csv')
movies_df = pd.read_csv('../data_movie_lens_100k/movie_info.csv')
test_ratings_df = pd.read_csv('../data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Define a function to extract features
def get_features(info, algo, trainset):
    default_features = np.zeros(algo.n_factors)
    u_features = default_features.copy()
    i_features = default_features.copy()

    if trainset.knows_user(info['user_id']):
        inner_uid = trainset.to_inner_uid(info['user_id'])
        u_features = algo.pu[inner_uid]
    if trainset.knows_item(info['item_id']):
        inner_iid = trainset.to_inner_iid(info['item_id'])
        i_features = algo.qi[inner_iid]

    user_meta = [info['age'], info['is_male']]
    item_meta = [info['release_year']]
    features = np.concatenate([u_features, i_features, user_meta, item_meta])
    return features

# Merge datasets for features generation
full_df = ratings_df.merge(users_df, on='user_id').merge(movies_df, on='item_id')

# Prepare Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_df[['user_id', 'item_id', 'rating']], reader)
trainset = data.build_full_trainset()

# Train SVD model
algo = SVD()
algo.fit(trainset)

# Generate features for the test set
test_ratings_df['features'] = test_ratings_df.apply(lambda row: get_features(row, algo, trainset), axis=1)
X_test = np.stack(test_ratings_df['features'].values)

# Predict with logistic regression
y_proba = clf.predict_proba(X_test)[:, 1]
test_ratings_df['predicted_rating'] = y_proba

# Save the predictions to a text file
output_file_path = '/mnt/data/ratings_leaderboard_predictions.txt'
test_ratings_df[['user_id', 'item_id', 'predicted_rating']].to_csv(output_file_path, index=False, header=True, sep='\t')

output_file_path


KeyError: 'age'