In [11]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505175 sha256=18b1c9d38a02d54e2b8e33379511f49653e94576a841487baeb2accf61c46da3
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [1]:
import numpy as np
import pandas as pd
import re
import ast
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
import pickle

# Step 1: Load and Clean Data

def clean_user_data(file_path):
    """
    Loads and cleans user data, ensuring only valid rows are kept.
    """
    valid_rows = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(',')
            if len(parts) == 4:
                valid_rows.append(parts)

    return pd.DataFrame(valid_rows, columns=['user_id', 'age', 'occupation', 'gender'])

def clean_ratings_data(file_path):
    """
    Loads and cleans ratings data.
    """
    df = pd.read_csv(file_path, names=['timestamp', 'user_id', 'movie_id', 'rating'])[['user_id', 'movie_id', 'rating']]
    df['user_id'] = df['user_id'].astype(str)
    df['movie_id'] = df['movie_id'].str.replace('+', ' ')  # Fix formatting
    return df

def clean_movie_data(file_path):
    """
    Loads and cleans movie data, extracting relevant fields.
    """
    rows = []
    pattern = re.compile(r'^([^,]+),([^,]+),(\[.*?\]),([^,]+),([^,]+)$')

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            match = pattern.match(line)
            if match:
                movie_id = match.group(1).replace("+", " ")
                flag = match.group(2)
                genres = match.group(3)
                release_date = match.group(4)
                language = match.group(5)

                try:
                    genre_list = ast.literal_eval(genres)
                    genres = ', '.join([genre['name'] for genre in genre_list]) if isinstance(genre_list, list) else genres
                except (ValueError, SyntaxError):
                    pass

                rows.append([movie_id, flag, genres, release_date, language])

    df = pd.DataFrame(rows, columns=['movie_id', 'flag', 'genres', 'release_date', 'language'])
    df['movie_id'] = df['movie_id'].astype(str)
    return df

def clean_watch_data(file_path):
    """
    Loads and cleans watch history data, extracting watch time.
    """
    df = pd.read_csv(file_path, names=['timestamp', 'user_id', 'movie_id', 'watch_time'])[['user_id', 'movie_id', 'watch_time']]
    df['user_id'] = df['user_id'].astype(str)
    df['movie_id'] = df['movie_id'].str.replace('+', ' ')  # Fix formatting
    df.dropna(subset=['watch_time'], inplace=True)
    df['watch_time'] = pd.to_numeric(df['watch_time'], errors='coerce')
    df['watch_time_normalized'] = (df['watch_time'] / df['watch_time'].max()) * 5  # Normalize to 1-5 scale
    df['rating'] = df['watch_time_normalized'].fillna(2.5)  # Fill missing values with neutral rating
    return df[['user_id', 'movie_id', 'rating']]

# Load and clean data
movie_data_path = "movie_data.csv"
user_data_path = "user_data.csv"
ratings_data_path = "user_rate_short_data.csv"
watch_data_path = "user_watch_short_data.csv"

movies_df = clean_movie_data(movie_data_path)
users_df = clean_user_data(user_data_path)
ratings_df = clean_ratings_data(ratings_data_path)
watch_df = clean_watch_data(watch_data_path)

# Merge explicit ratings with normalized watch time data
combined_ratings_df = pd.concat([ratings_df, watch_df])






In [2]:
movies_df[100:110]

Unnamed: 0,movie_id,flag,genres,release_date,language
100,what a girl wants 2003,False,Comedy,2003-03-27,en
101,b.a.p.s. 1997,False,"Action, Comedy",1997-03-28,en
102,nighthawks 1981,False,"Action, Crime, Thriller",1981-03-17,en
103,friday after next 2002,False,Comedy,2002-11-22,en
104,meet the robinsons 2007,False,"Animation, Comedy, Family",2007-03-23,en
105,the bridge 2006,False,Documentary,2006-10-27,en
106,the rage carrie 2 1999,False,"Horror, Thriller, Science Fiction",1999-03-11,en
107,8 women 2002,False,"Comedy, Thriller, Music, Crime, Mystery",2002-01-08,fr
108,impostor 2001,False,"Action, Science Fiction, Thriller",2001-12-03,en
109,band of outsiders 1964,False,"Crime, Drama, Romance",1964-08-05,fr


In [3]:
users_df.head()

Unnamed: 0,user_id,age,occupation,gender
0,89163,29,college/grad student,M
1,81280,34,homemaker,M
2,105683,33,college/grad student,M
3,89163,29,college/grad student,M
4,81280,34,homemaker,M


In [4]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,13539,the lion king 2 simbas pride 1998,4
1,35499,howls moving castle 2004,5
2,58700,so close 2002,4
3,87247,much ado about nothing 1993,4
4,62758,the arrival of a train at la ciotat 1896,4


In [5]:
watch_df.head()

Unnamed: 0,user_id,movie_id,rating
0,54831,konga 1961,2.5
1,31542,avatar 2009,2.5
2,45759,up 2009,2.5
3,23048,love is all you need 2012,2.5
4,95221,about a boy 2002,2.5


In [6]:
# Step 2: Train SVD Model
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(combined_ratings_df[['user_id', 'movie_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

svd_model = SVD()
svd_model.fit(trainset)

# Step 3: Evaluate Model
eval_results = cross_validate(svd_model, data, cv=5, verbose=True)

print("Evaluation Metrics:")
for key, values in eval_results.items():
    print(f"{key}: {np.mean(values):.4f}")

# Save the trained model
with open("svd_model.pkl", "wb") as f:
    pickle.dump(svd_model, f)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6082  0.6171  0.6166  0.6164  0.6135  0.6144  0.0033  
MAE (testset)     0.3316  0.3395  0.3401  0.3367  0.3349  0.3365  0.0031  
Fit time          0.66    0.65    0.63    0.61    0.60    0.63    0.02    
Test time         0.06    0.06    0.05    0.19    0.05    0.08    0.06    
Evaluation Metrics:
test_rmse: 0.6144
test_mae: 0.3365
fit_time: 0.6290
test_time: 0.0805


In [7]:
# Step 4: Generate Recommendations
def get_recommendations(user_id=None, num_recommendations=10):
    """
    Generate movie recommendations for a given user.
    If no user_id is provided, a random user from the dataset is selected.
    """
    if user_id is None or str(user_id) not in combined_ratings_df['user_id'].unique():
        user_id = np.random.choice(combined_ratings_df['user_id'].unique())
        print(f"User ID not found, using random user: {user_id}")

    # Get movies user has already rated
    rated_movies = combined_ratings_df[combined_ratings_df['user_id'] == str(user_id)]['movie_id'].unique()

    # Get movies not rated by user
    movie_ids = movies_df['movie_id'].unique()
    movies_to_predict = [m for m in movie_ids if m not in rated_movies]

    if len(movies_to_predict) == 0:
        print(f"No new movies to recommend for User {user_id}.")
        return movies_df.sample(num_recommendations)[['movie_id', 'genres']]  # Fallback to popular movies

    # Predict ratings for unseen movies
    predictions = [svd_model.predict(str(user_id), str(movie_id)) for movie_id in movies_to_predict]
    predictions.sort(key=lambda x: x.est, reverse=True)

    # Get top recommended movies
    top_movies = [str(pred.iid) for pred in predictions[:num_recommendations]]
    recommended_movies = movies_df[movies_df['movie_id'].isin(top_movies)][['movie_id', 'genres']]

    if recommended_movies.empty:
        print(f"No recommendations found for User {user_id}. Check data alignment.")
    else:
        print(f"Top {num_recommendations} recommendations for User {user_id}:")
        print(recommended_movies)

    return recommended_movies

# Example usage
get_recommendations(user_id=None, num_recommendations=10)

User ID not found, using random user: 102118
Top 10 recommendations for User 102118:
                              movie_id                  genres
43          kissing jessica stein 2001                  Comedy
65                   mediterraneo 1991    Comedy, Romance, War
99           vanya on 42nd street 1994          Drama, Romance
105                    the bridge 2006             Documentary
202  sunrise a song of two humans 1927          Drama, Romance
285               dead mans shoes 2004  Drama, Thriller, Crime
286             vampire in venice 1988                  Horror
299                   duck season 2004  Drama, Comedy, Foreign
385               the chaos class 1975           Comedy, Drama
405                 people i know 2002         Drama, Thriller


Unnamed: 0,movie_id,genres
43,kissing jessica stein 2001,Comedy
65,mediterraneo 1991,"Comedy, Romance, War"
99,vanya on 42nd street 1994,"Drama, Romance"
105,the bridge 2006,Documentary
202,sunrise a song of two humans 1927,"Drama, Romance"
285,dead mans shoes 2004,"Drama, Thriller, Crime"
286,vampire in venice 1988,Horror
299,duck season 2004,"Drama, Comedy, Foreign"
385,the chaos class 1975,"Comedy, Drama"
405,people i know 2002,"Drama, Thriller"
