In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Dataset, Reader, KNNBasic, SVD
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy

# --- 1. Data Loading and Exploration ---
# The file names and separators match the MovieLens 100K layout (assumed, not
# verified here): u.data is tab-separated ratings, u.item and u.user are
# pipe-separated.
data = pd.read_csv('u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# Only the first two columns of u.item / u.user are used below, so parse just
# those and name them directly instead of loading every column and slicing.
item_info = pd.read_csv(
    'u.item', sep='|', encoding='latin-1', header=None,
    usecols=[0, 1], names=['item_id', 'movie_title'],
)
user_info = pd.read_csv(
    'u.user', sep='|', header=None,
    usecols=[0, 1], names=['user_id', 'age'],
)

# Attach the movie title and the user's age to every rating row (inner joins).
data = pd.merge(data, item_info, on='item_id')
data = pd.merge(data, user_info, on='user_id')

print(data.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
data.info()

# Bar chart: how many ratings were given for each rating value.
_, rating_ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='rating', data=data, ax=rating_ax)
rating_ax.set_title('Distribution of Ratings')
plt.show()

# Histogram: how many movie titles received a given number of ratings.
ratings_per_movie = data.groupby('movie_title')['rating'].count()
_, popularity_ax = plt.subplots(figsize=(8, 6))
ratings_per_movie.hist(bins=50, ax=popularity_ax)
popularity_ax.set_title('Number of Ratings per Movie')
popularity_ax.set_xlabel('Number of Ratings')
popularity_ax.set_ylabel('Number of Movies')
plt.show()

# --- 2. Data Preprocessing ---
# Surprise is given only the (user, item, rating) columns, in that order.
rating_columns = ['user_id', 'item_id', 'rating']
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data[rating_columns], reader)

# Hold out 25% of the ratings for evaluation; the seed keeps the split fixed.
trainset, testset = train_test_split(
    dataset,
    test_size=0.25,
    random_state=42,
)

# --- 3. Model Building ---
# KNN: user-based neighbourhood model using cosine similarity.
sim_options = {
    'name': 'cosine',
    'user_based': True,
}
knn_model = KNNBasic(sim_options=sim_options)
knn_model.fit(trainset)

# SVD: matrix-factorisation model with library-default hyperparameters.
# The train/test split is seeded with 42; seed SVD's random initialisation the
# same way so the reported SVD metrics are reproducible between runs.
svd_model = SVD(random_state=42)
svd_model.fit(trainset)

# --- 4. Model Evaluation ---
# Score both models on the held-out ratings.
knn_predictions = knn_model.test(testset)
svd_predictions = svd_model.test(testset)

# accuracy.rmse / accuracy.mae print the metric themselves by default; pass
# verbose=False so each value is reported once, with its model label.
print("KNN RMSE:", accuracy.rmse(knn_predictions, verbose=False))
print("KNN MAE:", accuracy.mae(knn_predictions, verbose=False))

print("SVD RMSE:", accuracy.rmse(svd_predictions, verbose=False))
print("SVD MAE:", accuracy.mae(svd_predictions, verbose=False))

# --- 5. Hyperparameter Tuning ---
def _run_grid_search(algo_class, param_grid):
    """Run a 3-fold grid search for ``algo_class`` over ``param_grid``.

    The search is fitted on the module-level ``dataset`` and tracks both RMSE
    and MAE. Returns the fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(algo_class, param_grid, measures=['rmse', 'mae'], cv=3)
    search.fit(dataset)
    return search


# KNN: neighbourhood size, similarity measure, and user- vs item-based.
param_grid_knn = {
    'k': [20, 40, 60],
    'sim_options': {
        'name': ['msd', 'cosine', 'pearson'],
        'user_based': [True, False],
    },
}
gs_knn = _run_grid_search(KNNBasic, param_grid_knn)
print("Best KNN RMSE:", gs_knn.best_score['rmse'])
print("Best KNN parameters:", gs_knn.best_params['rmse'])

# SVD: factor count, epochs, learning rate, and regularisation strength.
param_grid_svd = {
    'n_factors': [50, 100, 150],
    'n_epochs': [10, 20, 30],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.04, 0.06],
}
gs_svd = _run_grid_search(SVD, param_grid_svd)
print("Best SVD RMSE:", gs_svd.best_score['rmse'])
print("Best SVD parameters:", gs_svd.best_params['rmse'])

# --- 6. Making Recommendations ---
# Refit the best SVD configuration on every available rating before predicting.
best_svd_model = gs_svd.best_estimator['rmse']
trainset = dataset.build_full_trainset()
best_svd_model.fit(trainset)

user_id = '1'
# The ratings were loaded from an integer user_id column, so the raw ids the
# model knows are ints. Passing the string '1' to predict() would not match
# any known user, and the estimates would silently stop being personalised.
raw_user_id = int(user_id)

user_movies = data[data['user_id'] == raw_user_id]['item_id'].tolist()
if not user_movies:
    print(f"Warning: user {user_id} has no ratings; recommendations will not be personalised.")

# Set membership keeps the unseen-movie filter linear in the number of movies.
seen_movies = set(user_movies)
all_movies = data['item_id'].unique()
unseen_movies = [movie for movie in all_movies if movie not in seen_movies]

# Estimate a rating for every movie the user has not rated; highest first.
predictions = [best_svd_model.predict(raw_user_id, movie) for movie in unseen_movies]
predictions.sort(key=lambda x: x.est, reverse=True)

top_10_recommendations = [pred.iid for pred in predictions[:10]]

# Build the id -> title lookup once (first title wins if an id repeats)
# instead of filtering item_info once per recommended movie.
title_by_item = item_info.drop_duplicates('item_id').set_index('item_id')['movie_title']
top_10_movie_titles = title_by_item.loc[top_10_recommendations].tolist()

print(f"Top 10 movie recommendations for user {user_id}:")
for movie_title in top_10_movie_titles:
    print(movie_title)