In [18]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

The first table is user ratings. The columns are user_id, item_id, rating, and timestamp. <br>
Ratings range from 1-5, and the timestamp is probably in that 'seconds since the epoch' format. <br>
Timestamp could be interesting if we're looking at how a users preferences are trending. 

In [19]:
user_ratings = pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.data', 
                           sep = '\t', 
                           header=None,
                           names = ['user_id', 'item_id', 'rating', 'timestamp'])
user_ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


Next is the movie table. The meat here is the one-hot-encoded genre tags and release dates. <br>
We need a different encoding to handle some of the movie names, and need to specify the video release date type or else the csv parser will throw warnings. <br>

In [20]:
movie_metadata = pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.item',   
                            sep='|',
                            header=None,
                            names = ['item_id', 'item_name', 'release_date', 'video_release_date',
                                     'imdb_link','unknown', 'action', 'adventure', 'animation', 'children', 
                                     'comedy', 'crime', 'documentary', 'drama', 'fantasy', 
                                     'film_noir', 'horror', 'musical', 'mystery', 'romance', 
                                     'sci_fi', 'thriller', 'war', 'western'],
                            dtype={'video_release_date':'str'},
                            encoding='latin-1')
movie_metadata

Unnamed: 0,item_id,item_name,release_date,video_release_date,imdb_link,unknown,action,adventure,animation,children,...,fantasy,film_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


User table. We have the occupation and the zip code here, which is pretty cool - this will allow us to find connections between class and location, and the types of movies people like. <br>
Location data also gives us the opportunity to create cloropleth graphs.

In [21]:
user_info = pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.user', 
                        sep='|',
                        header=None,
                        names = ['user_id', 'age', 'gender', 'occupation', 'zip code'])
user_info

Unnamed: 0,user_id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [None]:
# 1. Load data into the pipeline

df = user_ratings

# 2. Split data into training and test sets in order to avoid data leakage
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# 3. Train SVD only on training data
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
trainset = train_data.build_full_trainset()

svd = SVD(n_factors=50, n_epochs=20, random_state=42)
svd.fit(trainset)

# 4. Extract SVD features
def get_svd_features(user_id, item_id, svd_model, trainset):
    """Extract SVD latent factors as features"""
    try:
        user_inner = trainset.to_inner_uid(user_id)
        item_inner = trainset.to_inner_iid(item_id)
        
        user_factors = svd_model.pu[user_inner]
        item_factors = svd_model.qi[item_inner]
        
        # Combine features
        features = np.concatenate([
            user_factors,
            item_factors,
            user_factors * item_factors,  # Element-wise interaction
            [svd_model.predict(user_id, item_id).est]  # SVD prediction
        ])
        return features
    except:
        return None

# 5. Create features for training set
X_train = []
y_train = []

for _, row in train_df.iterrows():
    features = get_svd_features(row['user_id'], row['item_id'], svd, trainset)
    if features is not None:
        X_train.append(features)
        y_train.append(row['rating'])

X_train = np.array(X_train)
y_train = np.array(y_train)

# 6. Create features for test set
X_test = []
y_test = []

for _, row in test_df.iterrows():
    features = get_svd_features(row['user_id'], row['item_id'], svd, trainset)
    if features is not None:
        X_test.append(features)
        y_test.append(row['rating'])

X_test = np.array(X_test)
y_test = np.array(y_test)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# 7. Train XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective='reg:squarederror'
)

xgb_model.fit(X_train, y_train)

# 8. Evaluate XGBoost
y_pred_xgb = xgb_model.predict(X_test)
y_pred_xgb = np.clip(y_pred_xgb, 1, 5)

rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

print(f"\nSVD + XGBoost Performance:")
print(f"RMSE: {rmse_xgb:.4f}")
print(f"MAE: {mae_xgb:.4f}")

# 9. Evaluate pure SVD on same test set
y_pred_svd = [svd.predict(test_df.iloc[i]['user_id'], 
                          test_df.iloc[i]['item_id']).est 
              for i in range(len(X_test))]
y_pred_svd = np.array(y_pred_svd)

rmse_svd = np.sqrt(mean_squared_error(y_test, y_pred_svd))
mae_svd = mean_absolute_error(y_test, y_pred_svd)

print(f"\nPure SVD Performance:")
print(f"RMSE: {rmse_svd:.4f}")
print(f"MAE: {mae_svd:.4f}")

print(f"\nImprovement:")
print(f"RMSE reduction: {(rmse_svd - rmse_xgb):.4f} ({((rmse_svd - rmse_xgb) / rmse_svd * 100):.2f}%)")
print(f"MAE reduction: {(mae_svd - mae_xgb):.4f} ({((mae_svd - mae_xgb) / mae_svd * 100):.2f}%)")

# 10. Feature importance
feature_names = (
    [f'user_factor_{i}' for i in range(50)] +
    [f'item_factor_{i}' for i in range(50)] +
    [f'interaction_{i}' for i in range(50)] +
    ['svd_prediction']
)

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 most important features:")
print(importance_df.head(10))

# 11. Prediction examples
print("\nSample predictions (first 5 test cases):")
print("User\tItem\tActual\tSVD\tXGB\tError(SVD)\tError(XGB)")
for i in range(min(5, len(X_test))):
    user = test_df.iloc[i]['user_id']
    item = test_df.iloc[i]['item_id']
    actual = y_test[i]
    svd_pred = y_pred_svd[i]
    xgb_pred = y_pred_xgb[i]
    print(f"{user}\t{item}\t{actual}\t{svd_pred:.2f}\t{xgb_pred:.2f}\t{abs(actual-svd_pred):.2f}\t\t{abs(actual-xgb_pred):.2f}")


Training samples: 80000
Test samples: 19969

SVD + XGBoost Performance:
RMSE: 0.9775
MAE: 0.7650

Pure SVD Performance:
RMSE: 1.2832
MAE: 1.0269

Improvement:
RMSE reduction: 0.3057 (23.82%)
MAE reduction: 0.2619 (25.50%)

Top 10 most important features:
            feature  importance
150  svd_prediction    0.380465
134  interaction_34    0.009098
129  interaction_29    0.008898
100   interaction_0    0.008751
130  interaction_30    0.008719
118  interaction_18    0.008718
122  interaction_22    0.008544
142  interaction_42    0.008500
102   interaction_2    0.008420
148  interaction_48    0.008363

Sample predictions (first 5 test cases):
User	Item	Actual	SVD	XGB	Error(SVD)	Error(XGB)
877	381	4	3.73	4.10	0.27		0.10
815	602	3	3.44	3.07	0.44		0.07
94	431	4	3.52	3.19	0.48		0.81
416	875	2	2.93	2.29	0.93		0.29
500	182	2	4.23	4.86	2.23		2.86
