In [1]:
import pandas as pd

# URL for the MovieLens 100K dataset
url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"

# Define column names for the dataset
columns = ["user_id", "item_id", "rating", "timestamp"]

# Read the dataset directly from the URL into a Pandas DataFrame
df = pd.read_csv(url, sep="\t", names=columns)
df = df.drop('timestamp', axis = 1)
df.shape

(100000, 3)

In [2]:
movies_url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.item"
movies_columns = ["movie_id", "title", "release_date", "video_release_date",
                  "imdb_url", "unknown", "Action", "Adventure", "Animation", 
                  "Children's", "Comedy", "Crime", "Documentary", "Drama", 
                  "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", 
                  "Romance", "Sci-Fi", "Thriller", "War", "Western"]

movies_df = pd.read_csv(movies_url, sep="|", names=movies_columns, encoding="latin-1")
movies_df = movies_df.drop(columns = ["release_date", "video_release_date",
                  "imdb_url"], axis = 1)
print(movies_df.shape)

(1682, 21)


In [3]:
merged_df = pd.merge(df, movies_df, left_on="item_id", right_on="movie_id")
merged_df.shape

(100000, 24)

In [4]:
user1_df = merged_df[merged_df["user_id"] == 1]
user1_df.shape

(272, 24)

In [5]:
genres = user1_df.columns[5:]
X = user1_df[genres]
y = user1_df["rating"]
print(X.shape)
print(y.shape)

(272, 19)
(272,)


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 42)

In [7]:
# No need to transform this data its already One-hot encoded for us :)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

model = RandomForestRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

score = mean_squared_error(y_test, y_pred)
score = np.sqrt

In [8]:
y_test = np.array(y_test).reshape(1,-1)
y_test

array([[3, 4, 4, 3, 3, 4, 4, 5, 3, 1, 1, 1, 3, 5]])

In [9]:
y_pred

array([3.81309477, 3.88407476, 4.42843651, 3.29885386, 3.81309477,
       1.69390476, 4.42843651, 1.69390476, 4.03106559, 3.66809337,
       2.327     , 1.19      , 3.65788185, 4.94763226])

In [10]:

y_test = y_test.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
results = pd.DataFrame(np.hstack((y_pred,y_test)), columns = ["Pred", "Actual"])
results["Error"] = np.abs(results["Pred"] - results["Actual"])

In [11]:
results

Unnamed: 0,Pred,Actual,Error
0,3.813095,3.0,0.813095
1,3.884075,4.0,0.115925
2,4.428437,4.0,0.428437
3,3.298854,3.0,0.298854
4,3.813095,3.0,0.813095
5,1.693905,4.0,2.306095
6,4.428437,4.0,0.428437
7,1.693905,5.0,3.306095
8,4.031066,3.0,1.031066
9,3.668093,1.0,2.668093


# START HERE

# Recommend a Movie User Hasn't Seen:

In [12]:
merged_df.shape

(100000, 24)

In [13]:
merged_df.head()

Unnamed: 0,user_id,item_id,rating,movie_id,title,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# find movies they've seen
seen_movies = merged_df[merged_df['user_id'] == 1]['item_id']

# remove them
unseen_df = merged_df[~merged_df['item_id'].isin(seen_movies)]

In [15]:
unseen_df.shape

(58166, 24)

In [16]:
# pull first result just to have a list of movies
unseen_df = unseen_df.groupby("item_id").first().reset_index()
unseen_df.shape

(1410, 24)

In [17]:
# extract the features
unseen_genres = unseen_df.columns[5:]
unseen_X = unseen_df[genres]
print(unseen_X.shape)

(1410, 19)


In [18]:
# predict based on these features, find the max
ratings = model.predict(unseen_X)
ratings.argmax()

304

In [19]:
ratings[304]

4.94763225530092

In [20]:
unseen_df.iloc[304,:]
# ROTTEN TOMATOES THIS

item_id                     577
user_id                     299
rating                        3
movie_id                    577
title          Coneheads (1993)
unknown                       0
Action                        0
Adventure                     0
Animation                     0
Children's                    0
Comedy                        1
Crime                         0
Documentary                   0
Drama                         0
Fantasy                       0
Film-Noir                     0
Horror                        0
Musical                       0
Mystery                       0
Romance                       0
Sci-Fi                        1
Thriller                      0
War                           0
Western                       0
Name: 304, dtype: object

In [21]:
ratings.sort()
ratings 

array([1.115     , 1.115     , 1.115     , ..., 4.94763226, 4.94763226,
       4.94763226])

# Make this better by accounting for quality?

- Use general consensus on a movie as a stand in for quality
- Simpler version of collaborative filtering - using other people's ratings to guide our recommendation
- Collaborative does a better job -- consensus on a movie by SIMILAR users 

In [22]:
# extract average ratings
avg_ratings = merged_df.groupby("item_id").mean().reset_index()
avg_ratings = avg_ratings[["item_id", "rating"]]
avg_ratings.rename(columns = {"rating":"avg_rating"}, inplace = True)

  avg_ratings = merged_df.groupby("item_id").mean().reset_index()


In [23]:
avg_ratings

Unnamed: 0,item_id,avg_rating
0,1,3.878319
1,2,3.206107
2,3,3.033333
3,4,3.550239
4,5,3.302326
...,...,...
1677,1678,1.000000
1678,1679,3.000000
1679,1680,2.000000
1680,1681,3.000000


In [24]:
# merge it with our original dataset for our user
user1_df_ratings = user1_df.merge(avg_ratings, on = "item_id")

In [25]:
user1_df_ratings.shape

(272, 25)

In [26]:
user1_df.shape

(272, 24)

In [42]:
# Need to retrain our model to include this info
# same as code from above
features = user1_df_ratings.columns[5:]
X = user1_df_ratings[features]
y = user1_df_ratings["rating"]
print(X.shape)
print(y.shape)

(272, 20)
(272,)


In [28]:
# copt the code from earlier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 42)

In [29]:
# copy the code from earlier

model = RandomForestRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

score = mean_squared_error(y_test, y_pred)
score = np.sqrt(score)
score

0.7575571642739953

In [30]:
# copy from earlier
y_test = np.array(y_test).reshape(1,-1)
y_test

array([[3, 4, 4, 3, 3, 4, 4, 5, 3, 1, 1, 1, 3, 5]])

In [31]:
# copy from earlier
y_test = y_test.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
results = pd.DataFrame(np.hstack((y_pred,y_test)), columns = ["Pred", "Actual"])
results["Error"] = np.abs(results["Pred"] - results["Actual"])

In [32]:
results
# alot better

Unnamed: 0,Pred,Actual,Error
0,3.07,3.0,0.07
1,4.4,4.0,0.4
2,2.85,4.0,1.15
3,4.39,3.0,1.39
4,3.41,3.0,0.41
5,4.62,4.0,0.62
6,2.51,4.0,1.49
7,4.13,5.0,0.87
8,3.31,3.0,0.31
9,1.64,1.0,0.64


# Generate some better recommendations

In [33]:
unseen_df

Unnamed: 0,item_id,user_id,rating,movie_id,title,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,273,213,5,273,Heat (1995),0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,274,194,2,274,Sabrina (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,275,145,2,275,Sense and Sensibility (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,276,178,3,276,Leaving Las Vegas (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,277,63,4,277,Restoration (1995),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1405,1678,863,1,1678,Mat' i syn (1997),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1406,1679,863,3,1679,B. Monkey (1998),0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1407,1680,863,2,1680,Sliding Doors (1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1408,1681,896,3,1681,You So Crazy (1994),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# merge average ratings and extract features
unseen_df = unseen_df.merge(avg_ratings, on = "item_id")
unseen_features = unseen_df.columns[5:]
unseen_X = unseen_df[unseen_features]
print(unseen_X.shape)

(1410, 20)


In [35]:
# predict 
ratings = model.predict(unseen_X)

In [36]:
ratings

array([3.45, 4.54, 4.41, ..., 2.  , 2.94, 2.3 ])

In [37]:
#Pull top 5

top5_indices = ratings.argsort()[-5:][::-1]
top5_indices

array([ 541, 1321,  928,  254,  359])

In [38]:
print(ratings[1321])
unseen_df.iloc[1321]

5.0


item_id                  1594
user_id                   532
rating                      4
movie_id                 1594
title          Everest (1998)
unknown                     0
Action                      0
Adventure                   0
Animation                   0
Children's                  0
Comedy                      0
Crime                       0
Documentary                 1
Drama                       0
Fantasy                     0
Film-Noir                   0
Horror                      0
Musical                     0
Mystery                     0
Romance                     0
Sci-Fi                      0
Thriller                    0
War                         0
Western                     0
avg_rating                4.5
Name: 1321, dtype: object

In [39]:
print(ratings[928])
unseen_df.iloc[928]

5.0


item_id                                              1201
user_id                                                90
rating                                                  5
movie_id                                             1201
title          Marlene Dietrich: Shadow and Light (1996) 
unknown                                                 0
Action                                                  0
Adventure                                               0
Animation                                               0
Children's                                              0
Comedy                                                  0
Crime                                                   0
Documentary                                             1
Drama                                                   0
Fantasy                                                 0
Film-Noir                                               0
Horror                                                  0
Musical       

In [40]:
print(ratings[541])
unseen_df.iloc[541]

5.0


item_id                                  814
user_id                                   13
rating                                     5
movie_id                                 814
title          Great Day in Harlem, A (1994)
unknown                                    0
Action                                     0
Adventure                                  0
Animation                                  0
Children's                                 0
Comedy                                     0
Crime                                      0
Documentary                                1
Drama                                      0
Fantasy                                    0
Film-Noir                                  0
Horror                                     0
Musical                                    0
Mystery                                    0
Romance                                    0
Sci-Fi                                     0
Thriller                                   0
War       

In [41]:
print(ratings[849])
unseen_df.iloc[849]

4.95


item_id                                  1122
user_id                                    60
rating                                      5
movie_id                                 1122
title          They Made Me a Criminal (1939)
unknown                                     0
Action                                      0
Adventure                                   0
Animation                                   0
Children's                                  0
Comedy                                      0
Crime                                       1
Documentary                                 0
Drama                                       1
Fantasy                                     0
Film-Noir                                   0
Horror                                      0
Musical                                     0
Mystery                                     0
Romance                                     0
Sci-Fi                                      0
Thriller                          