# Ex1: Most Similar Item - Distance Based

In [1]:
import pandas as pd

data_url = 'https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv'
cars = pd.read_csv(data_url)
cars.columns = ['car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
print(cars.shape)
cars.head()

(32, 12)


Unnamed: 0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## Step 1: define the features

In [2]:
from sklearn.preprocessing import StandardScaler

# Extract only some subset of columns to reduce computation time 
X = cars[['mpg', 'disp', 'hp', 'wt']].values

# Standardize the features so that no feature dominates the
# distance computations due to unit scale
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

## Step 2: define a distance metric

In [3]:
from sklearn.metrics.pairwise import euclidean_distances

## Step 3: Recommend items

- If we're looking at a certain item, lets recommend the most similar car!

In [4]:
# Car the user is looking at
looking_at_car = [15, 300, 160, 3.2] 
looking_at_car = scaler.transform([looking_at_car])

# Distance from all other cars
distances = euclidean_distances(X, looking_at_car)
distances = distances.reshape(-1)   # Before it was (n_cars, 1)

# Find the 3 indices with the minimum distance (highest similarity) to the car we're looking at
ordered_indices = distances.argsort() # This will help with the assignment!
closest_indices = ordered_indices[:3]

# Get the cars for these indices
closest_cars = cars.iloc[closest_indices]
closest_cars

Unnamed: 0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
22,AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,0,0,3,2
21,Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,0,0,3,2
13,Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,0,0,3,3


# Ex2 Ratings Based - Supervized

In a system where a user has rated some items, can we recommend the items they haven't rated yet that we think they will rate highly?

In [5]:
# URL for the MovieLens 100K dataset
url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"

# Define column names for the dataset
columns = ["user_id", "item_id", "rating", "timestamp"]

# Read the dataset directly from the URL into a Pandas DataFrame
df = pd.read_csv(url, sep="\t", names=columns)
df = df.drop('timestamp', axis = 1)
df.shape

(100000, 3)

In [6]:
df.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [7]:
movies_url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.item"
movies_columns = ["movie_id", "title", "release_date", "video_release_date",
                  "imdb_url", "unknown", "Action", "Adventure", "Animation", 
                  "Children's", "Comedy", "Crime", "Documentary", "Drama", 
                  "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", 
                  "Romance", "Sci-Fi", "Thriller", "War", "Western"]

movies_df = pd.read_csv(movies_url, sep="|", names=movies_columns, encoding="latin-1")
movies_df = movies_df.drop(columns = ["release_date", "video_release_date",
                  "imdb_url"], axis = 1)
print(movies_df.shape)
movies_df.head()

(1682, 21)


Unnamed: 0,movie_id,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
merged_df = pd.merge(df, movies_df, left_on="item_id", right_on="movie_id")
merged_df.shape

(100000, 24)

In [9]:
user1_df = merged_df[merged_df["user_id"] == 1]
user1_df.shape

(272, 24)

In [10]:
genres = user1_df.columns[5:]
X = user1_df[genres]
y = user1_df["rating"]
print(X.shape)
print(y.shape)

(272, 19)
(272,)


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 42)

In [13]:
# No need to transform this data its already One-hot encoded for us :)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

model = RandomForestRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

score = mean_squared_error(y_test, y_pred)
score = np.sqrt(score)
score

1.4388106173498028

In [14]:
y_test = np.array(y_test).reshape(1,-1)
y_test

array([[3, 4, 4, 3, 3, 4, 4, 5, 3, 1, 1, 1, 3, 5]])

In [15]:
y_pred

array([3.99644776, 3.96189018, 4.35736111, 3.45650742, 3.99644776,
       1.64783333, 4.35736111, 1.64783333, 4.06185462, 3.66386519,
       1.99543333, 1.31      , 3.66890096, 4.94046911])

In [16]:
y_test = y_test.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)
results = pd.DataFrame(np.hstack((y_pred,y_test)), columns = ["Pred", "Actual"])
results["Error"] = np.abs(results["Pred"] - results["Actual"])

In [17]:
results

Unnamed: 0,Pred,Actual,Error
0,3.996448,3.0,0.996448
1,3.96189,4.0,0.03811
2,4.357361,4.0,0.357361
3,3.456507,3.0,0.456507
4,3.996448,3.0,0.996448
5,1.647833,4.0,2.352167
6,4.357361,4.0,0.357361
7,1.647833,5.0,3.352167
8,4.061855,3.0,1.061855
9,3.663865,1.0,2.663865


In [18]:
merged_df.head()

Unnamed: 0,user_id,item_id,rating,movie_id,title,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,242,Kolya (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
merged_df.shape

(100000, 24)

In [20]:
seen_movies = merged_df[merged_df['user_id']==1]['item_id']

In [23]:
unseen_df = merged_df[~merged_df['item_id'].isin(seen_movies)]

In [24]:
unseen_df

Unnamed: 0,user_id,item_id,rating,movie_id,title,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
117,186,302,3,302,L.A. Confidential (1997),0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
118,191,302,4,302,L.A. Confidential (1997),0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
119,49,302,4,302,L.A. Confidential (1997),0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
120,54,302,4,302,L.A. Confidential (1997),0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
121,62,302,3,302,L.A. Confidential (1997),0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,840,1674,4,1674,Mamma Roma (1962),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,655,1640,3,1640,"Eighth Day, The (1996)",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,655,1637,3,1637,Girls Town (1996),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99998,655,1630,3,1630,"Silence of the Palace, The (Saimt el Qusur) (1...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
unseen_df = unseen_df.groupby("item_id").first().reset_index()

In [28]:
unseen_df

Unnamed: 0,item_id,user_id,rating,movie_id,title,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,273,213,5,273,Heat (1995),0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,274,194,2,274,Sabrina (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,275,145,2,275,Sense and Sensibility (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,276,178,3,276,Leaving Las Vegas (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,277,63,4,277,Restoration (1995),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1405,1678,863,1,1678,Mat' i syn (1997),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1406,1679,863,3,1679,B. Monkey (1998),0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1407,1680,863,2,1680,Sliding Doors (1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1408,1681,896,3,1681,You So Crazy (1994),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
unseen_genres = unseen_df.columns[5:]
unseen_X = unseen_df[unseen_genres]
print(unseen_X.shape)

(1410, 19)


In [30]:
ratings = model.predict(unseen_X)

In [31]:
ratings

array([3.5346075 , 3.96189018, 3.57105152, ..., 3.57105152, 3.66386519,
       4.2006788 ])

In [32]:
ratings.argmax()

304

In [33]:
ratings[304]

4.940469107551488

In [34]:
unseen_df.iloc[304,:]

item_id                     577
user_id                     299
rating                        3
movie_id                    577
title          Coneheads (1993)
unknown                       0
Action                        0
Adventure                     0
Animation                     0
Children's                    0
Comedy                        1
Crime                         0
Documentary                   0
Drama                         0
Fantasy                       0
Film-Noir                     0
Horror                        0
Musical                       0
Mystery                       0
Romance                       0
Sci-Fi                        1
Thriller                      0
War                           0
Western                       0
Name: 304, dtype: object