# Ex1: Most Similar Item - Distance Based

In [1]:
import pandas as pd

# Remote CSV copy of the mtcars dataset.
data_url = 'https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv'

# header=0 consumes the file's own header row and names= supplies the
# replacement labels, so the frame is relabelled in the same step as the load.
cars = pd.read_csv(
    data_url,
    header=0,
    names=[
        'car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat',
        'wt', 'qsec', 'vs', 'am', 'gear', 'carb',
    ],
)
print(cars.shape)
cars.head()

(32, 12)


Unnamed: 0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## Step 1: define the features

In [8]:
from sklearn.preprocessing import StandardScaler

# Keep the fitted scaler around: any new query point must be transformed with
# the same statistics before distances are computed.
scaler = StandardScaler()

# Use four numeric columns as the feature space and standardize them in one
# step, so that no feature dominates the distance purely because of its units.
X = scaler.fit_transform(cars[['mpg', 'disp', 'hp', 'wt']].to_numpy())

## Step 2: define a distance metric

In [9]:
from sklearn.metrics.pairwise import euclidean_distances

## Step 3: Recommend items

- If we're looking at a certain item, let's recommend the most similar car!

In [17]:
# Query point: the car the user is currently looking at, given in the same
# feature order used to fit the scaler and scaled with that fitted scaler.
looking_at_car = scaler.transform([[15, 300, 160, 3.2]])

# euclidean_distances returns an (n_cars, 1) column; flatten it to (n_cars,).
distances = euclidean_distances(X, looking_at_car).ravel()

# Row positions ordered from nearest to farthest (this will help with the assignment!)
ordered_indices = distances.argsort()

# The three smallest distances are the three most similar cars.
closest_indices = ordered_indices[:3]

# Look those positions up in the original (unscaled) frame for display.
closest_cars = cars.iloc[closest_indices]
closest_cars

Unnamed: 0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
22,AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,0,0,3,2
21,Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,0,0,3,2
13,Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,0,0,3,3


# Ex2: Ratings Based - Supervised

In a system where a user has rated some items, can we recommend the items they haven't rated yet that we think they will rate highly?

In [66]:
# MovieLens 100K ratings file (tab-separated; column labels are supplied below).
url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"

# Labels for the file's four columns.
columns = ["user_id", "item_id", "rating", "timestamp"]

# Load straight from the URL and discard the timestamp, which is not used here.
df = pd.read_csv(url, sep="\t", names=columns).drop(columns="timestamp")
df.shape

(100000, 3)

In [67]:
# Preview the first few (user_id, item_id, rating) rows.
df.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [68]:
# MovieLens 100K item file: pipe-separated, Latin-1 encoded, one row per movie.
movies_url = "http://files.grouplens.org/datasets/movielens/ml-100k/u.item"

# Identifier/metadata columns first, followed by one 0/1 flag per genre.
movies_columns = [
    "movie_id", "title", "release_date", "video_release_date", "imdb_url",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Read the catalogue and discard the date/URL columns, leaving the id, the
# title and the genre flags.
movies_df = pd.read_csv(
    movies_url, sep="|", names=movies_columns, encoding="latin-1"
).drop(columns=["release_date", "video_release_date", "imdb_url"])
print(movies_df.shape)
movies_df.head()

(1682, 21)


Unnamed: 0,movie_id,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [69]:
# Attach each rating's movie title and genre flags by matching the rating's
# item_id to the catalogue's movie_id (default inner join).
merged_df = df.merge(movies_df, left_on="item_id", right_on="movie_id")
merged_df.shape

(100000, 24)

In [70]:
# Keep only the rows rated by user 1.
user1_df = merged_df.loc[merged_df["user_id"].eq(1)]
user1_df.shape

(272, 24)

In [71]:
# Genre indicator columns = every column except the identifiers, the rating
# and the title. Selecting by name instead of by position (columns[5:]) keeps
# this correct if the merged frame's column order ever changes, and
# Index.drop raises a KeyError if one of the expected columns is missing.
genres = user1_df.columns.drop(["user_id", "item_id", "rating", "movie_id", "title"])

# Features: the 0/1 genre flags. Target: the rating the user gave.
X = user1_df[genres]
y = user1_df["rating"]
print(X.shape)
print(y.shape)

(272, 19)
(272,)


In [72]:
from sklearn.model_selection import train_test_split

In [73]:
# Hold out 5% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [74]:
# No need to transform this data: the genres are already 0/1 indicator columns :)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Seed the forest so the fitted model, and therefore the score below, is
# reproducible across re-runs (bootstrap sampling is otherwise random).
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Root-mean-squared error: the square root of the MSE, expressed in the same
# units as the ratings.
score = np.sqrt(mean_squared_error(y_test, y_pred))
score

1.4501652387883914

In [75]:
# Actual ratings in the held-out test split, for comparison with the predictions.
y_test

10732    3
43023    4
30062    4
46315    3
69919    3
49276    4
74062    4
16744    5
57319    3
89873    1
66940    1
98261    1
82987    3
30993    5
Name: rating, dtype: int64

In [76]:
# Model predictions for the held-out test split (same row order as y_test).
y_pred

array([4.10618702, 3.89744827, 4.43009921, 3.72905031, 4.10618702,
       1.71616667, 4.43009921, 1.71616667, 4.01361127, 3.70394755,
       2.12366667, 1.21191489, 3.6540267 , 4.90552281])

In [64]:
# This used only genre, on a very limited dataset.
# It doesn't capture "quality" - there are good and bad movies in each genre.
# But genre might give us a start if someone really dislikes or really likes a certain genre.
# Combine this with other features or other approaches and we're on our way!

# RECOMMEND A MOVIE?