# Movie Recommendation system 

Dataset used: [ml-latest-small](https://grouplens.org/datasets/movielens/latest/)

## Data Exploration

In [2]:
# imports
import pandas as pd

### movies.csv

In [13]:
df = pd.read_csv("data/ml-latest-small/movies.csv")

df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
df_no_genre = df[df["genres"] == "(no genres listed)"]

len(df_no_genre)

34

In [5]:
df.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [6]:
df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

### Ratings.csv

In [7]:
df = pd.read_csv("data/ml-latest-small/ratings.csv")

df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [9]:
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### tags.csv

In [10]:
df = pd.read_csv("data/ml-latest-small/tags.csv")

df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [11]:
df.describe()

Unnamed: 0,userId,movieId,timestamp
count,3683.0,3683.0,3683.0
mean,431.149335,27252.013576,1320032000.0
std,158.472553,43490.558803,172102500.0
min,2.0,1.0,1137179000.0
25%,424.0,1262.5,1137521000.0
50%,474.0,4454.0,1269833000.0
75%,477.0,39263.0,1498457000.0
max,610.0,193565.0,1537099000.0


### links.csv

In [18]:
df = pd.read_csv("data/ml-latest-small/links.csv")

df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


## Preprocessing (non-model specific)

In [1]:
# imports
import pandas as pd

### movies.csv

In [2]:
df = pd.read_csv("data/ml-latest-small/movies.csv")

df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Creating the "year" feature

The name of each movie includes it's publishing year, we may extract this (as an integer) to its seperate column

we then further bucket the years into 1 hot features (according to era)

In [3]:
import re

df['year'] = df['title'].str.extract(r'\((\d{4})\)', expand=False)
df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')

df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [4]:
df_decade = (df['year'] // 10) * 10

df_decade.value_counts()

year
2000    2849
1990    2212
2010    1931
1980    1177
1970     500
1960     401
1950     279
1940     197
1930     136
1920      37
1910       7
1900       3
Name: count, dtype: Int64

We will bucket the screening years in the following categories:
- \>1980 
- 1980 - 1989
- Every 5 years from 1990 to 2014 (1990 - 1994, 1995 - 1999, etc)
- \<2015

In [5]:
def get_year_group(year):
    if year >= 1990:
        # Group by 5 years until 2015
        if year >= 2015:
            return '2015+'
        base_year = (year // 5) * 5
        return f'{base_year}-{base_year+4}'
    elif year >= 1980:
        return '1980s'
    else:
        return 'pre-1980'

In [6]:
# Create year group features using one-hot encoding
df['year_group'] = df['year'].apply(get_year_group)
year_group_dummies = pd.get_dummies(df['year_group'], prefix='year').astype(int)
df = pd.concat([df, year_group_dummies], axis=1)

df.head()

Unnamed: 0,movieId,title,genres,year,year_group,year_1980s,year_1990.0-1994.0,year_1995.0-1999.0,year_2000.0-2004.0,year_2005.0-2009.0,year_2010.0-2014.0,year_2015+,year_pre-1980
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,1995.0-1999.0,0,0,1,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,1995.0-1999.0,0,0,1,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,1995.0-1999.0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,1995.0-1999.0,0,0,1,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,1995,1995.0-1999.0,0,0,1,0,0,0,0,0


#### Convert genres into "one hot" features

Each movie entry can have multiple genres, which are all contained in a single string, seperated by the "|" symbol

We will now create "one-hot" features by each genre.

For movies with the genre "(no genres listed)", it would have value 0 for all genre features.

In [7]:
genres = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

def genre_to_onehot(row):
    movie_genres = row['genres'].split('|')
    for genre in movie_genres:
        if genre in genres:
            row[genre] = 1
    return row


In [8]:
for genre in genres:
    df[genre] = 0
    
df = df.apply(genre_to_onehot, axis=1)

df.head()

Unnamed: 0,movieId,title,genres,year,year_group,year_1980s,year_1990.0-1994.0,year_1995.0-1999.0,year_2000.0-2004.0,year_2005.0-2009.0,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,1995.0-1999.0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,1995.0-1999.0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,1995.0-1999.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,1995.0-1999.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,1995,1995.0-1999.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Dropping redundant columns:
redundant_columns = ["genres", "year", "year_group"]
df = df.drop(redundant_columns, axis = 1)

df.head()

Unnamed: 0,movieId,title,year_1980s,year_1990.0-1994.0,year_1995.0-1999.0,year_2000.0-2004.0,year_2005.0-2009.0,year_2010.0-2014.0,year_2015+,year_pre-1980,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Storing the modified csv

In [10]:
df.to_csv("data/ml-latest-small-preprocessed/movies.csv", index = False)

### ratings.csv

Given the information "Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars)." We will scale the ratings to [0,1] using this information.

We will also determine relevance according to the relative rankings for each user. We can choose an arbiraty threshold (0.7 in this case), and then mark all movies ranked above this threshold (and tied with those within the threshold) as relevant.

Note that this does not account for the user's rating bias. This will be addressed in the model-training/ inference stage.

In [17]:
df = pd.read_csv("data/ml-latest-small/ratings.csv")

df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


#### Scaling to range


In [18]:
df["rating"] = df["rating"].apply(lambda x: (x-0.5)/4.5)

df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,0.777778,964982703
1,1,3,0.777778,964981247
2,1,6,0.777778,964982224
3,1,47,1.0,964983815
4,1,50,1.0,964982931


#### Generating relevance tags

In [19]:
df["percentile"] = df["rating"].rank(pct = True, method = "min")

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,percentile
0,1,1,0.777778,964982703,0.518238
1,1,3,0.777778,964981247,0.518238
2,1,6,0.777778,964982224,0.518238
3,1,47,1.0,964983815,0.868995
4,1,50,1.0,964982931,0.868995


In [20]:
threshold = 0.7

df["relevant"] = df["percentile"] > threshold

df["relevant"] = df["relevant"].astype(int)

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,percentile,relevant
0,1,1,0.777778,964982703,0.518238,0
1,1,3,0.777778,964981247,0.518238,0
2,1,6,0.777778,964982224,0.518238,0
3,1,47,1.0,964983815,0.868995,1
4,1,50,1.0,964982931,0.868995,1


#### Storing the modified csv

In [21]:
df.to_csv("data/ml-latest-small-preprocessed/ratings.csv", index = False)

### tags.csv

All the tags are non-standardized, it would be hard to group them or work with them directly.

We will instead aim to capture the sementic meaning of each tag, and do so by embedding them using an embedding model

The model we will be using is [dunzhang/stella_en_400M_v5](https://huggingface.co/dunzhang/stella_en_400M_v5) (via `sentence_transformers`)

In [12]:
df = pd.read_csv("data/ml-latest-small/tags.csv")

df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [4]:
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

load_dotenv()

# Loading the model
model = SentenceTransformer("dunzhang/stella_en_400M_v5", trust_remote_code=True)

  from tqdm.autonotebook import tqdm, trange
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

In [9]:
def encode(x):
    return model.encode(x, normalize_embeddings=True)

In [10]:
# Applying the codes
df['tag_embedding'] = df['tag'].apply(encode)

df.head()

Unnamed: 0,userId,movieId,tag,timestamp,tag_embedding
0,2,60756,funny,1445714994,"[0.029487895, 0.034716193, -0.07196176, -0.003..."
1,2,60756,Highly quotable,1445714996,"[0.025856726, -0.0034611484, -0.08370604, -0.0..."
2,2,60756,will ferrell,1445714992,"[0.02093276, 0.016063133, -0.036068276, 0.0077..."
3,2,89774,Boxing story,1445715207,"[0.03893604, -0.015445891, -0.11716878, -0.006..."
4,2,89774,MMA,1445715200,"[0.03879173, 0.0026901236, -0.0891117, 0.01044..."


In [11]:
# Saving the csv
df.to_csv("data/ml-latest-small-preprocessed/tags.csv",index = False)

### links.csv

We do not want to use the links.csv

In [13]:
df = pd.read_csv("data/ml-latest-small/links.csv")

df.to_csv("data/ml-latest-small-preprocessed/links.csv", index = False)