# Dependancies:

In [208]:
# # Installing the Levenshtein Library:
# pip install python-Levenshtein

In [209]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer
from Levenshtein import distance

# Parsing the Data

In [210]:
# users = pd.read_csv(
#     r'C:\Users\squink\Desktop\Movie Recommendation System Project\data\ml-1m\users.dat',
#     sep='::',  # Specifying the delimiter
#     engine='python',
#     names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'] # Defining column names
# )
# print(users.head())

In [211]:
# ratings = pd.read_csv(
#     r'C:\Users\squink\Desktop\Movie Recommendation System Project\data\ml-1m\ratings.dat',
#     sep='::',
#     engine='python',
#     names=['UserID', 'MovieID', 'Rating', 'Timestamp']
# )
# print(ratings.head())


In [212]:
# movies = pd.read_csv(
#     r'C:\Users\squink\Desktop\Movie Recommendation System Project\data\ml-1m\movies.dat',
#     sep='::',
#     engine='python',
#     names=['MovieID', 'Title', 'Genre'],
#     encoding= 'latin-1'
# )
# print(movies.head())

# Saving to CSV files:

In [213]:
# ratings.to_csv('ratings.csv', index= False);
# movies.to_csv('movies.csv', index= False);
# users.to_csv('users.csv', index= False);

# Converting Into Dataframes:

In [214]:
ratings_df = pd.read_csv(r'C:\Users\squink\Desktop\Movie-Recommendation-System-Project\csv_files\ratings.csv')
movies_df = pd.read_csv(r'C:\Users\squink\Desktop\Movie-Recommendation-System-Project\csv_files\movies.csv')
users_df = pd.read_csv(r'C:\Users\squink\Desktop\Movie-Recommendation-System-Project\csv_files\users.csv')

# Creating the copies we will be working with:
ratings = ratings_df.copy()
movies = movies_df.copy()
users = users_df.copy()

# Data Exploration:

In [215]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [216]:
ratings['Timestamp'].dtype

dtype('int64')

In [217]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [218]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genre    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


# Data Pre-Processing (ratings):

In [219]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


It is good practice, when writing code such as the one below to wrap your transformation in an if statement so that your code doesn't error when you rerun it.

In [220]:
# Converting 'Timestamp' to datetime object if not already converted:
if ratings['Timestamp'].dtype != 'datetime64[ns]':
    ratings['Timestamp'] = pd.to_datetime(ratings['Timestamp'], unit= 's', errors= 'coerce')
ratings['Timestamp']

0         2000-12-31 22:12:40
1         2000-12-31 22:35:09
2         2000-12-31 22:32:48
3         2000-12-31 22:04:35
4         2001-01-06 23:38:11
                  ...        
1000204   2000-04-26 02:35:41
1000205   2000-04-25 23:21:27
1000206   2000-04-25 23:19:06
1000207   2000-04-26 02:20:48
1000208   2000-04-26 02:19:29
Name: Timestamp, Length: 1000209, dtype: datetime64[ns]

Most ML algorithms will not be able to automatically process datetime objects therefore we must split it into its components s.t. each one is strictly numeric.

In [221]:
# Extracting the components from our datetime object:
ratings['Year'] = ratings['Timestamp'].dt.year
ratings['Month'] = ratings['Timestamp'].dt.month
ratings['Day'] = ratings['Timestamp'].dt.day
ratings['Hour'] = ratings['Timestamp'].dt.hour
ratings['Minute'] = ratings['Timestamp'].dt.minute
ratings['Second'] = ratings['Timestamp'].dt.second

# Dropping the datetime object column (no longer needed):
ratings = ratings.drop('Timestamp', axis = 1)

In [222]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Minute,Second
0,1,1193,5,2000,12,31,22,12,40
1,1,661,3,2000,12,31,22,35,9
2,1,914,3,2000,12,31,22,32,48
3,1,3408,4,2000,12,31,22,4,35
4,1,2355,5,2001,1,6,23,38,11


Next we will check the df for duplicates as this dataset states that there are duplicates in the data (specifically the movie set but it doesn't hurt to be safe). We will define a function to do this so we don't have to write the same code 3 seperate times - this makes our code cleaner and less repetitive.

In [223]:
def check_duplicates(df):
    ''' 
    Prints the duplicate count for the df being passed in
    '''
    duplicate_count = df.duplicated().sum()
    return duplicate_count
check_duplicates(ratings)

0

All non-null counts seem to match in the info output above but let's explicitly check for null values to make sure. Again, we'll define a function to make our lives easier in the future.

In [224]:
def check_nulls(df):
    ''' 
    Checks the df passed in for nulls
    '''
    null_count = df.isna().sum()
    return null_count
check_nulls(ratings)

UserID     0
MovieID    0
Rating     0
Year       0
Month      0
Day        0
Hour       0
Minute     0
Second     0
dtype: int64

Let's double check that all data types are resolved. Yup, looks good! 

In [225]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 9 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   UserID   1000209 non-null  int64
 1   MovieID  1000209 non-null  int64
 2   Rating   1000209 non-null  int64
 3   Year     1000209 non-null  int32
 4   Month    1000209 non-null  int32
 5   Day      1000209 non-null  int32
 6   Hour     1000209 non-null  int32
 7   Minute   1000209 non-null  int32
 8   Second   1000209 non-null  int32
dtypes: int32(6), int64(3)
memory usage: 45.8 MB


In [226]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Minute,Second
0,1,1193,5,2000,12,31,22,12,40
1,1,661,3,2000,12,31,22,35,9
2,1,914,3,2000,12,31,22,32,48
3,1,3408,4,2000,12,31,22,4,35
4,1,2355,5,2001,1,6,23,38,11


# Data Pre-Processing (users):

In [227]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [228]:
check_nulls(users)

UserID        0
Gender        0
Age           0
Occupation    0
Zip-code      0
dtype: int64

In [229]:
check_duplicates(users)

0

In [230]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [231]:
# Binarizing the gender column:
if 'Gender' in users.columns:
    users= (pd.get_dummies(users, columns=['Gender']))*1
    users

In [232]:
users.head()

Unnamed: 0,UserID,Age,Occupation,Zip-code,Gender_F,Gender_M
0,1,1,10,48067,1,0
1,2,56,16,70072,0,1
2,3,25,15,55117,0,1
3,4,45,7,2460,0,1
4,5,25,20,55455,0,1


This dataframe was pretty straightforward to clean. Let's move on to the last one.

In [233]:
# Cleaning the 'Zip-code' column and converting it to type int: 
users['Zip-code'] = users['Zip-code'].str.replace(r'-', '', regex= True)
users['Zip-code'].astype(int)

0       48067
1       70072
2       55117
3        2460
4       55455
        ...  
6035    32603
6036    76006
6037    14706
6038     1060
6039    11106
Name: Zip-code, Length: 6040, dtype: int32

In [234]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Age         6040 non-null   int64 
 2   Occupation  6040 non-null   int64 
 3   Zip-code    6040 non-null   object
 4   Gender_F    6040 non-null   int32 
 5   Gender_M    6040 non-null   int32 
dtypes: int32(2), int64(3), object(1)
memory usage: 236.1+ KB


# Data Pre-Processing (movies):

In [235]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genre    3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [236]:
check_nulls(movies)

MovieID    0
Title      0
Genre      0
dtype: int64

In [237]:
check_duplicates(movies)

0

In [238]:
# Converting movie title to str:
movies['Title'] = movies['Title'].astype(str)

In [239]:
# Extracting the release date from the movie title:
movies['ReleaseYear'] = movies['Title'].str.extract(r'\((\d{4})\)') # captures a group of four digits
movies.head()

Unnamed: 0,MovieID,Title,Genre,ReleaseYear
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


It looks like the release year was extracted successfuly so we'll go ahead and remove it using regex just to keep things tidy.

In [240]:
movies["Title"] = movies["Title"].str.replace(r'\s\(\d{4}\)', '', regex= True)
movies.head()

Unnamed: 0,MovieID,Title,Genre,ReleaseYear
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [241]:
# Splitting the 'Genre' column and one hot encoding genres using mlb:
mlb = MultiLabelBinarizer()
if 'Genre' in movies.columns:
    genre_encoded = pd.DataFrame(mlb.fit_transform(movies['Genre'].str.split('|')),
                                columns = mlb.classes_,
                                index= movies.index)
    # Appending encoded genres back to movie df and dropping original 'Genre' column:
    movies = pd.concat([movies.drop('Genre', axis = 1), genre_encoded], axis = 1)
movies.head()

Unnamed: 0,MovieID,Title,ReleaseYear,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Movie Lookup Table:

We are not going to needing the movie titles for our modeling since we already have 'movieID' and can use this to look up the movie title after, therefore, we will be creating a look up table.

In [242]:
# Creating movie look up table: 
movie_lookup = movies[['MovieID', 'Title']]
movie_lookup

# Dropping the title from movie table:
movies = movies.drop('Title', axis= 1)
movies.head()

Unnamed: 0,MovieID,ReleaseYear,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1995,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,1995,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,1995,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


There seems to be some typos in the movie titles. Here are some examples:
- American President, The
- Secret Adventures of Tom Thumb, The
- Air Up There, The

In [243]:
# movie_titles = movies['Title']
# movie_titles.to_csv('movie_titles.csv', index= False)

# Merging the Tables:

In [244]:
merge1 = pd.merge(movies, ratings, on= 'MovieID', how= 'inner')
final_df = pd.merge(merge1, users, on= 'UserID', how= 'inner')
final_df

Unnamed: 0,MovieID,ReleaseYear,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Month,Day,Hour,Minute,Second,Age,Occupation,Zip-code,Gender_F,Gender_M
0,1,1995,0,0,1,1,1,0,0,0,...,1,6,23,37,48,1,10,48067,1,0
1,1,1995,0,0,1,1,1,0,0,0,...,12,31,4,30,8,50,9,55117,1,0
2,1,1995,0,0,1,1,1,0,0,0,...,12,31,3,31,36,25,12,11413,0,1
3,1,1995,0,0,1,1,1,0,0,0,...,12,31,1,25,52,25,17,61614,0,1
4,1,1995,0,0,1,1,1,0,0,0,...,12,31,1,34,34,35,1,95370,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,3952,2000,0,0,0,0,0,0,0,1,...,6,9,7,34,59,25,7,92120,1,0
1000205,3952,2000,0,0,0,0,0,0,0,1,...,4,2,14,52,5,25,1,92120,0,1
1000206,3952,2000,0,0,0,0,0,0,0,1,...,1,24,20,4,16,25,7,60607,0,1
1000207,3952,2000,0,0,0,0,0,0,0,1,...,1,18,21,15,37,35,14,10003,0,1


Let's double check that everything in our df is of numeric format.

In [248]:
final_df.apply(lambda x: pd.to_numeric(x, errors='coerce').notnull().all())

MovieID        True
ReleaseYear    True
Action         True
Adventure      True
Animation      True
Children's     True
Comedy         True
Crime          True
Documentary    True
Drama          True
Fantasy        True
Film-Noir      True
Horror         True
Musical        True
Mystery        True
Romance        True
Sci-Fi         True
Thriller       True
War            True
Western        True
UserID         True
Rating         True
Year           True
Month          True
Day            True
Hour           True
Minute         True
Second         True
Age            True
Occupation     True
Zip-code       True
Gender_F       True
Gender_M       True
dtype: bool

# 1. Collaborative Filtering Approach:
Collaborative filtering is a popular technique for recommendation systems that relies on user-item interactions.

# 2. Content-Based Filtering Approach:

Content-based filtering relies on the features of the items (e.g., genres, title keywords).

# 3. Hybrid Recommendation Systems

Combines collaborative filtering and content-based filtering for better performance.