**IMPORT** LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.preprocessing import MultiLabelBinarizer

**READ** IN CSV FILES

In [2]:
# Load the four CSV files from the current working directory, in this order:
# links, movies, ratings, tags.
links_df, movies_df, ratings_df, tags_df = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv'),
)

GENERAL **INFO** ON EVERY DATAFRAME

In [3]:
# Print a heading followed by the .info() summary (columns, non-null counts,
# dtypes, memory usage) for each loaded DataFrame.
for heading, frame in (
    ('\n LINKS DataFrame Info:', links_df),
    ('\n MOVIES DataFrame Info:', movies_df),
    ('\n RATINGS Dataframe Info:', ratings_df),
    ('\n TAGS Dataframe Info:', tags_df),
):
    print(heading)
    frame.info()


 LINKS DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB

 MOVIES DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB

 RATINGS Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 

LOOK AT **RANDOM SAMPLES** OF DATAFRAMES

In [4]:
def df_randomsamples(sample_size = 5, random_state = None, **dfs):
    '''
    Print the name of each given DataFrame and display a random sample of its rows.

    The samples are shown one after another, in the order the keyword
    arguments were passed.

    Parameters:
        sample_size (int): Number of rows to sample from each DataFrame. Default = 5.
            A DataFrame with fewer rows than this is shown in full (in shuffled order)
            instead of raising an error.
        random_state (int): Seed for random sampling. Default = None.
        **dfs (pd.DataFrame): Keyword arguments where the key is the DataFrame name and the value is the DataFrame itself.
    Returns:
        None: The samples are only printed/displayed; nothing is returned.
    '''
    for df_name, df in dfs.items():
        # Sampling without replacement cannot draw more rows than the frame
        # holds, so cap the request at the frame's length.
        rows_to_draw = min(sample_size, len(df))
        sampled_df = df.sample(n=rows_to_draw, random_state=random_state)

        print(f"DataFrame: {df_name}")
        display(sampled_df)


In [5]:
# Show five seeded (reproducible) random rows from each of the four DataFrames.
df_randomsamples(
    sample_size=5,
    random_state=42,
    tags_df=tags_df,
    movies_df=movies_df,
    ratings_df=ratings_df,
    links_df=links_df,
)

DataFrame: tags_df


Unnamed: 0,userId,movieId,tag,timestamp
2023,474,5644,baseball,1138039381
2587,477,4226,twist ending,1256051079
3222,567,112852,unlikely hero,1525285378
1263,474,1028,Disney,1137375815
781,424,1258,atmospheric,1457843344


DataFrame: movies_df


Unnamed: 0,movieId,title,genres
6213,45635,"Notorious Bettie Page, The (2005)",Drama
1056,1373,Star Trek V: The Final Frontier (1989),Action|Sci-Fi
4891,7325,Starsky & Hutch (2004),Action|Comedy|Crime|Thriller
346,389,"Colonel Chabert, Le (1994)",Drama|Romance|War
5353,8920,"Country Girl, The (1954)",Drama


DataFrame: ratings_df


Unnamed: 0,userId,movieId,rating,timestamp
67037,432,77866,4.5,1335139641
42175,288,474,3.0,978465565
93850,599,4351,3.0,1498524542
6187,42,2987,4.0,996262677
12229,75,1610,4.0,1158989841


DataFrame: links_df


Unnamed: 0,movieId,imdbId,tmdbId
6213,45635,404802,15402.0
1056,1373,98382,172.0
4891,7325,335438,9384.0
346,389,109454,41580.0
5353,8920,46874,2438.0


In [6]:
# Peek at the first rows to see how the raw 'genres' strings are formatted.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### We see that the 'genres' column in the 'movies_df' DataFrame can hold several genres for a single movie, separated by the '|' character.

So we will **split the genres** accordingly. 

In [7]:
def split_genres(genre_string):
    '''
    Break a '|'-delimited genre string into its individual genres.

    Parameters:
        genre_string (str): Genres joined by '|', e.g. 'Comedy|Romance'.
    Returns:
        list: The genres in their original order. A string without '|' gives a one-element list.
    '''
    separator = '|'
    individual_genres = genre_string.split(separator)
    return individual_genres

In [8]:
# Create a new column holding, for each movie, the list of its individual genres.
# The vectorized .str.split does the '|' split for the whole column at once,
# instead of calling a Python function once per row via .apply.
movies_df['individual_genres'] = movies_df['genres'].str.split('|')

In [9]:
# Confirm the new 'individual_genres' list column sits next to 'genres'.
movies_df.head()

Unnamed: 0,movieId,title,genres,individual_genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Comedy,[Comedy]


In [10]:
# One-hot encode the genre lists: one 0/1 indicator column per distinct genre label.
# (MultiLabelBinarizer is imported in the notebook's import cell.)
mlb = MultiLabelBinarizer()
binary_encoded_genres = mlb.fit_transform(movies_df['individual_genres'])

# Reuse movies_df's own index so the column-wise concat below pairs rows by
# label. A fresh default index would only line up while movies_df still has
# an untouched 0..n-1 index, and would otherwise silently produce NaN rows.
binary_encoded_df = pd.DataFrame(
    binary_encoded_genres,
    columns=mlb.classes_,
    index=movies_df.index,
)

# Append the indicator columns, then drop the genre columns they replace.
# The drop is chained (not in-place) so the cell is a single reassignment.
movies_df = pd.concat([movies_df, binary_encoded_df], axis=1).drop(
    columns=['genres', 'individual_genres']
)



In [11]:
# Check the result: one indicator column per genre, with the original genre columns gone.
movies_df.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


**MERGE** MOVIE AND RATING DATAFRAMES

In [12]:
# Attach each rating to its movie's title and genre indicator columns.
# how='inner' keeps only movieIds present in both frames.
# validate='one_to_many' makes pandas raise a MergeError if a movieId appears
# more than once in movies_df, which would otherwise silently duplicate ratings.
merged_df = movies_df.merge(
    ratings_df,
    on='movieId',
    how='inner',
    validate='one_to_many',
)

In [13]:
# Show the merged frame; the notebook renders a truncated view of its rows and columns.
merged_df

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,rating,timestamp
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,1,4.0,964982703
1,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,5,4.0,847434962
2,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,7,4.5,1106635946
3,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,15,2.5,1510577970
4,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,17,4.5,1305696483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,184,3.5,1537109545
100833,193585,Flint (2017),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,184,3.5,1537110021
