In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Loading all 4 datasets

In [2]:
# Read the four CSV tables from the working directory, in this order, and
# bind each one to its own DataFrame name.
movies, ratings, links, tags = map(
    pd.read_csv,
    ('movies.csv', 'ratings.csv', 'links.csv', 'tags.csv'),
)


In [3]:
# Plain-text dump of the movies table; pandas abbreviates long frames,
# which is why the output shows a `...` row between the first and last rows.
print(movies)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
87580   292731           The Monroy Affaire (2022)   
87581   292737          Shelter in Solitude (2023)   
87582   292753                         Orca (2023)   
87583   292755              The Angry Breed (1968)   
87584   292757           Race to the Summit (2023)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

### Checking for NULL values

In [4]:
# Per-column dtype and non-null count for `movies`; a non-null count equal
# to the number of entries means the column has no missing values.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  int64 
 1   title    87585 non-null  object
 2   genres   87585 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


In [5]:
# Per-column dtypes and memory footprint for `ratings`. The recorded output
# for this frame lists dtypes only (no non-null counts), so missing values
# are checked separately with isnull().
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 976.6 MB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each
# numeric column of `ratings`.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,32000200.0,32000200.0,32000200.0,32000200.0
mean,100278.5,29318.61,3.540396,1275241000.0
std,57949.05,50958.16,1.058986,256163000.0
min,1.0,1.0,0.5,789652000.0
25%,50053.0,1233.0,3.0,1051012000.0
50%,100297.0,3452.0,3.5,1272622000.0
75%,150451.0,44199.0,4.0,1503158000.0
max,200948.0,292757.0,5.0,1697164000.0


In [7]:
# Per-column dtype and non-null count for `links`; in the recorded output
# `tmdbId` has fewer non-null entries than rows, i.e. some values are missing.
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  87585 non-null  int64  
 1   imdbId   87585 non-null  int64  
 2   tmdbId   87461 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.0 MB


Checking for missing values in the ratings table

In [8]:
# Number of missing values in each column of `ratings`.
print(ratings.isnull().sum())

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [9]:
# Mean rating per movie: one row per `movieId`, with the mean stored in a
# column named `avg_rating`.
avg_ratings = (
    ratings.groupby('movieId')['rating']
    .mean()
    .rename('avg_rating')
    .reset_index()
)

In [10]:
# Display the per-movie average ratings computed above.
avg_ratings

Unnamed: 0,movieId,avg_rating
0,1,3.897438
1,2,3.275758
2,3,3.139447
3,4,2.845331
4,5,3.059602
...,...,...
84427,292731,4.000000
84428,292737,1.500000
84429,292753,4.000000
84430,292755,1.000000


## Adding the respective IMDb URLs to the movies in the dataset

In [11]:
# Interpret the integer `timestamp` column as seconds since the Unix epoch
# and replace it, in place, with pandas datetimes.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'],unit = 's')

In [12]:
# IMDb title ids are rendered as text, left-padded with zeros to at least
# seven digits, and wrapped into a title-page URL stored on `links`.
padded_imdb_ids = links['imdbId'].astype(str).str.zfill(7)
links['imdb_url'] = 'https://www.imdb.com/title/tt' + padded_imdb_ids + '/'

# Attach the URL to every movie; the left join keeps all rows of `movies`.
movies = pd.merge(movies, links[['movieId', 'imdb_url']], how='left', on='movieId')


In [13]:
from scipy.sparse import coo_matrix

# Build a user x movie sparse matrix straight from the ratings table.
# Each id column is converted to a categorical so that its integer codes
# can serve as 0-based row (user) and column (movie) positions.
rows, cols = (
    ratings[id_column].astype('category').cat.codes
    for id_column in ('userId', 'movieId')
)
data = ratings['rating']

# COO format takes (values, (row_positions, column_positions)).
sparse_matrix = coo_matrix((data, (rows, cols)))


In [14]:
# Shrink the memory footprint of `ratings` by narrowing the id columns to
# 32-bit integers and the rating values to 32-bit floats; any other column
# is left as it is.
ratings = ratings.astype({
    'userId': 'int32',
    'movieId': 'int32',
    'rating': 'float32',
})


In [15]:
# Keep only ratings whose movie AND user both have enough ratings overall.
min_ratings = 10

# Count ratings once per column. Each value_counts() call scans the whole
# ratings table, so the result is reused for both the mask and the lookup
# instead of being recomputed.
movie_counts = ratings['movieId'].value_counts()
user_counts = ratings['userId'].value_counts()

# NOTE: the comparison is strict, so an id needs MORE than `min_ratings`
# ratings (i.e. at least min_ratings + 1) to be kept.
valid_movies = movie_counts[movie_counts > min_ratings].index
valid_users = user_counts[user_counts > min_ratings].index

# A rating row survives only when both of its ids passed the threshold.
filtered_ratings = ratings[ratings['movieId'].isin(valid_movies) & ratings['userId'].isin(valid_users)]


In [16]:
# List the columns currently present on `links`.
print(links.columns)


Index(['movieId', 'imdbId', 'tmdbId', 'imdb_url'], dtype='object')


In [17]:
# Remove `imdbId` from `movies` when it is present; with errors='ignore' a
# missing column is skipped instead of raising.
movies = movies.drop(columns=['imdbId'], errors='ignore')


In [18]:
# Bring each movie's `imdbId` over from `links`; the left join keeps every
# row of `movies`.
movies = pd.merge(movies, links[['movieId', 'imdbId']], how='left', on='movieId')


In [19]:
# Make sure `movies` carries exactly one `imdbId` column.
# When the column is already there, joining `links` again would only add a
# duplicate (`imdbId_links`) that is dropped straight away, so the join is
# performed only when the column is missing. This also makes the cell safe
# to re-run.
if 'imdbId' not in movies.columns:
    movies = movies.merge(links[['movieId', 'imdbId']], on='movieId', how='left')


In [20]:
import pandas as pd

# Start again from the raw ratings file and rebuild the per-movie mean.
# NOTE(review): rebinding `ratings` here replaces the frame whose timestamp
# and dtypes were converted in earlier cells.
ratings = pd.read_csv('ratings.csv')

# One row per `movieId`, with the mean rating in a column named `avg_rating`.
avg_ratings = (
    ratings.groupby('movieId')['rating']
    .mean()
    .rename('avg_rating')
    .reset_index()
)


In [21]:
# Attach the per-movie average rating; the left join keeps every row of
# `movies`, including movies that do not appear in `avg_ratings`.
movies = pd.merge(movies, avg_ratings, how='left', on='movieId')


In [22]:
# Re-read links.csv, replacing the in-memory `links` frame (and with it the
# `imdb_url` column that was added to that frame earlier).
links = pd.read_csv('links.csv')

In [23]:
# Display the freshly loaded links table.
links

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
87580,292731,26812510,1032473.0
87581,292737,14907358,986674.0
87582,292753,12388280,948139.0
87583,292755,64027,182776.0


In [24]:
# Re-read movies.csv from scratch.
# NOTE(review): this discards the `imdb_url`, `imdbId` and `avg_rating`
# columns merged into `movies` by earlier cells.
movies = pd.read_csv('movies.csv')

In [25]:
# Display the freshly loaded movies table (movieId, title, genres).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama
87581,292737,Shelter in Solitude (2023),Comedy|Drama
87582,292753,Orca (2023),Drama
87583,292755,The Angry Breed (1968),Drama


In [26]:
# Add each movie's `imdbId` from `links`; the left join keeps every row of
# `movies`.
movies = pd.merge(movies, links[['movieId', 'imdbId']], how='left', on='movieId')


In [27]:
# Display `movies` with the newly merged `imdbId` column.
movies

Unnamed: 0,movieId,title,genres,imdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497
2,3,Grumpier Old Men (1995),Comedy|Romance,113228
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885
4,5,Father of the Bride Part II (1995),Comedy,113041
...,...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama,26812510
87581,292737,Shelter in Solitude (2023),Comedy|Drama,14907358
87582,292753,Orca (2023),Drama,12388280
87583,292755,The Angry Breed (1968),Drama,64027


In [28]:
# Render each IMDb id as text, left-padded with zeros to at least seven
# digits, and wrap it into the title-page URL stored on `movies`.
padded_movie_imdb_ids = movies['imdbId'].astype(str).str.zfill(7)
movies['imdb_url'] = 'https://www.imdb.com/title/tt' + padded_movie_imdb_ids + '/'

# Spot-check the first few generated URLs next to their titles.
print(movies.loc[:, ['movieId', 'title', 'imdb_url']].head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                imdb_url  
0  https://www.imdb.com/title/tt0114709/  
1  https://www.imdb.com/title/tt0113497/  
2  https://www.imdb.com/title/tt0113228/  
3  https://www.imdb.com/title/tt0114885/  
4  https://www.imdb.com/title/tt0113041/  


In [29]:
# Load the finalized movies dataset from disk.
# NOTE(review): no visible cell writes 'movies_finalized_dataset.csv';
# presumably it was exported elsewhere or by a removed cell. Confirm.
fd = pd.read_csv('movies_finalized_dataset.csv')

In [30]:
# Display the finalized dataset (movie metadata, IMDb id/URL, average rating).
fd

Unnamed: 0,movieId,title,genres,imdbId,imdb_url,avg_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,https://www.imdb.com/title/tt0114709/,3.897438
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,https://www.imdb.com/title/tt0113497/,3.275758
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,https://www.imdb.com/title/tt0113228/,3.139447
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,https://www.imdb.com/title/tt0114885/,2.845331
4,5,Father of the Bride Part II (1995),Comedy,113041,https://www.imdb.com/title/tt0113041/,3.059602
...,...,...,...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama,26812510,https://www.imdb.com/title/tt26812510/,4.000000
87581,292737,Shelter in Solitude (2023),Comedy|Drama,14907358,https://www.imdb.com/title/tt14907358/,1.500000
87582,292753,Orca (2023),Drama,12388280,https://www.imdb.com/title/tt12388280/,4.000000
87583,292755,The Angry Breed (1968),Drama,64027,https://www.imdb.com/title/tt0064027/,1.000000


### Visualizing the number of movies per genre as a bar chart

In [35]:
# Bar chart of movie counts for the leading genres, written to a PNG file.
# NOTE(review): `genre_counts` is defined outside this cell; this code
# assumes it is a Series of counts indexed by genre, already ordered so that
# head() returns the largest entries. Confirm.
top_n = 30  # cap on the number of bars, to avoid clutter
genre_counts_top = genre_counts.head(top_n)

x = np.arange(len(genre_counts_top))
fig, ax = plt.subplots(figsize=(12, 6))

bars = ax.bar(x, genre_counts_top.values, edgecolor='white', linewidth=0.7)

# Annotate each bar with its count.
ax.bar_label(bars, padding=3)

ax.set_xticks(x)
ax.set_xticklabels(genre_counts_top.index, rotation=45, ha='right')
ax.set_ylabel('Number of Movies')
# Report how many genres are actually drawn: head() returns fewer than
# `top_n` rows when the input is shorter, and the title should not overstate it.
ax.set_title(f'Number of Movies by Top {len(genre_counts_top)} Genres')

fig.tight_layout()

# Save to file instead of showing interactively. Working on `fig` explicitly
# avoids depending on whichever figure pyplot considers "current".
fig.savefig('movies_by_genre.png')
plt.close(fig)