In [41]:
import ast
import statistics
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [42]:
# Directory holding the raw CSV files, relative to this notebook.
path = Path("../datasets/the-movie-datasets")

# Collect every entry in the directory (order is whatever glob yields) and
# print an indexed listing, one entry per line.
files = [entry for entry in path.glob('*')]
print(
    "Files found:",
    *(f"{position}. {entry.name}" for position, entry in enumerate(files)),
    sep="\n",
)

Files found:
0. links_small.csv
1. links.csv
2. credits.csv
3. movies_metadata.csv
4. ratings.csv
5. ratings_small.csv
6. keywords.csv


In [43]:
# Load the metadata file by name instead of by its position in `files`:
# glob order is filesystem-dependent, so `files[3]` is not guaranteed to be
# movies_metadata.csv on another machine.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the stray read_csv echo in this cell's output looks like a
# mixed-type DtypeWarning from chunked inference - confirm it is gone.
df_meta_data = pd.read_csv(path / "movies_metadata.csv", low_memory=False)
print(df_meta_data.columns)

# Keep only the columns used by the rest of the notebook. Explicit selection
# raises KeyError if a column is missing (filter() would silently skip it), and
# .copy() gives an independent frame for the column assignments that follow.
META_COLUMNS = ['id', 'original_title', 'genres', 'budget', 'revenue', 'vote_average', 'vote_count']
df_meta = df_meta_data[META_COLUMNS].copy()

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


  df_meta_data = pd.read_csv(files[3])


In [44]:
# Show column dtypes and non-null counts for the selected metadata columns.
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              45466 non-null  object 
 1   original_title  45466 non-null  object 
 2   genres          45466 non-null  object 
 3   budget          45466 non-null  object 
 4   revenue         45460 non-null  float64
 5   vote_average    45460 non-null  float64
 6   vote_count      45460 non-null  float64
dtypes: float64(3), object(4)
memory usage: 2.4+ MB


In [45]:
def _genre_names(raw):
    """Turn one raw `genres` cell into a list of genre names.

    Parameters
    ----------
    raw : object
        The cell value. Strings are parsed as Python literals (a list of
        dicts, each with a 'name' key); anything else is returned unchanged so
        that re-running this cell on already-parsed data is a no-op.

    Returns
    -------
    list of str, pd.NA, or the unchanged input
        The 'name' of every entry, or pd.NA when the parsed list is empty.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    if not isinstance(raw, str):
        return raw
    # ast.literal_eval only accepts literals, so text coming from the CSV can
    # never execute code the way the built-in eval() could.
    parsed = ast.literal_eval(raw)
    return [entry['name'] for entry in parsed] if parsed else pd.NA


# Bracket assignment is the explicit way to overwrite a column.
df_meta['genres'] = df_meta['genres'].apply(_genre_names)
df_meta['genres'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 45466 entries, 0 to 45465
Series name: genres
Non-Null Count  Dtype 
--------------  ----- 
43024 non-null  object
dtypes: object(1)
memory usage: 355.3+ KB


In [46]:
# Display the parsed genres column (lists of names, <NA> where there were none).
df_meta["genres"]

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
45461                 [Drama, Family]
45462                         [Drama]
45463       [Action, Drama, Thriller]
45464                            <NA>
45465                            <NA>
Name: genres, Length: 45466, dtype: object

In [47]:
# Discard every row whose genres value is missing, then confirm the
# remaining non-null count.
df_meta = df_meta.dropna(axis="index", subset=["genres"])
df_meta["genres"].info()

<class 'pandas.core.series.Series'>
Index: 43024 entries, 0 to 45463
Series name: genres
Non-Null Count  Dtype 
--------------  ----- 
43024 non-null  object
dtypes: object(1)
memory usage: 672.2+ KB


In [48]:
# Distinct values of the FIRST genre listed for each movie, in order of first
# appearance, shown together with how many there are.
# Note: only position 0 of each list is inspected.
first_listed = pd.Series([names[0] for names in df_meta["genres"]], dtype=object)
genres = first_listed.unique()
(genres, genres.shape)

(array(['Animation', 'Adventure', 'Romance', 'Comedy', 'Action', 'Family',
        'History', 'Drama', 'Crime', 'Fantasy', 'Science Fiction',
        'Thriller', 'Music', 'Horror', 'Documentary', 'Mystery', 'Western',
        'TV Movie', 'War', 'Foreign', 'Carousel Productions', 'Aniplex',
        'Odyssey Media'], dtype=object),
 (23,))

In [49]:
# Re-check dtypes and non-null counts after rows without genres were dropped.
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43024 entries, 0 to 45463
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              43024 non-null  object 
 1   original_title  43024 non-null  object 
 2   genres          43024 non-null  object 
 3   budget          43024 non-null  object 
 4   revenue         43018 non-null  float64
 5   vote_average    43018 non-null  float64
 6   vote_count      43018 non-null  float64
dtypes: float64(3), object(4)
memory usage: 2.6+ MB


In [50]:
# Rows where revenue is missing.
df_meta.loc[df_meta["revenue"].isna()]




Unnamed: 0,id,original_title,genres,budget,revenue,vote_average,vote_count
19729,82663,Midnight Man,"[Action, Thriller, Drama]",0,,,
19730,1997-08-20,"[{'iso_639_1': 'en', 'name': 'English'}]","[Carousel Productions, Vision View Entertainme...",/ff9qCepilowshEtG2GYWwzt2bs4.jpg,,,
29502,122662,マルドゥック・スクランブル 排気,"[Animation, Science Fiction]",0,,,
29503,2012-09-29,"[{'iso_639_1': 'ja', 'name': '日本語'}]","[Aniplex, GoHands, BROSTA TV, Mardock Scramble...",/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,,,
35586,249260,Avalanche Sharks,"[TV Movie, Action, Horror, Science Fiction]",0,,,
35587,2014-01-01,"[{'iso_639_1': 'en', 'name': 'English'}]","[Odyssey Media, Pulser Productions, Rogue Stat...",/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,,,


In [51]:
# Remove rows where any of the revenue / vote columns is missing.
required_numeric_cols = ['revenue', 'vote_average', 'vote_count']
df_meta = df_meta.dropna(axis="index", how="any", subset=required_numeric_cols)

In [52]:
# Convert budget from object dtype to float64, then re-inspect the frame.
budget_as_float = df_meta['budget'].astype('float64')
df_meta['budget'] = budget_as_float
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43018 entries, 0 to 45463
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              43018 non-null  object 
 1   original_title  43018 non-null  object 
 2   genres          43018 non-null  object 
 3   budget          43018 non-null  float64
 4   revenue         43018 non-null  float64
 5   vote_average    43018 non-null  float64
 6   vote_count      43018 non-null  float64
dtypes: float64(4), object(3)
memory usage: 2.6+ MB


In [53]:
# Drop every row in which budget, revenue, vote_average or vote_count is
# exactly 0.
zero_checked_cols = ['budget', 'revenue', 'vote_average', 'vote_count']
has_zero = df_meta[zero_checked_cols].eq(0).any(axis=1)
df_meta = df_meta.drop(index=df_meta.index[has_zero.to_numpy()])

In [54]:
# Use the `id` column as the row index.
# Reassigning (instead of inplace=True) keeps the step explicit, and the guard
# makes the cell safe to re-run: once `id` has moved into the index it is no
# longer a column, so a second set_index('id') would raise KeyError.
if 'id' in df_meta.columns:
    df_meta = df_meta.set_index('id')
df_meta

Unnamed: 0_level_0,original_title,genres,budget,revenue,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
862,Toy Story,"[Animation, Comedy, Family]",30000000.0,373554033.0,7.7,5415.0
8844,Jumanji,"[Adventure, Fantasy, Family]",65000000.0,262797249.0,6.9,2413.0
31357,Waiting to Exhale,"[Comedy, Drama, Romance]",16000000.0,81452156.0,6.1,34.0
949,Heat,"[Action, Crime, Drama, Thriller]",60000000.0,187436818.0,7.7,1886.0
9091,Sudden Death,"[Action, Adventure, Thriller]",35000000.0,64350171.0,5.5,174.0
...,...,...,...,...,...,...
24049,சிவாஜி,"[Action, Comedy, Drama]",12000000.0,19000000.0,6.9,25.0
280422,Все и сразу,"[Crime, Comedy]",750000.0,3.0,6.0,4.0
62757,Dikari,"[Comedy, Drama]",800000.0,1328612.0,5.8,6.0
63281,Про любоff,"[Romance, Drama]",2000000.0,1268793.0,4.0,3.0


In [55]:
# Final dtype / non-null summary after filtering and re-indexing by id.
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5364 entries, 862 to 63898
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   original_title  5364 non-null   object 
 1   genres          5364 non-null   object 
 2   budget          5364 non-null   float64
 3   revenue         5364 non-null   float64
 4   vote_average    5364 non-null   float64
 5   vote_count      5364 non-null   float64
dtypes: float64(4), object(2)
memory usage: 293.3+ KB


In [56]:
# Sanity check: rows whose genres list is empty (an empty result is expected).
df_meta.loc[df_meta["genres"].map(len).eq(0)]

Unnamed: 0_level_0,original_title,genres,budget,revenue,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [57]:
# Sanity check: rows still holding a 0 in any of the numeric columns
# (an empty result is expected after the earlier drop).
numeric_cols_to_verify = ['budget', 'revenue', 'vote_average', 'vote_count']
df_meta.loc[df_meta[numeric_cols_to_verify].eq(0).any(axis=1)]

Unnamed: 0_level_0,original_title,genres,budget,revenue,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [58]:
# Normalise titles in two steps:
#   1. delete every character that is not an ASCII letter or digit, whitespace,
#      or a code point in U+0080-U+FFFF (so non-English letters are kept);
#   2. lowercase what remains.
disallowed_chars = r'[^a-zA-Z0-9\s\u0080-\uFFFF]'
stripped_titles = df_meta['original_title'].str.replace(disallowed_chars, '', regex=True)
df_meta['original_title'] = stripped_titles.str.lower()

In [59]:
# Display the frame after title normalisation.
df_meta

Unnamed: 0_level_0,original_title,genres,budget,revenue,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
862,toy story,"[Animation, Comedy, Family]",30000000.0,373554033.0,7.7,5415.0
8844,jumanji,"[Adventure, Fantasy, Family]",65000000.0,262797249.0,6.9,2413.0
31357,waiting to exhale,"[Comedy, Drama, Romance]",16000000.0,81452156.0,6.1,34.0
949,heat,"[Action, Crime, Drama, Thriller]",60000000.0,187436818.0,7.7,1886.0
9091,sudden death,"[Action, Adventure, Thriller]",35000000.0,64350171.0,5.5,174.0
...,...,...,...,...,...,...
24049,சிவாஜி,"[Action, Comedy, Drama]",12000000.0,19000000.0,6.9,25.0
280422,все и сразу,"[Crime, Comedy]",750000.0,3.0,6.0,4.0
62757,dikari,"[Comedy, Drama]",800000.0,1328612.0,5.8,6.0
63281,про любоff,"[Romance, Drama]",2000000.0,1268793.0,4.0,3.0


In [60]:
# Write the cleaned frame to a CSV file in the current working directory.
output_csv = 'movie_meta.csv'
df_meta.to_csv(output_csv)
