In [1]:
import pandas as pd

# Load the movie dataset from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer="monster_movies.csv")

# Summarise column names, dtypes and non-null counts to spot missing values early.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630 entries, 0 to 629
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           630 non-null    object 
 1   title_type       630 non-null    object 
 2   primary_title    630 non-null    object 
 3   original_title   630 non-null    object 
 4   year             630 non-null    int64  
 5   runtime_minutes  576 non-null    float64
 6   genres           624 non-null    object 
 7   simple_title     630 non-null    object 
 8   average_rating   630 non-null    float64
 9   num_votes        630 non-null    int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 49.3+ KB


In [2]:
# Keep only the columns that will be used; .copy() detaches `movies` from `data`
# so later changes to `movies` leave the loaded frame untouched.
needed_columns = ['tconst', 'year', 'genres', 'num_votes']
movies = data.loc[:, needed_columns].copy()

# Count the missing values in each selected column.
movies.isna().sum()

tconst       0
year         0
genres       6
num_votes    0
dtype: int64

In [3]:
# Discard the rows that have no genre, then confirm that none are left.
movies = movies.dropna(subset=['genres'])
movies['genres'].isna().sum()

0

In [4]:
def findDecade(year):
    """Return the decade that contains ``year`` as a ``"start-end"`` label.

    The decade starts at the largest multiple of ten that does not exceed
    ``year`` and ends nine years later, e.g. ``1995`` -> ``"1990-1999"``.

    Parameters
    ----------
    year : int
        Calendar year. Intended for integer input; other numeric types are
        formatted as-is into the label.

    Returns
    -------
    str
        The first and last year of the decade joined by a hyphen.
    """
    decade_start = 10 * (year // 10)
    decade_end = decade_start + 9
    return f"{decade_start}-{decade_end}"

# Make sure the year column holds integers before deriving decades from it.
movies['year'] = pd.to_numeric(movies['year']).astype(int)

# Label every movie with its decade, then list the distinct labels found.
decade_labels = movies['year'].map(findDecade)
movies['decade'] = decade_labels
decade_labels.unique()

array(['1920-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969',
       '1970-1979', '1980-1989', '2000-2009', '1990-1999', '1910-1919',
       '2010-2019', '2020-2029'], dtype=object)

In [5]:
# Count the movies in each decade to assess the distribution and avoid
# letting sparsely populated decades skew the comparison.
decade_sizes = movies.groupby(['decade'])['tconst'].count()
print(decade_sizes.reset_index(name='num_movies'))

# Keep only the decades that have at least ten movies; decades with fewer are dropped.
movies = (
    movies
    .groupby(['decade'])
    .filter(lambda decade_rows: len(decade_rows) >= 10)
    .reset_index(drop=True)
)

       decade  num_movies
0   1910-1919           1
1   1920-1929           1
2   1930-1939           3
3   1940-1949          11
4   1950-1959          24
5   1960-1969          28
6   1970-1979          24
7   1980-1989          30
8   1990-1999          45
9   2000-2009         115
10  2010-2019         231
11  2020-2029         111


In [6]:
# Total the votes for every (decade, genres) pair. `genres` holds the full
# comma-separated combination, so each distinct combination is its own group.
# NOTE: this rebinds `movies` to the aggregated table, which has no `num_votes`
# column, so running this cell a second time fails unless the earlier cells
# are re-run first.
movies = (
    movies
    .groupby(['decade', 'genres'])['num_votes']
    .sum()
    .reset_index(name='total_votes')
)

# Rank the genre combinations within each decade: 1 = most votes. Dense ranking
# gives tied totals the same rank and leaves no gaps.
movies['popularity'] = (
    movies
    .groupby(['decade'])['total_votes']
    .rank(method='dense', ascending=False)
    .astype(int)
)

# Spot-check the ranking for a single decade.
movies.query('decade=="1940-1949"')

Unnamed: 0,decade,genres,total_votes,popularity
0,1940-1949,"Action,Adventure,Horror",112,9
1,1940-1949,"Action,Sci-Fi",214,8
2,1940-1949,"Adventure,Crime,Drama",1938,2
3,1940-1949,"Adventure,Horror",296,7
4,1940-1949,"Crime,Drama,Horror",799,5
5,1940-1949,"Drama,Horror,Mystery",1588,3
6,1940-1949,"Drama,Horror,Romance",4211,1
7,1940-1949,"Horror,Sci-Fi",1285,4
8,1940-1949,"Horror,Sci-Fi,Thriller",504,6


In [7]:
# Select the top-ranked genre combination of each decade (popularity rank 1)
# and renumber the rows from zero.
popular = (
    movies
    .query('popularity == 1')
    .reset_index(drop=True)
)
popular

Unnamed: 0,decade,genres,total_votes,popularity
0,1940-1949,"Drama,Horror,Romance",4211,1
1,1950-1959,"Horror,Sci-Fi",24354,1
2,1960-1969,"Action,Adventure,Sci-Fi",15761,1
3,1970-1979,"Horror,Sci-Fi",5265,1
4,1980-1989,"Action,Comedy,Fantasy",36216,1
5,1990-1999,"Biography,Drama",33117,1
6,2000-2009,"Adventure,Animation,Comedy",1013277,1
7,2010-2019,"Adventure,Animation,Comedy",426652,1
8,2020-2029,"Action,Adventure,Comedy",152673,1


In [8]:
#visualize
import plotly.express as px

# One bar per decade: height is the winning genre combination's total votes,
# colour identifies which combination won.
fig = px.bar(
    popular,
    x="decade",
    y="total_votes",
    color="genres",
    title="Popular Genres by Decade",
)
fig.show()