In [9]:
import pandas as pd
import plotly.express as px

# Load the movie metadata export; the analysis cells below all read from `df`.
df = pd.read_csv(filepath_or_buffer='movies_temp.csv')

In [10]:
# Show the column names of the loaded frame to see which fields are available.
df.columns

Index(['updated_at', 'movie_name', 'release_date', 'tvdb_id', 'slug', 'image',
       'score', 'runtime', 'genres', 'budget', 'boxOffice', 'originalCountry',
       'studios', 'awards', 'tagOptions', 'contentRatings', 'companies',
       'production_countries'],
      dtype='object')

In [11]:
# Bar chart: number of movies recorded for each originalCountry value.
countries_count = df['originalCountry'].value_counts().reset_index()
# Rename positionally so the result is the same regardless of how
# value_counts()/reset_index() label the two columns.
countries_count.columns = ['country', 'count']
countries_count = countries_count.sort_values(by='count', ascending=False)

# The update_* methods return the figure itself, so the calls can be chained.
fig = (
    px.bar(countries_count, x='country', y='count', text='count')
    .update_traces(texttemplate='%{text}', textposition='outside')
    .update_layout(title_text='Number of movies per country')
)
fig.show()

In [21]:
# movies whose originalCountry code is 'kaz'
# (presumably Kazakhstan — TODO confirm the country-code scheme used by the data source)
m = df[df['originalCountry'] == 'kaz']
# Bare expression so the notebook renders the matching rows.
m

Unnamed: 0,updated_at,movie_name,release_date,tvdb_id,slug,image,score,runtime,genres,budget,boxOffice,originalCountry,studios,awards,tagOptions,contentRatings,companies,production_countries
501,2019-11-04 20:04:22,The Mist,0001-01-01 00:00:00,326887,aksuat,https://artworks.thetvdb.com/banners/movies/32...,0,78.0,"[{'id': 12, 'name': 'Drama', 'slug': 'drama'}]",,,kaz,,,,,"{'studio': [], 'network': [], 'production': [{...",[]


In [22]:
# Budget distribution per country: one box per originalCountry, with every
# movie drawn as an individual point (hover shows the slug).
#
# A budget of 0 is treated as "unknown". `!= 0` alone evaluates to True for
# NaN, so missing budgets are excluded explicitly as well.
known_budget = df[df['budget'].notna() & (df['budget'] != 0)]

# No `color="budget"`: px.box treats `color` as a categorical grouping, so
# colouring by the y-value creates a separate trace and legend entry for each
# distinct budget instead of one box per country.
fig = px.box(
    known_budget,
    x="originalCountry",
    y="budget",
    points="all",
    hover_data=['slug'],
    title='Budget per country',
)
fig.show()

In [23]:
# Box-office distribution per country: one box per originalCountry, with every
# movie drawn as an individual point (hover shows the slug).
#
# A boxOffice of 0 is treated as "unknown". `!= 0` alone evaluates to True for
# NaN, so missing values are excluded explicitly as well.
known_boxOffice = df[df['boxOffice'].notna() & (df['boxOffice'] != 0)]

# No `color="boxOffice"`: px.box treats `color` as a categorical grouping, so
# colouring by the y-value creates a separate trace and legend entry for each
# distinct value instead of one box per country.
fig = px.box(
    known_boxOffice,
    x="originalCountry",
    y="boxOffice",
    points="all",
    hover_data=['slug'],
    title='Box office per country',
)
fig.show()

In [15]:
# Relationship between budget and boxOffice, with an OLS trendline.
# Rows where either value equals 0 are left out.
valid = df.loc[(df['budget'] != 0) & (df['boxOffice'] != 0)]
fig = px.scatter(
    valid,
    x="budget",
    y="boxOffice",
    trendline="ols",
    hover_data=['slug'],
)
fig.show()


A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5



In [16]:
# Small multiples: budget vs. boxOffice with an OLS trendline, one facet per
# originalCountry. Only countries with more than MIN_MOVIES_PER_COUNTRY usable
# movies get a facet.
MIN_MOVIES_PER_COUNTRY = 10

# A value of 0 is treated as "unknown". `!= 0` alone evaluates to True for NaN,
# which would let movies with a missing budget/boxOffice count towards the
# per-country threshold, so missing values are excluded explicitly.
has_budget = df['budget'].notna() & (df['budget'] != 0)
has_box_office = df['boxOffice'].notna() & (df['boxOffice'] != 0)
valid = df[has_budget & has_box_office]

# Per-country number of usable movies, most frequent first.
countries_count = valid['originalCountry'].value_counts().reset_index()
countries_count.columns = ['country', 'count']
countries_count = countries_count.sort_values('count', ascending=False)

# Keep only the movies from countries that pass the threshold.
frequent_countries = countries_count.loc[
    countries_count['count'] > MIN_MOVIES_PER_COUNTRY, 'country'
]
valid = valid[valid['originalCountry'].isin(frequent_countries)]

fig = px.scatter(valid, x="budget", y="boxOffice", hover_data=['slug'], trendline="ols", facet_col="originalCountry")
fig.show()

In [26]:
# Scatter of release date vs. budget (hover shows the slug).
#
# release_date is read from the CSV as text in 'YYYY-MM-DD HH:MM:SS' form, so a
# plain string comparison orders the values chronologically. The data contains
# placeholder dates such as '0001-01-01 00:00:00'; anything before the cutoff
# is dropped so those rows do not stretch the time axis.
# NOTE(review): the cutoff follows the stated intent (1900); a previously
# disabled filter used '1940-01-01' — confirm which one is wanted.
RELEASE_DATE_CUTOFF = '1900-01-01'

# A budget of 0 is treated as "unknown". `!= 0` alone evaluates to True for
# NaN, so missing budgets are excluded explicitly as well.
valid = df[df['budget'].notna() & (df['budget'] != 0)]
# exclude the ones before 1900 (rows with a missing release_date compare False
# and are dropped too)
valid = valid[valid['release_date'] >= RELEASE_DATE_CUTOFF]
fig = px.scatter(valid, x="release_date", y="budget", hover_data=['slug'])
fig.show()