In [None]:
#some ideas taken from https://www.kaggle.com/vrajeshbabu/movies-correlations-eda-and-visuals

import numpy as np
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
%matplotlib inline
matplotlib.rcParams['figure.figsize']=(12,8)

In [None]:
# Print the full path of every file found under the dataset's input directory.
input_files = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('../input/movies')
    for fname in fnames
)
for path in input_files:
    print(path)

In [None]:
# Load the movies CSV and discard the director, star and writer columns,
# which are not used anywhere in this analysis.
df = (
    pd.read_csv("../input/movies/movies.csv", sep=',', engine='python')
    .drop(columns=['director', 'star', 'writer'])
)
df

### Data cleaning

In [None]:
# Count missing values per column.
# In the author's run every count was zero, so the next step is checking the dtypes.
df.isna().sum()

In [None]:
# Inspect the column types. "budget" and "gross" are stored as float64 but
# hold no fractional part, so they can be converted to integers.
df.dtypes

In [None]:
# Store the two monetary columns as 64-bit integers, then re-check the dtypes.
df = df.astype({'budget': 'int64', 'gross': 'int64'})
df.dtypes

In [None]:
# The "year" column disagrees with "released" for some rows. Build a trusted
# "realyear" from the first four characters of "released" and drop "year".
df = (
    df.assign(realyear=df['released'].astype(str).str.slice(stop=4).astype('int64'))
    .drop(columns=['year'])
)

In [None]:
# Remove fully duplicated rows, printing the shape before and after.
# In the author's run both shapes matched, i.e. there were no duplicates.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

In [None]:
# Order the dataset from highest to lowest gross and show the top five films.
df = df.sort_values('gross', ascending=False)
df.head(5)

In [None]:
# Gross / budget ratio.
# A budget of 0 marks films whose budget was not reported (2,182 of them in the
# author's run); the frequency table below makes that visible.
freq = df.value_counts("budget")
print(freq)

# Keep only films where both budget and gross are present and non-zero.
# The check is limited to these two columns on purpose: replacing 0 with NaN
# across the whole frame and then calling dropna() would also discard films
# that have a 0 or a missing value in an unrelated column.
has_financials = df[['budget', 'gross']].replace(0, np.nan).notna().all(axis=1)
df_nz = df.loc[has_financials].copy()  # copy: new columns must not touch df
df_nz['budget'] = df_nz['budget'].astype('int64')
df_nz['gross'] = df_nz['gross'].astype('int64')
df_nz['ratiogb'] = df_nz['gross'] / df_nz['budget']  # gross earned per unit of budget
df_nz

### Data exploration and analysis

In [None]:
# Correlation heatmap over the numeric columns of the full dataset.
# The frame still holds text columns (genre, rating, released, ...), and
# DataFrame.corr() raises on those in pandas >= 2.0, so select the numeric
# columns explicitly. Older pandas gives the same result either way.
corr_mat = df.select_dtypes(include='number').corr()
sns.heatmap(corr_mat,annot=True)
plt.title("Correlations",size=30)
plt.xlabel("Features")
plt.ylabel("Features")
plt.show()
# Same correlation table for the df_nz subset, which also includes 'ratiogb'.
df_nz.select_dtypes(include='number').corr()

In [None]:
# Scatter of gross against budget (red points) with a fitted regression line (green).
sns.regplot(
    data=df_nz,
    x='budget',
    y='gross',
    scatter_kws=dict(color="red"),
    line_kws=dict(color="green"),
)

## By genre

In [None]:
# Median budget per genre, computed on df_nz.
# The result is kept under the notebook-wide name `genre_gross` even though it holds budgets.
genre_gross = df_nz.groupby('genre')['budget'].median().reset_index()
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(genre_gross['genre'], genre_gross['budget'])
ax.set_title("Genre vs Budget", size=30)
ax.set_xlabel("Genres", size=15)
ax.set_ylabel("Amount", size=15)
plt.show()

In [None]:
# Median gross per genre, computed on df_nz, drawn as a bar chart.
genre_gross = df_nz.groupby('genre', as_index=False)['gross'].median()
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(genre_gross['genre'], genre_gross['gross'])
ax.set_title("Genre vs Gross", size=30)
ax.set_xlabel("Genres", size=15)
ax.set_ylabel("Amount", size=15)
plt.show()

In [None]:
# Median gross/budget ratio per genre, computed on df_nz.
genre_gross = df_nz.groupby('genre', as_index=False)['ratiogb'].median()
fig = plt.figure(figsize=(18,8))
plt.bar(genre_gross['genre'],genre_gross['ratiogb'])
plt.title("Genre vs Ratio Gross/Budget",size=30)
plt.xlabel("Genres",size=15)
# The bars show a unitless ratio, not a money amount.
plt.ylabel("Ratio",size=15)
plt.show()

### Last decade

In [None]:
# Keep only the films whose release year is 2010 or later.
df_lt = df_nz.loc[df_nz['realyear'] >= 2010]
df_lt

In [None]:
# Gross against budget for the 2010+ subset: red points, green regression line.
sns.regplot(
    data=df_lt,
    x='budget',
    y='gross',
    scatter_kws=dict(color="red"),
    line_kws=dict(color="green"),
)

In [None]:
# Median budget per genre for the 2010+ subset (df_lt).
# The result is kept under the notebook-wide name `genre_gross` even though it holds budgets.
genre_gross = df_lt.groupby('genre')['budget'].median().reset_index()
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(genre_gross['genre'], genre_gross['budget'])
ax.set_title("Genre vs Budget", size=30)
ax.set_xlabel("Genres", size=15)
ax.set_ylabel("Amount", size=15)
plt.show()

In [None]:
# Median gross per genre for the 2010+ subset (df_lt).
genre_gross = df_lt.groupby('genre', as_index=False)['gross'].median()
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(genre_gross['genre'], genre_gross['gross'])
ax.set_title("Genre vs Gross", size=30)
ax.set_xlabel("Genres", size=15)
ax.set_ylabel("Amount", size=15)
plt.show()

In [None]:
# Median gross/budget ratio per genre for the 2010+ subset (df_lt).
genre_gross = df_lt.groupby('genre')['ratiogb'].median().reset_index()
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(genre_gross['genre'], genre_gross['ratiogb'])
ax.set_title("Genre vs Ratio Gross/Budget", size=30)
ax.set_xlabel("Genres", size=15)
ax.set_ylabel("Ratio", size=15)
plt.show()

### Evolution of profitability over time (trends)

In [None]:
# Split the dataset by decade and tag each genre with _X, where X is the letter
# of the decade (A: before 1990, B: 1990s, C: 2000s, D: 2010s). This makes it
# possible to follow one genre across decades and study its trend.
# Films released in 2020 or later fall outside all four groups.
release_year = df_nz['realyear']

# Each subset is built from a single combined mask and copied, so the column
# added below is written to an independent frame. Chaining two boolean
# selections (df[m1][m2]) forces pandas to realign the second mask and warns,
# and assigning to the resulting slice raises SettingWithCopyWarning.
df_a = df_nz[release_year < 1990].copy()
df_b = df_nz[(release_year >= 1990) & (release_year < 2000)].copy()
df_c = df_nz[(release_year >= 2000) & (release_year < 2010)].copy()
df_d = df_nz[(release_year >= 2010) & (release_year < 2020)].copy()

for decade_df, suffix in ((df_a, "_A"), (df_b, "_B"), (df_c, "_C"), (df_d, "_D")):
    decade_df['genre_decade'] = decade_df['genre'].astype(str) + suffix

df_new = pd.concat([df_a, df_b, df_c, df_d])

In [None]:
# Median gross/budget ratio for every genre/decade combination.
genre_gross = df_new.groupby('genre_decade', as_index=False)['ratiogb'].median()
# A 200x100 inch canvas renders as an enormous image in which the 15-30 pt text
# is unreadable; a wide figure with vertical tick labels fits the many bars.
fig = plt.figure(figsize=(24,10))
plt.bar(genre_gross['genre_decade'],genre_gross['ratiogb'])
plt.title("Genre and Decade vs Ratio Gross/Budget",size=30)
plt.xlabel("Genre_Decade (A: before 1990, B: 1990s, C: 2000s, D: 2010s)",size=15)
plt.ylabel("Ratio",size=15)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Trend for a single genre: median gross/budget ratio of Mystery films per decade.
# (The subset keeps the name df_adv although it holds the Mystery rows.)
df_adv = df_new[df_new['genre'] == "Mystery"]
genre_gross = df_adv.groupby('genre_decade', as_index=False)['ratiogb'].median()
fig = plt.figure(figsize=(16,8))
plt.bar(genre_gross['genre_decade'],genre_gross['ratiogb'])
# Labels describe what is actually drawn: one genre over decades, y is a ratio.
plt.title("Mystery: Ratio Gross/Budget by Decade",size=30)
plt.xlabel("Decade (A: before 1990, B: 1990s, C: 2000s, D: 2010s)",size=15)
plt.ylabel("Ratio",size=15)
plt.show()

## Age rating

In [None]:
# Median gross/budget ratio per age rating, computed on df_nz.
# The result reuses the notebook-wide name `genre_gross` although it is grouped by rating.
genre_gross = df_nz.groupby('rating', as_index=False)['ratiogb'].median()
fig = plt.figure(figsize=(18,8))
plt.bar(genre_gross['rating'],genre_gross['ratiogb'])
# The x axis holds age ratings, not genres, so title and label say so.
plt.title("Age Rating vs Ratio Gross/Budget",size=30)
plt.xlabel("Age rating",size=15)
plt.ylabel("Ratio",size=15)
plt.show()

## Votes, Score

In my opinion, this analysis would not make sense: votes and score are consequences of a film's popularity and quality, so they cannot be taken into account when planning a production.