# Analysis of The Movies Database

## Import data and packages

In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
import re
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

%matplotlib inline
pd.set_option('display.max_columns', None) #set full rows

In [14]:
df_movies_metadata = pd.read_csv(r'./DATA/45000_plus_movies/movies_metadata.csv')
df_credits = pd.read_csv(r'./DATA/45000_plus_movies/credits.csv')
df_ratings = pd.read_csv(r'./DATA/45000_plus_movies/ratings.csv')
df_cpi = pd.read_csv(r'./DATA/cpi/API_FP.CPI.TOTL.ZG_DS2_en_csv_v2_988671.csv',skiprows=4)
df_gdp = pd.read_csv(r'./DATA/gdp/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_988718.csv',skiprows=4)

print(df_movies_metadata.memory_usage(deep=True).sum()/1000000)
print(df_credits.memory_usage(deep=True).sum()/1000000)
print(df_ratings.memory_usage(deep=True).sum()/1000000)
print(df_cpi.memory_usage(deep=True).sum()/1000000)
print(df_gdp.memory_usage(deep=True).sum()/1000000)


Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.



83.471315
203.294768
832.777376
0.206704
0.201424


## Prepare inflation by year

In [45]:
temp_list = [x for x in df_cpi.columns.tolist() if len(x)<5]

df_cpi_gdp = pd.merge(df_cpi, df_gdp, left_on='Country Name',right_on='Country Name')

for this_year in temp_list:
    df_cpi_gdp["{}_weighted_cpi".format(this_year)] = df_cpi_gdp["{}_x".format(this_year)] * df_cpi_gdp["{}_y".format(this_year)]
    
weighted_cols = [x for x in df_cpi_gdp.columns.tolist() if x.endswith("_weighted_cpi")]    

dicts_cpi = {}
for this_gdp_col, this_wave_col in zip(temp_list,weighted_cols):
    dicts_cpi[this_wave_col] = df_cpi_gdp[this_wave_col].sum() / df_cpi_gdp["{}_y".format(this_gdp_col)].sum()
    
dicts_cpi    


invalid value encountered in double_scalars



{'1960_weighted_cpi': 0.8332556738934528,
 '1961_weighted_cpi': 0.9315936763270694,
 '1962_weighted_cpi': 1.5221661199137895,
 '1963_weighted_cpi': 1.3024874980853909,
 '1964_weighted_cpi': 1.521678146139268,
 '1965_weighted_cpi': 1.7594154026790947,
 '1966_weighted_cpi': 1.9069954589430467,
 '1967_weighted_cpi': 1.7381827144201405,
 '1968_weighted_cpi': 1.9554597917338292,
 '1969_weighted_cpi': 1.8532967294159093,
 '1970_weighted_cpi': 2.520879990730323,
 '1971_weighted_cpi': 2.8091012769128882,
 '1972_weighted_cpi': 3.034524046010322,
 '1973_weighted_cpi': 4.801280725576881,
 '1974_weighted_cpi': 7.970002159173341,
 '1975_weighted_cpi': 6.413744092237752,
 '1976_weighted_cpi': 4.948858163311302,
 '1977_weighted_cpi': 6.6278603656939,
 '1978_weighted_cpi': 5.590222715851748,
 '1979_weighted_cpi': 6.327326932073842,
 '1980_weighted_cpi': 8.595804408250158,
 '1981_weighted_cpi': 11.35285057120662,
 '1982_weighted_cpi': 9.082242627755974,
 '1983_weighted_cpi': 7.966561177098277,
 '1984_w