### Plan

1. Data cleanup
    - Formatting issues
    - Missing values
 

2. Exploration (EDA)


3. Evaluation
    - Visualizations
    - Outputs (Final Recommendations)

### Import Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import movie_functions as mf
import numpy as np

### 1. Data Cleanup

### Read data

- Using data sets from IMDb, TheMovieDB (TMDb), and The Numbers (`tn.movie_budgets.csv`)

In [None]:
# Raw inputs, read in a fixed order: budget figures (tn.* file), TMDb movie
# data, then IMDb title basics.
# NOTE(review): the "tn." prefix presumably means The Numbers, not TMDb — confirm.
df_budgets, df_popularity, df_basics = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/tn.movie_budgets.csv",
        "data/tmdb.movies.csv",
        "data/imdb.title.basics.csv",
    )
)

### Update data formats

##### Convert numeric columns stored as strings into numbers (scaled to millions)

In [None]:
# Rescale the last three columns of the budgets table, one column at a time.
# NOTE(review): presumably these are the dollar-amount columns stored as
# strings — confirm against mf.scale_to_millions.
money_columns = list(df_budgets.columns[-3:])
for money_column in money_columns:
    df_budgets[money_column] = mf.scale_to_millions(df_budgets, money_column)

### Merge datasets

##### 1) Rename

In [None]:
# Give both tables a shared "title" column so they can be joined on it.
for frame, old_name in ((df_budgets, "movie"), (df_basics, "primary_title")):
    frame.rename(columns={old_name: "title"}, inplace=True)

##### 2) Merge

In [None]:
# Left-join the budgets onto the TMDb table by title, then left-join that
# result onto the IMDb basics table, so every IMDb row is kept.
# NOTE(review): joining on title alone may duplicate rows when titles repeat
# across films — confirm this is acceptable for the averages below.
tmdb_with_budgets = df_popularity.merge(df_budgets, how="left", on="title")
df_movies = df_basics.merge(tmdb_with_budgets, how="left", on="title")

### Drop Columns

In [None]:
# Columns removed from the merged table before the analysis; the _x/_y names
# are the suffixes pandas adds to clashing columns during the merges.
unused_columns = [
    "id_x", "Unnamed: 0", "genre_ids",
    "original_title_x", "original_language",
    "release_date_x", "id_y", "release_date_y", "tconst",
    "runtime_minutes", "original_title_y", "start_year",
]
# `columns=` already targets the column axis, so no separate axis argument is needed.
df_movies.drop(columns=unused_columns, inplace=True)

### New Columns

In [None]:
# Foreign gross is the worldwide total minus the domestic gross.
df_movies["foreign_gross"] = df_movies["worldwide_gross"].sub(
    df_movies["domestic_gross"]
)

In [None]:
# Net profit is the worldwide gross minus the production budget.
df_movies["net_profit"] = df_movies["worldwide_gross"].sub(
    df_movies["production_budget"]
)

In [None]:
# Inspect the merged table after adding foreign_gross and net_profit.
df_movies

### 2. Exploration

##### Calculate Mean of DataFrame without Outliers

1) Find Interquartile Range to calculate Outliers

In [None]:
# Genres included in the per-genre means and rankings below.
genre = [
    "Action",
    "Adventure",
    "Animation",
    "Comedy",
    "Crime",
    "Drama",
    "Fantasy",
    "Horror",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
]

2) Create new DataFrame of Means for each column based on Genre

In [None]:
# Start with a column-less frame indexed by genre; the mean columns are
# attached to it afterwards.
df_mean = pd.DataFrame({"genres": genre}).set_index("genres")

In [None]:
# Every column from the third onward is passed to mf.calculate_mean together
# with the genre list; the result becomes the same-named column of df_mean.
metric_columns = list(df_movies.columns[2:])
for metric in metric_columns:
    df_mean[metric] = mf.calculate_mean(df_movies, metric, genre)

df_mean

##### Calculate weighted rankings for each genre

    - Conservative 
    - Compromised
    - Aggressive

In [None]:
# Start with a column-less frame indexed by genre; one ranking column per
# scenario is attached to it afterwards.
df_rankings = pd.DataFrame({"genres": genre}).set_index("genres")

In [None]:
# Score every genre under each scenario from the per-genre means in df_mean.
# Each helper receives the genre index, the means table and the names of the
# columns that scenario uses.
genre_index = df_rankings.index

df_rankings["conservative"] = mf.conservative_ranking(
    genre_index, df_mean, "production_budget", "domestic_gross"
)

df_rankings["compromised"] = mf.compromised_ranking(
    genre_index, df_mean, "production_budget", "domestic_gross", "foreign_gross"
)

df_rankings["aggressive"] = mf.aggressive_ranking(
    genre_index, df_mean, "popularity", "vote_average", "vote_count"
)

df_rankings

In [None]:
# Reference table of the percentage weight given to each input under each
# ranking scenario; it is the data behind the weight bar graph in the
# Evaluation section.
# NOTE(review): presumably mirrors the weights applied inside the
# mf.*_ranking helpers — confirm the two stay in sync.
ranking_type = ["conservative", "conservative",
                "compromised", "compromised", "compromised",
                "aggressive", "aggressive", "aggressive"]

# `columns` must be an ordered list-like: a set literal is unordered and is
# rejected by current pandas ("ValueError: columns cannot be a set").
df_percent_weight = pd.DataFrame(ranking_type, columns=["ranking_type"])

# One entry per row, aligned position-by-position with ranking_type.
df_percent_weight["inputs"] = ["production_budget", "domestic_gross",
                               "production_budget", "domestic_gross", "foreign_gross",
                               "popularity", "vote_average", "vote_count"]

df_percent_weight["percent"] = [40, 60, 20, 40, 40, 30, 40, 30]

# Sanity check: the weights of each scenario must add up to exactly 100 percent.
assert df_percent_weight.groupby("ranking_type")["percent"].sum().eq(100).all()

df_percent_weight

### 3. Evaluation: Our recommended movie genres


##### Visualizations

In [None]:
# Bar graph built from df_percent_weight: the "percent" weight of each entry
# in "inputs", organised by "ranking_type".
mf.percent_weight_bar_graph(df_percent_weight, "ranking_type", "percent", "inputs")

- Conservative

In [None]:
# Bar graph of the weighted sum of averages per film genre for the
# conservative scenario (lowest and highest ranks highlighted in red).
mf.weighted_ranking_bar_graph(df_rankings, "conservative", "Conservative Ranking")

In [None]:
# Box plot showing how the conservative weighted sums of averages are distributed.
mf.weighted_ranking_box_plot(df_rankings, "conservative", "Conservative Ranking")

- Compromised

In [None]:
# Bar graph of the weighted sum of averages per film genre for the
# compromised scenario (lowest and highest ranks highlighted in red).
mf.weighted_ranking_bar_graph(df_rankings, "compromised", "Compromised Ranking")

In [None]:
# Box plot showing how the compromised weighted sums of averages are distributed.
mf.weighted_ranking_box_plot(df_rankings, "compromised", "Compromised Ranking")

- Aggressive

In [None]:
# Bar graph of the weighted sum of averages per film genre for the
# aggressive scenario (lowest and highest ranks highlighted in red).
mf.weighted_ranking_bar_graph(df_rankings, "aggressive", "Aggressive Ranking")

In [None]:
# Box plot showing how the aggressive weighted sums of averages are distributed.
mf.weighted_ranking_box_plot(df_rankings, "aggressive", "Aggressive Ranking")

##### Outputs

- The minimum- and maximum-ranked genres per scenario, with their weighted sums of averages (from the sensitivity analysis)

In [None]:
# Min and max entries of the "conservative" column of df_rankings.
# NOTE(review): presumably returns the lowest/highest-ranked genres with their scores — confirm in movie_functions.
mf.min_and_max_values(df_rankings, "conservative")

In [None]:
# Min and max entries of the "compromised" column of df_rankings.
# NOTE(review): presumably returns the lowest/highest-ranked genres with their scores — confirm in movie_functions.
mf.min_and_max_values(df_rankings, "compromised")

In [None]:
# Min and max entries of the "aggressive" column of df_rankings.
# NOTE(review): presumably returns the lowest/highest-ranked genres with their scores — confirm in movie_functions.
mf.min_and_max_values(df_rankings, "aggressive")