### Plan

1. Data cleanup
    - Formatting issues
    - Missing values
 

2. Exploration
    - Visualizations

### Import Packages

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Functions

In [None]:
def turn_to_int(dataframe, column):
    """Convert a currency-formatted column to integers, in place.

    Every value is rendered as text, stripped of "$" and "," characters,
    and the column is then cast with ``astype(int)``.

    Args:
        dataframe: DataFrame that owns ``column``; it is modified in place.
        column: Label of the column to convert.

    Raises:
        ValueError: If a cleaned value is not a valid integer literal
            (for example a missing value, which renders as "nan").
    """
    # str(raw) makes the conversion repeatable: re-running the cell on a
    # column that already holds integers no longer fails on int.replace.
    cleaned = dataframe[column].apply(
        lambda raw: str(raw).replace("$", "").replace(",", "")
    )
    # Assign only after the cast succeeded so a failed conversion does not
    # leave the column half-cleaned.
    dataframe[column] = cleaned.astype(int)

In [None]:
def turn_to_obj(dataframe, column):
    """Replace ``column`` of ``dataframe`` with its string representation.

    The frame is modified in place and nothing is returned.
    """
    as_text = dataframe[column].astype(str)
    dataframe[column] = as_text

In [None]:
def drop_columns(dataframe, column):
    """Remove one column label, or a list of labels, from ``dataframe``.

    The frame is modified in place and nothing is returned.
    """
    dataframe.drop(columns=column, inplace=True)

In [None]:
def merge_dataframes(df_1, df_2):
    """Left-join ``df_2`` onto ``df_1`` using the shared "title" column.

    Every row of ``df_1`` is kept; columns from ``df_2`` are missing where
    no title matches.
    """
    return df_1.merge(df_2, how="left", on="title")

In [None]:
def calculate_interquartile_range(dataframe, column):
    """Return the interquartile range (75th minus 25th percentile).

    Args:
        dataframe: DataFrame holding the data.
        column: A single column label, or a list of labels.

    Returns:
        A scalar for a single label, or a Series indexed by column name
        when a list of labels is given.
    """
    # describe() is computed once, and the quartiles are looked up by label
    # so the result does not depend on the row order of the summary.
    summary = dataframe[column].describe()
    return summary.loc["75%"] - summary.loc["25%"]

In [None]:
def drop_outliers(dataframe, column):
    """Return the rows whose ``column`` value lies strictly inside the
    1.5 * IQR fences.

    Args:
        dataframe: DataFrame to filter; it is not modified.
        column: Label of the numeric column used to detect outliers.

    Returns:
        The subset of ``dataframe`` with
        ``Q1 - 1.5 * IQR < value < Q3 + 1.5 * IQR``. Rows with a missing
        value in ``column`` fail both comparisons and are dropped as well.
    """
    # The quartiles come from the frame passed in. The previous version
    # referenced an undefined global ``data`` when computing the IQR.
    summary = dataframe[column].describe()
    first_quartile = summary.loc["25%"]
    third_quartile = summary.loc["75%"]
    fence_width = 1.5 * (third_quartile - first_quartile)

    lower_outlier = first_quartile - fence_width
    upper_outlier = third_quartile + fence_width

    return dataframe[(dataframe[column] > lower_outlier)
                     & (dataframe[column] < upper_outlier)]

In [None]:
def calculate_mean(dataframe, column, genres):
    """Return the outlier-free mean of ``column`` for each genre.

    Outliers of ``column`` are removed first with ``drop_outliers``. For each
    entry of ``genres`` the rows whose "genres" text contains that entry are
    selected and the mean of ``column`` is rounded to two decimals.

    Args:
        dataframe: DataFrame with a "genres" text column and ``column``.
        column: Label of the numeric column to average.
        genres: Iterable of genre names to look for.

    Returns:
        A list of means, in the same order as ``genres``.
    """
    df_without_outlier = drop_outliers(dataframe, column)
    genre_text = df_without_outlier["genres"]

    # Average only the requested column: taking the mean of the whole frame
    # raises on its text columns in pandas >= 2.0.
    # na=False treats rows without genre information as non-matching, and
    # regex=False matches the genre name literally.
    return [
        round(
            df_without_outlier.loc[
                genre_text.str.contains(genre, na=False, regex=False), column
            ].mean(),
            2,
        )
        for genre in genres
    ]

In [None]:
def conservative_ranking(genre, col1, col2):
    """Return one weighted score per genre: 40% of ``col1`` plus 60% of ``col2``.

    Values are read from the module-level ``df_mean`` table, which must be
    indexed by the labels in ``genre``. The result follows the order of
    ``genre``.
    """
    return [
        (0.4 * df_mean.loc[name, col1]) + (0.6 * df_mean.loc[name, col2])
        for name in genre
    ]

In [None]:
def compromised_ranking(genre, col1, col2, col3):
    """Return one weighted score per genre with weights 0.2 / 0.4 / 0.4.

    The weights apply to ``col1``, ``col2`` and ``col3`` respectively. Values
    are read from the module-level ``df_mean`` table, which must be indexed
    by the labels in ``genre``. The result follows the order of ``genre``.
    """
    return [
        (0.2 * df_mean.loc[name, col1])
        + (0.4 * df_mean.loc[name, col2])
        + (0.4 * df_mean.loc[name, col3])
        for name in genre
    ]

In [None]:
def aggressive_ranking(genre, col1, col2, col3):
    """Return one weighted score per genre with weights 0.3 / 0.4 / 0.3.

    The weights apply to ``col1``, ``col2`` and ``col3`` respectively. Values
    are read from the module-level ``df_mean`` table, which must be indexed
    by the labels in ``genre``. The result follows the order of ``genre``.
    """
    return [
        (0.3 * df_mean.loc[name, col1])
        + (0.4 * df_mean.loc[name, col2])
        + (0.3 * df_mean.loc[name, col3])
        for name in genre
    ]

In [None]:
# Percent Weight Bar Graph
def percent_weight_bar_graph(dataframe, x_value, y_value, hue_value):
    """Plot, save and show a grouped bar chart of ranking weights.

    Args:
        dataframe: Data passed to ``sns.catplot``.
        x_value: Column plotted on the x axis.
        y_value: Column plotted on the y axis.
        hue_value: Column used to colour the bars.

    The figure is written to "percent_of_weights_bar.png" and then shown.
    """
    grid = sns.catplot(
        data=dataframe,
        x=x_value,
        y=y_value,
        hue=hue_value,
        kind="bar",
    )
    sns.despine(left=False, bottom=False)
    grid.fig.set_size_inches(10, 5)

    plt.title("Percent of Weights")
    plt.xlabel("Ranking Type")
    plt.ylabel("Weight Percentages")

    # Leave headroom above the axes for the title.
    plt.subplots_adjust(top=0.9)

    plt.savefig("percent_of_weights_bar.png")
    plt.show()

In [None]:
def weighted_ranking_bar_graph(dataframe, column, title):
    """Plot, save and show a bar chart of one ranking column, sorted ascending.

    The bars holding the smallest and largest value are drawn in salmon,
    all others in dodgerblue.

    Args:
        dataframe: Rankings table; its index supplies the bar labels.
        column: Label of the ranking column to plot.
        title: Plot title, also used as the prefix of the saved file name.

    The figure is written to "<title>_bar.png" and then shown.
    """
    plt.figure(figsize=(12, 7))

    ordered_rankings = dataframe.sort_values(by=[column])
    values = np.array(ordered_rankings[column])

    # Find the extremes once instead of once per bar. default=None keeps an
    # empty table from raising here.
    lowest = min(values, default=None)
    highest = max(values, default=None)
    clrs = ["dodgerblue" if lowest < x < highest else "salmon" for x in values]

    # seaborn >= 0.12 only accepts x and y as keyword arguments.
    sns.barplot(x=ordered_rankings.index, y=values, palette=clrs)
    sns.despine(left=False, bottom=False)

    plt.ylabel("Weighted Sum of Averages")
    plt.xlabel("Genres")
    plt.title(title)

    plt.savefig(f"{title}_bar.png")

    plt.show()

In [None]:
def weighted_ranking_box_plot(dataframe, column, title):
    """Plot, save and show a horizontal box plot of one ranking column.

    Args:
        dataframe: Rankings table.
        column: Label of the ranking column whose distribution is drawn.
        title: Used in the plot title and as the prefix of the saved file name.

    The figure is written to "<title>_box.png" and then shown.
    """
    # Pass the values as x= explicitly. In seaborn >= 0.12 the first
    # positional parameter is ``data``, which would no longer put the values
    # on the x axis that is labelled below.
    sns.boxplot(x=dataframe[column], color="lightsalmon")
    sns.despine(left=True)

    plt.xlabel("Weighted Sum of Averages")
    plt.title(f"{title} Distribution")

    plt.savefig(f"{title}_box.png")

    plt.show()

In [None]:
def min_and_max_values(dataframe, column):
    """Return the entries of ``column`` equal to its minimum or maximum.

    The result is a Series that keeps the original index labels and row
    order; ties for either extreme are all included.
    """
    series = dataframe[column]
    is_extreme = (series == series.min()) | (series == series.max())
    return series.loc[is_extreme]

### 1. Data Cleanup

### Read data

In [None]:
# Load the three raw CSV files from the local "files/" directory.
df_budgets = pd.read_csv("files/tn.movie_budgets.csv")
df_popularity = pd.read_csv("files/tmdb.movies.csv")
df_basics = pd.read_csv("files/imdb.title.basics.csv")

### Update Formats

##### 1) Convert numeric columns from string to integer

In [None]:
# Strip "$" and "," from each money column of df_budgets and cast it to int
# (in place).
turn_to_int(df_budgets, "production_budget")
turn_to_int(df_budgets, "domestic_gross")
turn_to_int(df_budgets, "worldwide_gross")

##### 2) Convert non-numeric columns from integer to string

In [None]:
# Cast start_year to str (in place); it is treated as a label, not a quantity.
turn_to_obj(df_basics, "start_year")

### Drop Columns


In [None]:
# Drop the listed columns from df_popularity (in place).
drop_columns(df_popularity, ["id", "Unnamed: 0", "genre_ids",
                             "original_title", "original_language"])

In [None]:
# Drop the id and release_date columns from df_budgets (in place).
drop_columns(df_budgets, ["id", "release_date"])

In [None]:
# Drop the tconst, runtime_minutes and original_title columns from df_basics
# (in place).
drop_columns(df_basics, ["tconst", "runtime_minutes", "original_title"])

### Merge datasets

##### 1) Rename

In [None]:
# merge_dataframes joins on "title", so rename the "movie" column to match.
df_budgets.rename(columns = {"movie": "title"}, inplace=True)

In [None]:
# merge_dataframes joins on "title", so rename "primary_title" to match.
df_basics.rename(columns = {"primary_title": "title"}, inplace=True)

##### 2) Merge

In [None]:
# Left-join budgets onto popularity, then left-join that result onto basics.
# Both joins use "title", so every df_basics row is kept in df_movies_final.
df_movie = merge_dataframes(df_popularity, df_budgets)
df_movies_final = merge_dataframes(df_basics, df_movie)

### Missing Values

In [None]:
# Back-fill missing release dates with the title's start_year.
# The result is assigned back instead of calling fillna(inplace=True) on the
# attribute-accessed column: that chained in-place update is not guaranteed
# to reach df_movies_final when pandas hands back a copy of the column.
df_movies_final["release_date"] = df_movies_final["release_date"].fillna(
    df_movies_final["start_year"]
)

In [None]:
# Drop the start_year column from the merged frame (in place).
drop_columns(df_movies_final, "start_year")

### New Columns

In [None]:
# foreign_gross is the part of worldwide_gross that is not domestic_gross.
df_movies_final["foreign_gross"] = ((df_movies_final.worldwide_gross
                                     - df_movies_final.domestic_gross))

In [None]:
# net_profit is worldwide_gross minus production_budget.
df_movies_final["net_profit"] = ((df_movies_final.worldwide_gross
                                  - df_movies_final.production_budget))

### 2. Exploration

##### Calculate Mean of DataFrame without Outliers

1) Find Interquartile Range to calculate Outliers

In [None]:
# Show the interquartile range of each listed column. The result is not
# assigned, so it only appears as the cell output.
calculate_interquartile_range(df_movies_final,
                              ["popularity", "vote_average", "vote_count"])

In [None]:
# Genre names to summarise; they become the index of df_mean and df_rankings.
genre = ["Action", "Adventure", "Animation", "Comedy", "Crime", "Drama",
         "Fantasy", "Horror", "Mystery", "Romance", "Sci-Fi", "Thriller"]

2) Create new DataFrame of Means for each column based on Genre

In [None]:
# Start an empty summary table with one row per genre (genres as the index).
df_mean = pd.DataFrame(genre, columns=['genres'])
df_mean.set_index('genres', inplace=True)

In [None]:
# Fill df_mean column by column: each call returns one outlier-free mean per
# genre, in the same order as the genre list that forms the index.
df_mean["production_budget"] = calculate_mean(df_movies_final, "production_budget", genre)

df_mean["domestic_gross"] = calculate_mean(df_movies_final, "domestic_gross", genre)

df_mean["foreign_gross"] = calculate_mean(df_movies_final, "foreign_gross", genre)

df_mean["worldwide_gross"] = calculate_mean(df_movies_final, "worldwide_gross", genre)

df_mean["net_profit"] = calculate_mean(df_movies_final, "net_profit", genre)

df_mean["popularity"] = calculate_mean(df_movies_final, "popularity", genre)

df_mean["vote_average"] = calculate_mean(df_movies_final, "vote_average", genre)

df_mean["vote_count"] = calculate_mean(df_movies_final, "vote_count", genre)

# Display the finished table as the cell output.
df_mean

##### Calculate weighted rankings for each genre

    - Conservative 
    - Compromised
    - Aggressive

In [None]:
# Start an empty rankings table with one row per genre (genres as the index).
df_rankings = pd.DataFrame(genre, columns=["genres"])
df_rankings.set_index('genres', inplace=True)

In [None]:
# Each ranking function reads the per-genre means from df_mean and returns
# one weighted score per genre, in index order.
df_rankings["conservative"] = conservative_ranking(df_rankings.index, "production_budget", "domestic_gross")

df_rankings["compromised"] = compromised_ranking(df_rankings.index, "production_budget", "domestic_gross", "foreign_gross")

df_rankings["aggressive"] = aggressive_ranking(df_rankings.index, "popularity", "vote_average", "vote_count")

# Display the finished table as the cell output.
df_rankings