# Cleaning & EDA

- Split the data into train and test sets before moving on to modeling

# Principals

- Contains the principal cast/crew for titles:

1. tconst (string) - alphanumeric unique identifier of the title
2. ordering (integer) – a number to uniquely identify rows for a given titleId
3. nconst (string) - alphanumeric unique identifier of the name/person
4. category (string) - the category of job that person was in
5. job (string) - the specific job title if applicable, else '\N'
6. characters (string) - the name of the character played if applicable, else '\N'

In [1]:
# importing the modules
import pandas as pd
import numpy as np
 
# Load the raw principals table; the file is tab-separated.
principle_df = pd.read_csv(
    '../data/raw/principals.tsv',
    delimiter='\t',
)

In [2]:
# Preview the first five rows of the principals table.
principle_df.head(n=5)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [None]:
# Local import: the notebook's visible import cell loads only pandas and numpy.
import csv

# NOTE(review): this path ('title.principals.tsv') differs from the file read
# into principle_df ('principals.tsv') — confirm which one is intended.
# newline="" lets the csv module handle line endings itself, as its
# documentation requires for files passed to csv.reader.
tsv_file = open("../data/raw/title.principals.tsv", newline="")

# csv.reader is lazy: rows are only read when read_tsv is iterated, so
# tsv_file must stay open until then. Call tsv_file.close() once finished.
read_tsv = csv.reader(tsv_file, delimiter="\t")

## Metadata

### **Supporting Actor(s)**

In [None]:
# Build a (id, supporting) lookup from the cast rows whose 'order' equals 1.
# NOTE(review): presumably 'order' is the billing position with 0 as the
# lead, which would make order 1 the first supporting role — confirm.
supporting = cast_df.loc[cast_df['order'] == 1, ['movie_id', 'name']]

# Rename so the key matches the metadata's 'id' column and the name column
# describes the role.
supporting = supporting.rename(columns={'movie_id': 'id', 'name': 'supporting'})
supporting.head(n=5)

In [None]:
# Left join: keep every metadata row and attach the 'supporting' name where
# an 'id' match exists.
imdb_metadata = pd.merge(
    left=imdb_metadata,
    right=supporting,
    how='left',
    on='id',
)

In [None]:
# Preview the merged metadata (first five rows).
imdb_metadata.head(n=5)

## **Crew**

Create a new dataframe that keeps only the crew columns used in the steps below

In [None]:
# Preview the raw crew table (first five rows).
crew_df.head(n=5)

In [None]:
# Narrow the crew table to the five columns used by the job filters below.
crew_df = crew_df.loc[:, ['name', 'job', 'department', 'gender', 'movie_id']]
crew_df.head(n=5)

### **Director**

In [None]:
# Crew rows whose job is exactly 'Director', reduced to the movie key and the
# person's name.
director = crew_df.loc[crew_df['job'] == 'Director', ['movie_id', 'name']]

# Align the key with the metadata's 'id' column and label the name by role.
director = director.rename(
    columns={
        'movie_id': 'id',
        'name': 'director',
    }
)

In [None]:
# Preview the director lookup (first five rows).
director.head(n=5)

In [None]:
# Attach director names to the metadata. The left join keeps every metadata
# row; rows without a matching 'id' in `director` get NaN in 'director'.
# NOTE(review): a movie with several 'Director' rows in crew_df will appear
# once per director after this merge — confirm that is intended.
dataset = pd.merge(imdb_metadata, director, on = 'id', how = 'left')

# Rows with nulls are deliberately kept (no dropna is applied here), so report
# the total and how many rows a dropna WOULD remove, instead of printing the
# same count twice under "before"/"after" labels.
rows_with_nulls = len(dataset) - len(dataset.dropna())
print("Number of rows after merging directors:", len(dataset))
print("Number of rows containing null values (not dropped):", rows_with_nulls)

### **Executive Producer**

In [None]:
# Crew rows whose job is exactly 'Executive Producer', reduced to the movie
# key and the person's name.
Executive_Producer = crew_df.loc[
    crew_df['job'] == 'Executive Producer',
    ['movie_id', 'name'],
]

# Align the key with the metadata's 'id' column and label the name by role.
Executive_Producer = Executive_Producer.rename(
    columns={
        'movie_id': 'id',
        'name': 'Executive Producer',
    }
)

In [None]:
# Preview the executive-producer lookup (first five rows).
Executive_Producer.head(n=5)

## Repeat these steps for any additional jobs, departments, or individuals you would like to examine...

list:
- Producer
- Director of Photography
- Editor
- Casting
- Screenplay
- Production Design
- Original Music Composer
- Music
- Music Supervisor
- Costume Designer
- Gaffer
- etc....

## Seaborn stacked barplot 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

# One figure/axes pair; both bar layers below are drawn onto this axes
f, ax = plt.subplots(figsize=(12, 8))

# The 20 highest-revenue titles together with their budgets
profit = high_gp = (
    imdb_final[['title', 'revenue', 'budget']]
    .sort_values(by="revenue", ascending=False)
    .head(20)
)

# First layer: one horizontal bar per title showing revenue
sns.set_color_codes("pastel")
sns.barplot(
    data=profit,
    x="revenue",
    y="title",
    label="Total Revenue",
    color="orange",
    ax=ax,
)

# Second layer: budget bars drawn on the same axes, over the revenue bars
sns.set_color_codes("muted")
sns.barplot(
    data=profit,
    x="budget",
    y="title",
    label="Budget",
    color="burlywood",
    ax=ax,
)

# Legend, axis labels and title
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set_ylabel("Movie Titles")
ax.set_xlabel("Total (USD Billions)")
ax.set_title('Budget v Revenue')
sns.despine(left=True, bottom=True)

## Plotly Stacked bar chart - Losses

In [None]:
# Imported here because no visible earlier cell brings `go` into scope.
import plotly.graph_objects as go

# The 20 titles with the lowest gross_profit values.
# The frame gets its own name so `profit` is not rebound from a DataFrame to
# a Series halfway through the cell.
flops = imdb_final[['title', 'budget', 'revenue', 'gross_profit']].sort_values(by = 'gross_profit', ascending = True).head(20)
title = flops['title']
investment = flops['budget']
revenue = flops['revenue']
profit = flops['gross_profit']


fig = go.Figure()

# Budget bars. The original passed an undefined name `budget` here; the
# budget series is bound to `investment`.
fig.add_trace(go.Bar(
    y = title,
    x = investment,
    name = 'Investment',
    orientation='h',
    marker=dict(
        color='rgb(229, 196, 148)',
        line=dict(color='rgb(229, 196, 148)', width=1)
    )
))
# NOTE(review): this trace is labelled 'Revenue' but plots gross_profit, and
# the `revenue` series above is otherwise unused — confirm which was intended.
fig.add_trace(go.Bar(
    y = title,
    x = profit,
    name = 'Revenue',
    orientation='h',
    marker=dict(
        color='rgb(218, 165, 27)',
        line=dict(color='rgb(218, 165, 27)', width=1)
    )
))


fig.update_layout(
    title = "Biggest Flops",
    autosize=False,
    width=1300,
    height=500,
    yaxis=dict(
        # `title.font` is the current spelling of the deprecated `titlefont`
        title=dict(font=dict(size=30)),
    )
)
# Stack the two traces within each title's bar
fig.update_layout(barmode='stack')
# Treat titles as categories and order them by their minimum value, descending
fig.update_yaxes(
    type="category",
    categoryorder="min descending")
# Let plotly widen the margin so long titles are not clipped
fig.update_yaxes(automargin=True)
fig.show()