# Movies Gross Correlation

### This project investigates which fields are most strongly correlated with a film's gross revenue.

## Import libraries

In [None]:
from matplotlib.pyplot import figure
import pandas as pd
import seaborn as sns  # data visualization library based on matplotlib
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to every matplotlib figure in the notebook.
plt.style.use('ggplot')

# IPython magic: render figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) used by all subsequent plots.
matplotlib.rcParams['figure.figsize'] = (12, 8)  # config of plots


## Read the data

In [None]:
# Load the raw movies dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='movies.csv')


## Go through the data

In [None]:
# Show the dataframe dimensions as a (rows, columns) tuple.
print((len(df.index), len(df.columns)))


There are 7668 rows and 15 columns in the dataframe.

Now, read the first five rows of the dataframe.

In [None]:
# Preview the first five rows; pass a larger n (e.g. 60) to inspect more.
df.head(n=5)


## Cleaning the data

#### See if there is any missing data

In [None]:
# Report, for each column, the percentage of rows whose value is missing.
for col in df.columns:
    # The mean of a boolean mask is the fraction of True (missing) entries.
    missing_percent = df[col].isna().mean() * 100
    print(f'{col} - {round(missing_percent, 2)}%')


#### Check the individual sum of all null values in the columns.

In [None]:
# Count the missing values in each column.
df.isna().sum()


#### Drop the rows with null values and assign the result to a new dataframe

In [None]:
# Keep only the rows with no missing value in any column. The explicit copy
# makes df2 an independent frame, so later cells can modify it without
# touching df.
df2 = df.dropna(axis=0, how='any').copy()


#### Check the data types for our columns

In [None]:
# Display the dtype of every column in the cleaned dataframe.
df2.dtypes


We don't need the fractional values. So, change the datatype of the budget and gross columns to int.

In [None]:
# Store the two monetary columns as 64-bit integers (no fractional part needed).
df2 = df2.astype({'budget': 'int64', 'gross': 'int64'})


The `year` column does not always match the year in the release date.

So, take the year from the release date and store it in a new column.

In [None]:
# Work on a copy so df2 stays untouched.
df3 = df2.copy()

# Pull the first four-digit run out of the 'released' text and store it as a
# number. expand=False makes str.extract return a Series rather than a
# one-column DataFrame, and pd.to_numeric turns the matched text into numeric
# values so 'yearcorrect' can be compared with 'year' and takes part in
# numeric correlations. Rows without a four-digit match become NaN instead of
# raising.
df3['yearcorrect'] = pd.to_numeric(
    df3['released'].str.extract(r'(\d{4})', expand=False),
    errors='coerce',
)

# Print both columns for comparison.
print(df3['yearcorrect'], df3['year'])


Display the data sorted on gross value

In [None]:
# Display the films ordered from highest to lowest gross; df3 itself is not
# reordered because the sorted frame is only shown, not assigned.
df3.sort_values('gross', ascending=False)


Set option to display all the rows

In [None]:
# pd.set_option('display.max_rows', None)


#### Drop any duplicates

In [None]:
# drop_duplicates() returns a new frame, so the result has to be assigned
# back; otherwise the duplicate rows stay in df3.
df3 = df3.drop_duplicates()

# Show the de-duplicated frame as the cell output.
df3


Budget may be highly correlated with gross.

The production company may also be highly correlated with gross.

## Scatter plot

### Plot for budget vs gross

In [None]:
# Scatter plot of budget (x axis) against gross earnings (y axis) from df3.
fig, ax = plt.subplots()
ax.scatter(df3['budget'], df3['gross'])
ax.set_title('Budget vs Gross')
ax.set_xlabel('Budget')
ax.set_ylabel('Gross Earnings')

plt.show()


### Plot budget vs gross using seaborn

In [None]:
# Plot the regression line
sns.regplot(data=df3, y='gross', x='budget', scatter_kws={
            "color": "blue"}, line_kws={"color": "yellow"})


## Looking at correlation

The default correlation method is Pearson.

Kendall and Spearman are the available alternatives.

In [None]:
# Pairwise correlation of the numeric columns. 'pearson' is the default
# method and is spelled out here; pass method='spearman' or method='kendall'
# to use one of the rank-based alternatives instead.
df3.corr(method='pearson', numeric_only=True)


There is a high correlation between budget and gross.

In [None]:
# Annotated heatmap of the correlations between the numeric columns of df3.
correlation_matrix = df3.corr(numeric_only=True)
heatmap_ax = sns.heatmap(correlation_matrix, annot=True)
heatmap_ax.set_title('Correlation matrix')
heatmap_ax.set_xlabel('Movies features')
heatmap_ax.set_ylabel('Movies features')
plt.show()


Now look at company

In [None]:
# Preview the cleaned frame (df3) that the rest of the analysis works on,
# rather than the raw df, so the 'company' values shown are the ones that get
# encoded in the next step.
df3.head()


### Numerize the dataframe

In [None]:
# NOTE: plain assignment does not copy, so df_numerized and df3 are two names
# for the same DataFrame; the encoding below is visible through both.
df_numerized = df3

# Replace every object-typed (text) column with integer category codes so it
# can take part in the correlation matrix.
for col in df_numerized.columns:
    if df_numerized[col].dtype != 'object':
        continue
    df_numerized[col] = df_numerized[col].astype('category').cat.codes

print(df_numerized)


Now that all the columns are numerized, check the correlation matrix again; it now contains all the columns.

In [None]:
# Correlate the numerized frame explicitly. Reading it through df3 only works
# while df3 and df_numerized happen to be the same object, and the cells that
# follow already use df_numerized.corr().
correlation_matrix = df_numerized.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation matrix')
plt.xlabel('Movies features')
plt.ylabel('Movies features')
plt.show()


In [None]:
# Full pairwise correlation table of the numerized frame (Pearson, the default).
df_numerized.corr(method='pearson')


#### See the highest correlation quickly

In [None]:
# Lift the row limit so the whole list of pairs is printed.
pd.options.display.max_rows = None

# Flatten the square correlation matrix into a Series indexed by
# (column, column) pairs.
correlation_matrix = df_numerized.corr(method='pearson')
corr_pairs = correlation_matrix.unstack()
print(corr_pairs)


In [None]:
# Order the pairs from the lowest to the highest correlation value.
sorted_pairs = corr_pairs.sort_values(ascending=True)
print(sorted_pairs)


In [None]:
# Filter the Series to include only values within the desired range
high_correlation = sorted_pairs[np.logical_and(
    sorted_pairs > 0.5, sorted_pairs < 1)]

# Exclude specific columns like 'year' and 'yearcorrect' from the filtered Series
exclude_columns = ['year', 'yearcorrect']
high_correlation = high_correlation.drop(exclude_columns)

# Print the filtered Series
print(high_correlation.sort_values(ascending=False))


## Conclusion:
### Votes and budget have the highest correlation with gross earnings