In [None]:
# Import libraries

import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8) # Adjusts the configuration of the plots I will create

# Read in the data.
# The CSV location can be overridden with the MOVIES_CSV environment variable so the
# notebook is not tied to one machine; when the variable is not set, the original
# local path is used unchanged.

import os

movies_csv_path = os.environ.get(
    'MOVIES_CSV',
    r'C:\Users\saida\OneDrive\Рабочий стол\Portfolio\Movies\movies.csv',
)
df = pd.read_csv(movies_csv_path)

In [None]:
# Preview the first five rows to get a feel for the columns and their values

df.head(n = 5)

In [None]:
# Count the missing (NaN) entries in every column

df.isna().sum()

In [None]:
# Let's see if there is any missing data, as a percentage per column.
# df[col].isnull() is a boolean Series, so its mean is the fraction of missing rows
# (0-1); it is multiplied by 100 so the number agrees with the printed '%' sign.

for col in df.columns:
    pct_missing = df[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, pct_missing))

# Only NAME, GENRE, YEAR, DIRECTOR columns do NOT have missing values

In [None]:
# Show the dtype pandas assigned to each column
df.dtypes
# Year, score, votes, budget, gross, runtime have numerical values, other columns are categorical.

In [None]:
# Impute the numerical columns: every null is replaced by that column's own median

numeric_cols = ['score', 'votes', 'budget', 'gross', 'runtime']

for num_col in numeric_cols:
    col_median = df[num_col].median()
    df[num_col] = df[num_col].fillna(col_median)

In [None]:
# Impute the categorical columns: every null is replaced by that column's mode.
# mode() returns a Series of the most frequent value(s); [0] picks its first entry.
categorical_cols = ['rating', 'released', 'writer', 'star', 'country', 'company']

for cat_col in categorical_cols:
    most_frequent = df[cat_col].mode()[0]
    df[cat_col] = df[cat_col].fillna(most_frequent)

In [None]:
# Count the nulls per column again to check the result of the imputation
df.isna().sum()

In [None]:
# There are other ways to work with null values (examples only, not applied here):
# df = df.dropna() - drops all rows that have missing data
# df['rating'].fillna("No rating", inplace=True) - changes null values in the RATING column to "No rating"
# df.fillna(0) - fills missing values with 0

In [None]:
# Cast the BUDGET, GROSS and VOTES columns to 64-bit integers

for int_col in ('budget', 'gross', 'votes'):
    df[int_col] = df[int_col].astype('int64')

In [None]:
# Build a corrected release-year column from the RELEASED text

# df['year_correct'] = df['released'].astype('str').str[:4] It will take the first 4 values of an object

# Capture the first run of four digits in each RELEASED value, then store it as an integer.
# expand = False makes extract() return a Series for the single capture group.
release_year = df['released'].str.extract('([0-9]{4})', expand = False)
df['year_correct'] = release_year.astype(int)

In [None]:
# Preview the first five rows, now including the year_correct column
df.head(n = 5)

In [None]:
# Show the movies ordered from highest to lowest gross.
# The sorted frame is only displayed; df itself keeps its current row order.
df.sort_values('gross', ascending = False)

In [None]:
# Lift the display row limit so DataFrames are shown with all of their rows

pd.options.display.max_rows = None

In [None]:
# Drop any duplicates.
# drop_duplicates() returns a new DataFrame and leaves the original untouched, so the
# result must be assigned back to df for the duplicate rows to actually be removed.

df = df.drop_duplicates()

# (rows, columns) remaining after de-duplication
df.shape

In [None]:
# Prediction: budget will have a high correlation with gross earnings
# Prediction: company will have a high correlation with gross earnings

In [None]:
# Let's build a scatterplot of Budget vs Gross.
# Budget is passed as x and gross as y, so the axis labels are set to match.

plt.scatter(x = df['budget'], y = df['gross'])
plt.title('Budget vs Gross Earnings')
plt.xlabel('Budget for Film')
plt.ylabel('Gross Earnings')
plt.show()

In [None]:
# Budget vs Gross again, this time with seaborn's regplot (red points, blue regression line)

sns.regplot(
    data = df,
    x = 'budget',
    y = 'gross',
    scatter_kws = dict(color = 'red'),
    line_kws = dict(color = 'blue'),
)



In [None]:
# Let's start looking at the correlation. Correlation is used only for numerical features,
# so the numeric columns are selected explicitly: df still contains text columns here, and
# from pandas 2.0 on DataFrame.corr() raises on them instead of silently skipping them.

df.select_dtypes(include = 'number').corr(method = 'pearson') # Pearson correlation - default, other options are Kendall, Spearman

In [None]:
# Let's visualize the correlation matrix of the numeric features as a heatmap.
# Only numeric columns are passed to corr(): df still contains text columns here, and
# from pandas 2.0 on DataFrame.corr() raises on them instead of silently skipping them.

correlation_matrix = df.select_dtypes(include = 'number').corr(method = 'pearson')
sns.heatmap(correlation_matrix, annot = True)  # annot = True writes each coefficient in its cell
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [None]:
# Preview the first five rows again, this time to look at the company column

df.head(n = 5)

In [None]:
# Let's convert string valued features into numerical ones.
# Work on an independent copy: a plain `df_numerized = df` only binds a second name to the
# same DataFrame, so the encoding below would also overwrite the text columns of df.

df_numerized = df.copy()

for col_name in df_numerized.columns:
    if df_numerized[col_name].dtype == 'object':
        # Replace each distinct value by the integer code of its category
        df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

df_numerized.head()

In [None]:
# Heatmap of the Pearson correlations between the columns of df_numerized

correlation_matrix = df_numerized.corr(method = 'pearson')
heatmap_ax = sns.heatmap(data = correlation_matrix, annot = True)
heatmap_ax.set_title('Correlation Matrix for Numeric Features')
heatmap_ax.set_xlabel('Movie Features')
heatmap_ax.set_ylabel('Movie Features')
plt.show()

In [None]:
# Flatten the correlation matrix into a Series with one entry per (column, column) pair
correlation_mat = df_numerized.corr(method = 'pearson')
corr_pairs = correlation_mat.unstack()
corr_pairs

In [None]:
# Order the column pairs from the highest correlation coefficient to the lowest
sorted_pairs = corr_pairs.sort_values(ascending = False)
sorted_pairs

In [None]:
# Keep only the pairs whose correlation is above 0.5, listed from lowest to highest.
# Negative correlations are never selected, because the raw value (not its absolute value) is compared.
strong_mask = sorted_pairs > 0.5
high_corr = sorted_pairs[strong_mask].sort_values(ascending = True)
high_corr

In [None]:
# Conclusion
# Votes and budget have the highest correlation with gross earnings.
# My prediction that the Company feature would be highly correlated with gross earnings was WRONG:
# the Company feature has a low correlation with gross earnings.