In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Read the TMDB 5000 movies metadata CSV into a DataFrame and display it.
df = pd.read_csv(
    filepath_or_buffer="../input/tmdb-movie-metadata/tmdb_5000_movies.csv",
)
df

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Number of missing values in each column of the raw data.
df.isna().sum()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Concise overview of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Remove the 'id', 'overview' and 'homepage' columns from the frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError for columns that were already removed.
df = df.drop(columns=['id', 'overview', 'homepage'], errors='ignore')

In [None]:
# Column labels currently present in the frame.
df.columns

In [None]:
# Missing values per column after the column drop above.
df.isna().sum()

In [None]:
# Mark budgets of 0 as missing so they are skipped by median/mean and
# comparisons (presumably 0 means "not reported" in this dataset — confirm).
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
df['budget'] = df['budget'].replace(0, np.nan)

In [None]:
# Missing values per column, now including budgets that were recorded as 0.
df.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated(keep='first').sum()

In [None]:
# Visualize each variable: one histogram per column that df.hist can plot,
# drawn on a 15x15 inch figure.
df.hist(figsize=(15,15))

****

# Exploration with visuals

# Question 1: Budget vs Popularity

In [None]:
# Scatter plot of each movie's popularity against its budget.
x = df['budget']
y = df['popularity']

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_title("Average popularity by different budgets")
ax.set_xlabel('budget', fontsize=10)
ax.set_ylabel('popularity', fontsize=10)

In [None]:
# Split the movies into two groups around the median budget:
# strictly below the median -> low, at or above the median -> high.
# Rows whose budget is missing satisfy neither comparison and are left out.
m = df['budget'].median()
is_low_budget = df['budget'] < m
is_high_budget = df['budget'] >= m
low_budget = df[is_low_budget]
high_budget = df[is_high_budget]

In [None]:
# Average popularity within the low-budget and high-budget groups.
mean_popularity_of_low_budget, mean_popularity_of_high_budget = (
    group['popularity'].mean() for group in (low_budget, high_budget)
)

In [None]:
# Bar chart comparing the mean popularity of the two budget groups.
locations = [1, 2]
heights = [mean_popularity_of_low_budget, mean_popularity_of_high_budget]
labels = ['low', 'high']

fig, ax = plt.subplots()
ax.bar(locations, heights, tick_label=labels)
ax.set_title('Average popularity of different budget')
ax.set_xlabel('Budgets')
ax.set_ylabel('Average popularity')

**Conclusion: On average, high-budget movies are more than 50% more popular than low-budget movies.**

# Question 2: Duration of the movie vs Popularity

In [None]:
# Split the movies into three mutually exclusive runtime groups (minutes):
#   short  : runtime < 60
#   medium : 60 <= runtime <= 120
#   large  : runtime > 120
# The previous thresholds (100 / 200 / 200) did not match this description,
# overlapped each other, and made `medium` and `large` identical.
# Rows with a missing runtime match none of the conditions and are left out.
short = df.query('runtime < 60')
medium = df.query('runtime >= 60 and runtime <= 120')
large = df.query('runtime > 120')

In [None]:
# Average popularity within each runtime group.
mean_popularity_of_short, mean_popularity_of_medium, mean_popularity_of_large = (
    group['popularity'].mean() for group in (short, medium, large)
)

In [None]:
# Bar chart comparing the mean popularity of the three runtime groups.
locations = [1, 2, 3]
heights = [
    mean_popularity_of_short,
    mean_popularity_of_medium,
    mean_popularity_of_large,
]
labels = ['short', 'medium', 'large']

fig, ax = plt.subplots()
ax.bar(locations, heights, tick_label=labels)
ax.set_title('Average popularity by duration')
ax.set_xlabel('Runtime')
ax.set_ylabel('Average popularity')

In [None]:
# Scatter plot of each movie's popularity against its runtime.
x = df['runtime']
y = df['popularity']

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_title('Average popularity by duration')
ax.set_xlabel('runtime', fontsize=10)
ax.set_ylabel('popularity', fontsize=10)

**Conclusion: It can be deduced that movies with a runtime under 200 minutes tend to be more popular.**

# Question 3: Profit vs Popularity

In [None]:
# Split the movies into two groups around the median popularity:
# strictly below the median -> lower, at or above the median -> higher.
# (The higher group previously used `<` as well, which made both groups
# the same set of rows.)
# Note: query() returns new frames, so columns added to df after this
# point do not appear in lower_popularity / higher_popularity.
m_popularity = df['popularity'].median()
lower_popularity = df.query('popularity < @m_popularity')
higher_popularity = df.query('popularity >= @m_popularity')

In [None]:
# Add a profit column: revenue minus budget, computed row by row.
# Rows with a missing budget get a missing profit.
df['profit'] = df['revenue'].sub(df['budget'])

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Average profit within each popularity group.
# The group frames were created before df gained its 'profit' column, so
# indexing them with ['profit'] raises KeyError. Look the profit up in df
# by each group's row labels instead; mean() skips missing profits.
mean_profit_of_low_popularity = df.loc[lower_popularity.index, 'profit'].mean()
mean_profit_of_high_popularity = df.loc[higher_popularity.index, 'profit'].mean()

In [None]:
# Bar chart comparing the mean profit of the two popularity groups.
locations = [1, 2]
# Use the computed means themselves; the previous version passed the
# variable names as string literals, so the bars did not show the values.
heights = [mean_profit_of_low_popularity, mean_profit_of_high_popularity]
labels = ['low', 'high']
plt.bar(locations, heights, tick_label=labels)
plt.title("Average profit by popularity")
plt.xlabel('Popularity')
plt.ylabel('Average profit')

**Conclusion: Higher popularity is associated with higher average profit.**

# Question 4: What are the features associated with the top 10 revenue movies?


In [None]:
# Take the ten movies with the largest revenue and draw one histogram per
# plottable column for that subset.
top10_revenue = df.nlargest(n=10, columns='revenue', keep='first')
top10_revenue.hist(figsize=(15, 15))