# TMDb Movie Data

Exploratory data analysis of the TMDb movies dataset using data wrangling, visualization, transformation, encoding, scaling, feature engineering, and a word cloud.

## Import Libraries

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
import scipy.stats as stats

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
from wordcloud import WordCloud

## Load Dataset

In [None]:
# Read the movies CSV into a DataFrame and preview the first rows.
csv_path = '../input/tmdb-movies-dataset/tmdb_movies_data.csv'
movieSet = pd.read_csv(csv_path)
movieSet.head()

## Statistical Summary

##### numerical

In [None]:
# With no arguments, describe() summarises the numeric columns.
movieSet.describe()

##### categorical

In [None]:
# Summarise the object-dtype (string) columns. The builtin `object` is used
# because the `np.object` alias has been removed from NumPy (1.24+) and now
# raises AttributeError.
movieSet.describe(include=object)

## Data Wrangling

##### check null values

In [None]:
# Print each column's dtype and non-null count to spot missing values.
movieSet.info()

##### check and drop duplicate rows

In [None]:
# Number of rows that are exact duplicates of an earlier row.
movieSet.duplicated().sum()

In [None]:
# Remove duplicated rows in place, keeping the first occurrence of each.
movieSet.drop_duplicates(keep='first', inplace=True)

##### convert to standard pandas datetime format

In [None]:
# Parse the release_date column into pandas datetime values.
# NOTE(review): if the CSV stores two-digit years, older films may be parsed
# into the 2000s — verify against the release_year column.
movieSet['release_date'] = pd.to_datetime(movieSet['release_date'])

In [None]:
# Preview the rows after the release_date conversion.
movieSet.head()

##### dropping unnecessary columns

In [None]:
# Columns treated as unnecessary for this analysis; removed in place.
unused_columns = [
    'imdb_id',
    'homepage',
    'tagline',
    'overview',
    'budget_adj',
    'revenue_adj',
]
movieSet.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the rows after dropping the unused columns.
movieSet.head()

In [None]:
# Count of missing values in each column.
movieSet.isnull().sum()

In [None]:
# Shape of the subset of rows whose budget is recorded as 0.
movieSet.loc[movieSet['budget'].eq(0)].shape

In [None]:
# Shape of the subset of rows whose revenue is recorded as 0.
movieSet.loc[movieSet['revenue'].eq(0)].shape

In [None]:
# Treat a budget of 0 as missing. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on the selected Series: that
# chained in-place form triggers a warning in pandas 2.x and does not modify
# the DataFrame at all once Copy-on-Write is enabled.
movieSet['budget'] = movieSet['budget'].replace(0, np.nan)

In [None]:
# Treat a revenue of 0 as missing. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on the selected Series: that
# chained in-place form triggers a warning in pandas 2.x and does not modify
# the DataFrame at all once Copy-on-Write is enabled.
movieSet['revenue'] = movieSet['revenue'].replace(0, np.nan)

In [None]:
# Treat a runtime of 0 as missing. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on the selected Series: that
# chained in-place form triggers a warning in pandas 2.x and does not modify
# the DataFrame at all once Copy-on-Write is enabled.
movieSet['runtime'] = movieSet['runtime'].replace(0, np.nan)

##### 'id' is a unique column so set it as index of dataframe.

In [None]:
# Number of distinct ids; compare with the row count to confirm uniqueness.
movieSet['id'].nunique()

In [None]:
# Promote the 'id' column to the DataFrame index (in place).
movieSet.set_index(keys='id', inplace=True)

In [None]:
# Shape of the column index, i.e. how many columns remain.
movieSet.columns.shape

## Most Generated Keywords

In [None]:
# Concatenate every movie's '|'-separated keywords into one string, then
# turn the '|' separators into commas for the word cloud.
all_keywords = movieSet['keywords'].str.cat(sep='|')
text = all_keywords.replace('|', ',')

In [None]:
# Build a word cloud from `text` (at most 50 words, white background)
# and display it as an image.
cloud_options = {'max_words': 50, 'background_color': 'white'}
wc = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(15, 10))
plt.imshow(wc)
plt.show()

### Q1 : Which year has the highest release of movies?

In [None]:
# Q1: number of movies released per year.
# The style is set first: seaborn styles only apply to axes created after the
# call, so setting it after countplot() left this chart unstyled.
sns.set_style("darkgrid")
plt.figure(figsize=(18, 10))
# The data vector is passed as x=...; recent seaborn releases no longer accept
# it as a positional x argument.
sns.countplot(x=movieSet['release_year'])
plt.xticks(rotation=90)
plt.show()

### Q2: Which Movie Has The Highest Or Lowest Profit? Top 10 movies which earn highest profit?

##### Feature engineering (create new column from existing ones)

In [None]:
# New feature: profit is revenue minus budget (NaN where either is missing).
movieSet['profit'] = movieSet['revenue'].sub(movieSet['budget'])

In [None]:
# Preview the rows with the new 'profit' column.
movieSet.head()

In [None]:
# Q2: movies with the overall highest and lowest profit.
profit_col = movieSet['profit']
profit_extremes = (profit_col == profit_col.max()) | (profit_col == profit_col.min())
dd = movieSet.loc[profit_extremes, ['original_title', 'profit']]

# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.barplot(x='original_title', y='profit', data=dd)
plt.show()

### Q3: Movie with Highest And Lowest Budget?

##### min-max scaler to scale the values into range (0,1)

In [None]:
# Q3: movies with the overall highest and lowest budget.
budget_col = movieSet['budget']
budget_extremes = (budget_col == budget_col.max()) | (budget_col == budget_col.min())
# .copy() makes db an independent frame, so adding a column below does not
# trigger SettingWithCopyWarning.
db = movieSet.loc[budget_extremes, ['original_title', 'budget']].copy()

# Min-max scale the budgets into [0, 1]; ravel() flattens the (n, 1) result
# into a 1-D column.
db['scaled_budget'] = MinMaxScaler().fit_transform(db['budget'].values.reshape(-1, 1)).ravel()

# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.barplot(x='original_title', y='scaled_budget', data=db)
plt.xticks(rotation=90)
plt.show()

### Q4: Which movie made the highest revenue and lowest as well?

In [None]:
# Q4: movies with the overall highest and lowest revenue.
revenue_col = movieSet['revenue']
revenue_extremes = (revenue_col == revenue_col.max()) | (revenue_col == revenue_col.min())
# .copy() makes dr an independent frame, so adding a column below does not
# trigger SettingWithCopyWarning.
dr = movieSet.loc[revenue_extremes, ['original_title', 'revenue']].copy()

# Min-max scale the revenues into [0, 1]; ravel() flattens the (n, 1) result
# into a 1-D column.
dr['scaled_revenue'] = MinMaxScaler().fit_transform(dr['revenue'].values.reshape(-1, 1)).ravel()

# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.barplot(x='original_title', y='scaled_revenue', data=dr)
plt.xticks(rotation=90)
plt.show()

### Q5. Movie with shortest and longest runtime?

In [None]:
# Q5: movies with the overall longest and shortest runtime.
runtime_col = movieSet['runtime']
runtime_extremes = (runtime_col == runtime_col.max()) | (runtime_col == runtime_col.min())
dd = movieSet.loc[runtime_extremes, ['original_title', 'runtime']]

# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.barplot(x='original_title', y='runtime', data=dd)
plt.xticks(rotation=90)
plt.show()

### Q6: Which movie gets the highest or lowest votes (ratings)?

In [None]:
# Mean profit per release year, drawn as a bar chart.
# NOTE(review): the heading above asks about votes, but this cell plots
# profit by year — confirm whether a cell/heading went missing.
mean_profit_by_year = movieSet.groupby('release_year')['profit'].mean()

plt.figure(figsize=(20, 10))
mean_profit_by_year.plot(kind='bar', color='pink')
plt.show()

### Q8: Which movie lengths are most liked by audiences, judging by popularity?

In [None]:
# Q8: runtime of the five most popular movies (head() keeps the top 5 rows).
dd = movieSet.sort_values('popularity', ascending=False)[['popularity', 'runtime']].head()

# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.lineplot(x='runtime', y='popularity', data=dd)
plt.show()

### Q9. Average Runtime Of Movies From Year To Year?

##### seasonality

In [None]:
# Q9: average runtime per release year, drawn as a line.
mean_runtime_by_year = movieSet.groupby('release_year')['runtime'].mean()

plt.figure(figsize=(15, 6))
mean_runtime_by_year.plot(color='grey')
plt.show()

### Q10. How Do Revenue And Popularity Vary With Budget And Runtime? And How Does Popularity Depend On Profit?

In [None]:
# Budget against revenue.
# The style is set before plotting: seaborn styles only apply to axes created
# after the call.
sns.set_style("whitegrid")
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.scatterplot(x='budget', y='revenue', data=movieSet)
plt.show()

In [None]:
# Runtime against revenue with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.regplot(x='runtime', y='revenue', data=movieSet, color='violet')
plt.show()

In [None]:
# Budget against popularity with a fitted regression line.
# The style is set before plotting: lmplot creates its own figure, and seaborn
# styles only apply to axes created after the call.
sns.set_style("darkgrid")
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.lmplot(x='budget', y='popularity', data=movieSet)
plt.show()

In [None]:
# Popularity against runtime.
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.scatterplot(x='popularity', y='runtime', data=movieSet, color='c')
plt.show()

In [None]:
# Popularity against profit with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError when they
# are given positionally.
sns.regplot(x='popularity', y='profit', data=movieSet, color='g')
plt.show()

### Q11: Which Month Released Highest Number Of Movies In All Of The Years? And Which Month Made The Highest Average Revenue?

In [None]:
# Q11a: number of releases per calendar month (bars ordered by count).
release_month = movieSet['release_date'].dt.month
release_month.value_counts().plot(kind='bar', color='magenta')
plt.show()

In [None]:
# Q11b: average revenue of the movies released in each calendar month.
release_month = movieSet['release_date'].dt.month
mean_revenue_by_month = movieSet['revenue'].groupby(release_month).mean()
mean_revenue_by_month.plot(kind='bar', color='lime')
plt.show()

### Q12: Which Genre Has The Highest Release Of Movies?

##### one hot encoding to split the merged genres into separate columns

In [None]:
# One indicator column per genre, split from the '|'-separated genres string.
genre_strings = movieSet['genres']
dg = genre_strings.str.get_dummies(sep='|')
dg.head()

In [None]:
# Q12: share of movies per genre.
# DataFrame.sum() totals each indicator column in vectorised code; it replaces
# the Python-level sum() applied to every column via a lambda.
genre_counts = dg.sum()
genre_counts.plot.pie(
    figsize=(20, 10),
    autopct='%1.1f%%',
    explode=[0.12] * len(genre_counts),  # pull every wedge out by the same offset
)
plt.show()

### Q13: Which genres are most popular from year to year?

##### 3-axis bar graph

In [None]:
# For each release year, join all genre strings with '|' and split the result
# into one flat list of genre names.
joined_genres = movieSet.groupby('release_year')['genres'].apply(
    lambda yearly: yearly.str.cat(sep='|')
)
dy = joined_genres.map(lambda joined: joined.split('|'))
dy.head()

In [None]:
def _mode_with_count(values):
    """Return the most frequent entry of *values* and its count.

    Replaces scipy.stats.mode, which rejects non-numeric (string) input in
    recent SciPy releases. The result keeps the same layout as the old mode
    result, a pair of length-1 arrays ``(value, count)``, so code that unpacks
    it with ``x[0]`` / ``x[1]`` keeps working. Ties resolve to the smallest
    value because np.unique returns sorted values and argmax picks the first
    maximum.
    """
    names, counts = np.unique(values, return_counts=True)
    top = counts.argmax()
    return names[top:top + 1], counts[top:top + 1]


# Most frequent genre (and its count) for every release year.
dy2 = dy.apply(_mode_with_count).reset_index()
dy2.head()

In [None]:
# Unpack each (value, count) pair stored in 'genres' into two plain columns:
# the genre name as a string and its frequency as an integer.
mode_results = dy2['genres']
dy2['genre_name'] = mode_results.map(lambda result: ''.join(result[0]))
freq_as_text = mode_results.map(lambda result: ''.join(result[1].astype(str)))
dy2['freq'] = freq_as_text.astype(int)
dy2.head()

In [None]:
# Take the 10 rows with the smallest 'freq' (ascending sort, then head).
dy3 = dy2.sort_values('freq').head(10)
fig,ax = plt.subplots(figsize=(15,10))
# Left axis: one horizontal bar per year, with ticks labelled by genre name.
ax.barh(dy3.release_year,dy3.freq,color='c')
ax.set_yticks(dy3.release_year)
ax.set_yticklabels(dy3.genre_name)

# Right axis (twin sharing the x axis): the same bars are drawn again so the
# tick positions line up, but the ticks are labelled with the release year.
ax2= ax.twinx()
ax2.barh(dy3.release_year,dy3.freq,color='c')
ax2.set_yticks(dy3.release_year)
ax2.set_yticklabels(dy3.release_year)
plt.show()

### Q14: Top 20 Production Companies By Number Of Releases?

In [None]:
# One indicator column per production company, split from the '|'-separated
# production_companies string.
company_strings = movieSet['production_companies']
dc = company_strings.str.get_dummies(sep='|')
dc.head()

In [None]:
# Number of movies per production company.
# DataFrame.sum() totals each indicator column in vectorised code; it replaces
# the Python-level sum() applied to every column via a lambda.
dpc = dc.sum()
dpc.head()

In [None]:
# Q14: the 20 production companies with the most releases, as a pie chart.
# The positional `0` (axis) argument is dropped: Series.sort_values accepts
# keyword arguments only in pandas >= 2.0, and axis=0 is the default anyway.
top_companies = dpc.sort_values(ascending=False).head(20)
top_companies.plot.pie(autopct='%1.1f%%', frame=True, shadow=True)
plt.show()

### Q15: Top 20 Directors By Number Of Movies Directed?

In [None]:
# Q15: the 20 most frequent values of the 'director' column, as horizontal bars.
director_counts = movieSet['director'].value_counts()
director_counts.head(20).plot(kind='barh', figsize=(18, 10), color='brown')
plt.show()

### Q16: What Kind Of Properties Are Associated With Movies With High Revenue?

In [None]:
# Q16: summary statistics of the three highest-revenue movies.
top_revenue = movieSet.sort_values(by='revenue', ascending=False).head(3)
top_revenue.describe()