In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# pandas / seaborn / matplotlib; consider narrowing the filter while developing.

import warnings

warnings.filterwarnings('ignore')

In [None]:
# Importing the required libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

##  Task 1: Reading the data

- ### Subtask 1.1: Read the Movies Data.

Read the movies data file provided and store it in a dataframe `movies`.

In [None]:
# Load the movies CSV into the `movies` dataframe.
# The path points at a Kaggle-style input directory; change it to your dataset location if needed.
movies = pd.read_csv('../input/imdb-movie-database-top-100-movies/MovieAssignmentData.csv')
movies.head()

- ###  Subtask 1.2: Inspect the Dataframe

Inspect the dataframe for dimensions, null-values, and summary of different numeric columns.

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple

movies.shape

In [None]:
# Column-wise overview: dtype and non-null count per column (reveals missing values)

movies.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

movies.describe()

## Task 2: Data Analysis

-  ###  Subtask 2.1: Reduce those Digits!

The numbers in the `budget` and `Gross` columns are too large to read comfortably. Let's convert the unit of both columns from `$` to `million $` first.

In [None]:
# Express 'budget' and 'Gross' in million $ (rounded to two decimals) instead of $.
# NOTE: this overwrites the columns in place, so running the cell twice divides twice.

for money_col in ('budget', 'Gross'):
    movies[money_col] = movies[money_col].astype(float).div(1000000).round(2)

movies.head()

-  ###  Subtask 2.2: Let's Talk Profit!

In [None]:
# profit = Gross - budget (both already expressed in million $)

movies['profit'] = movies['Gross'].sub(movies['budget'])

movies.head()

In [None]:
# Display the movies ordered from highest to lowest profit.
# The sorted frame is only shown, not assigned, so `movies` itself keeps its original order.
movies.sort_values(by ='profit', ascending=False)



In [None]:
# Ten most profitable movies, re-indexed from 0
top10 = (
    movies
    .sort_values(by='profit', ascending=False)
    .head(10)
    .reset_index(drop=True)
)
top10

In [None]:
# Joint plot of profit (x) against budget (y) with a fitted regression line

sns.set_style('darkgrid')
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the former positional call raises a TypeError there.
sns.jointplot(
    x=movies['profit'],
    y=movies['budget'],
    kind='reg',
    joint_kws={'scatter_kws': dict(alpha=0.7)},
    height=10,
)

# The trailing newlines in the title push it up, away from the plot area.
plt.title("Profit vs Budget (units - Million $)\n\n\n", fontdict={'fontsize': 35, 'fontweight' : 5, 'color' : 'Green'})
plt.xlabel("Profit", fontdict={'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Budget", fontdict={'fontsize': 20, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

## Observation 
##### 1- A few movies have negative profit despite being among the top 100 rated movies.
##### 2- Low-budget movies outnumber high-budget movies among the top-rated movies.
##### 3- High-budget movies don't always make a high profit.


The dataset contains the 100 best-performing movies from the years 2010 to 2016. However, the scatter plot tells a different story: some movies have negative profit. Good movies do sometimes incur losses, but there appear to be quite a few of them here. What could be the reason behind this? Let's take a closer look by finding the movies with negative profit.

In [None]:
# Movies that lost money (profit below zero), re-indexed from 0
made_a_loss = movies['profit'] < 0
neg_profit = movies[made_a_loss].reset_index(drop=True)
neg_profit

- ### Subtask 2.3: The General Audience and the Critics    

In [None]:
# Divide MetaCritic by 10 so it can be averaged with IMDb_rating in the next step.
# NOTE: the column is overwritten in place, so running the cell twice divides twice.
movies['MetaCritic'] = movies['MetaCritic'].div(10)
movies['MetaCritic'].head()

In [None]:
# Average of the rescaled MetaCritic score and the IMDb rating
movies['Avg_rating'] = movies['MetaCritic'].add(movies['IMDb_rating']).div(2)
movies.head()

In [None]:
# Display the movies from highest to lowest average rating.
# The sorted frame is only shown, not assigned, so `movies` itself keeps its original order.
movies.sort_values(by ='Avg_rating',ascending=False)


In [None]:
# Movies where critics and audience agree (|MetaCritic - IMDb_rating| < 0.5)
# and whose average rating is at least 8, listed from highest average rating down.

ratings_agree = (movies['MetaCritic'] - movies['IMDb_rating']).abs() < 0.5
highly_rated = movies['Avg_rating'] >= 8

UniversalAcclaim = (
    movies[ratings_agree & highly_rated]
    .sort_values(by='Avg_rating', ascending=False)
    .reset_index(drop=True)
)

UniversalAcclaim

- ### Subtask 2.4: Find the Most Popular Trios - I

In [None]:
# Combined Facebook likes of the three lead actors of each movie

movies['popular_trios'] = (
    movies['actor_1_facebook_likes']
    .add(movies['actor_2_facebook_likes'])
    .add(movies['actor_3_facebook_likes'])
)

# Names of the five trios with the most combined likes, as a list of [actor_1, actor_2, actor_3]
trio_names = ['actor_1_name', 'actor_2_name', 'actor_3_name']
movies.sort_values(by='popular_trios', ascending=False, ignore_index=True)[trio_names].head(5).values.tolist()

- ### Subtask 2.6: Runtime Analysis

In [None]:
# Distribution of movie runtimes (histogram with a density curve)
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; consider
# `sns.histplot(..., kde=True)` once the seaborn version is confirmed.

plt.figure(figsize=[15,8])
runtime_ax = sns.distplot(movies['Runtime'], color='g')
runtime_ax.set_title("Movie RunTime Range", fontsize=40, color='Green')
runtime_ax.set_xlabel("RunTime", fontdict={'fontsize': 25, 'fontweight' : 5, 'color' : 'Brown'})

plt.show()

**`Checkpoint 3:`** Most of the movies appear to be almost exactly 2 hours long.

- ### Subtask 2.7: R-Rated Movies

In [None]:
# Ten R-rated movies with the highest CVotesU18 values
r_rated = movies[movies['content_rating'] == 'R']
PopularR = r_rated.sort_values(by='CVotesU18', ascending=False, ignore_index=True).head(10)
PopularR

## Task 3 : Demographic analysis

-  ###  Subtask 3.1 Combine the Dataframe by Genres

In [None]:
# Keep only the genre columns and the vote columns (names starting with 'CVotes' or 'Votes')
wanted_prefixes = ('CVotes', 'Votes', 'genre')
genre_vote_columns = [col for col in movies.columns if col.startswith(wanted_prefixes)]
df_by_genre = movies.loc[:, genre_vote_columns]
df_by_genre

In [None]:
# Add a constant column 'cnt' = 1; summing it per genre later yields the number of movies
df_by_genre = df_by_genre.assign(cnt=1)
df_by_genre

In [None]:
# Sum the vote columns and 'cnt' per genre, once for each of the three genre slots.
# numeric_only=True restricts the sums to numeric columns: without it, pandas 2.x
# also tries to add up the remaining text genre columns, which breaks the
# element-wise add/divide steps that follow.
df_by_g1 = df_by_genre.groupby(['genre_1']).sum(numeric_only=True)
df_by_g2 = df_by_genre.groupby(['genre_2']).sum(numeric_only=True)
df_by_g3 = df_by_genre.groupby(['genre_3']).sum(numeric_only=True)

In [None]:
# Combine the three per-slot totals into one frame indexed by genre.
# fill_value=0 treats a genre missing from one of the frames as contributing zero.
df_add = (
    df_by_g1
    .add(df_by_g2, fill_value=0)
    .add(df_by_g3, fill_value=0)
)
df_add

In [None]:
# Keep the genres that occur in at least 10 movies
# (despite its name, `genre_top10` is not limited to ten genres).
has_enough_movies = df_add['cnt'] >= 10
genre_top10 = df_add[has_enough_movies]

# Keep the per-genre movie counts; they are needed again after the averaging step.
cnt = genre_top10['cnt']
cnt

In [None]:
# Turn the per-genre totals into per-movie means by dividing each row by its movie count.
# The 'cnt' column is divided too, so it becomes 1 for every genre.
movies_per_genre = genre_top10['cnt']
genre_top10 = genre_top10.div(movies_per_genre, axis=0)
genre_top10

In [None]:
# Round the 'Votes*' columns to two decimals

votes_columns = [col for col in genre_top10.columns if col.startswith('Votes')]
genre_top10[votes_columns] = genre_top10[votes_columns].round(2)

genre_top10

In [None]:
# Cast the 'CVotes*' columns to integers (the fractional part is dropped)

cvotes_columns = [col for col in genre_top10.columns if col.startswith('CVotes')]
genre_top10[cvotes_columns] = genre_top10[cvotes_columns].astype(int)

genre_top10

-  ###  Subtask 3.2: Genre Counts!

In [None]:
# Bar chart of the number of movies per genre

# Restore the real movie counts: the averaging step divided 'cnt' by itself.
genre_top10['cnt'] = cnt

fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=genre_top10.index, y=genre_top10['cnt'], ax=ax)
ax.set_title("Genres and its counts", fontsize=40, color='Green')

plt.xticks(rotation=45)
ax.set_xlabel('Genre', fontsize=20, color='Brown')
ax.set_ylabel('Count', fontsize=20, color='Brown')

plt.show()

-  ###  Subtask 3.3: Gender and Genre

In [None]:
# 1st set of heat maps for CVotes-related columns: male panel (left), female panel (right)

male_cols = ['CVotesU18M', 'CVotes1829M', 'CVotes3044M', 'CVotes45AM']
female_cols = ['CVotesU18F', 'CVotes1829F', 'CVotes3044F', 'CVotes45AF']

# genre_top10 already holds one row per genre, so the columns are selected directly.
# The former groupby(...)[tuple].mean() fails on pandas 2.x (tuple selection was removed)
# and returned floats, which the integer annotation format fmt='d' cannot render.
# sort_index() keeps the alphabetical genre order that groupby used to produce.
genre_votes = genre_top10.sort_index()

f, (ax1, ax2, axcb) = plt.subplots(1, 3, gridspec_kw={'width_ratios': [30, 30, 1.5]})
# Share the y axis between the two heat maps; replaces get_shared_y_axes().join(),
# which was removed in matplotlib 3.8.
ax2.sharey(ax1)
f.suptitle('Gender wise Age group Vs Genres', fontsize=35, color='Purple')

a = sns.heatmap(genre_votes[male_cols], cmap="PiYG", annot=True, cbar=False, fmt='d', ax=ax1, linewidths=0.5)
a.axes.set_title('Male age group(CVotes) vs Genres', fontsize=20, color='Navy')
a.set_ylabel('Genres', fontsize=20, color='Brown')
a.set_xlabel('Male Age Group', fontsize=20, color='Brown')

# The colour bar of the second heat map is drawn in the narrow third axes.
b = sns.heatmap(genre_votes[female_cols], cmap="PiYG", annot=True, fmt='d', linewidths=0.5, cbar_ax=axcb, ax=ax2)
b.axes.set_title('Female age group(CVotes) vs Genres', fontsize=20, color='NAvy')
b.set_xlabel('Female Age Group', fontsize=20, color='Brown')
# Hide the y ticks and labels on the right panel only; with a shared axis,
# set_yticks([]) would also clear the genre labels on the left panel.
ax2.tick_params(axis='y', left=False, labelleft=False)

f.set_figheight(10)
f.set_figwidth(15)

plt.show()

**`Inferences:`** A few inferences that can be drawn from the heatmaps above are that males have voted more than females, and that Sci-Fi appears to be the most popular genre among the 18-29 age group irrespective of gender. What more can you infer from the two heatmaps that you have plotted? Write your three inferences/observations below:
- Inference 1: **Voters under 18 and above 45, of both genders, have watched/voted less in all genres. Voters under 18 are especially few in number.**  

- Inference 2: **Both males and females between 18 and 44 have watched/voted in higher numbers. Specifically, people between 18 and 29 watched/voted the most.**

- Inference 3: **The Sci-Fi genre seems to have the most votes across all age groups. After it, genres like Action, Adventure and Thriller seem to have comparatively more votes than the rest.**  
- Inference 4: **Male voters outnumber female voters across all age groups and genres.**


In [None]:
# 2nd set of heat maps for Votes-related columns: male panel (left), female panel (right)

male_cols = ['VotesU18M', 'Votes1829M', 'Votes3044M', 'Votes45AM']
female_cols = ['VotesU18F', 'Votes1829F', 'Votes3044F', 'Votes45AF']

# genre_top10 already holds one row per genre, so the columns are selected directly.
# The former groupby(...)[tuple].mean() fails on pandas 2.x (tuple selection was removed).
# sort_index() keeps the alphabetical genre order that groupby used to produce.
genre_votes = genre_top10.sort_index()

f, (ax1, ax2, axcb) = plt.subplots(1, 3, gridspec_kw={'width_ratios': [30, 30, 1.5]})
# Share the y axis between the two heat maps; replaces get_shared_y_axes().join(),
# which was removed in matplotlib 3.8.
ax2.sharey(ax1)
f.suptitle('Gender wise Age group Vs Genres', fontsize=35, color='Teal')

a = sns.heatmap(genre_votes[male_cols], cmap="PRGn", annot=True, cbar=False, ax=ax1, linewidths=0.5)
a.axes.set_title('Male age group(Votes) vs Genres', fontsize=20, color='Orange')
a.set_ylabel('Genres', fontsize=20, color='Brown')
a.set_xlabel('Male Age Group', fontsize=20, color='Brown')

# The colour bar of the second heat map is drawn in the narrow third axes.
b = sns.heatmap(genre_votes[female_cols], cmap="PRGn", annot=True, linewidths=0.5, cbar_ax=axcb, ax=ax2)
b.axes.set_title('Female age group(Votes) vs Genres', fontsize=20, color='Orange')
b.set_xlabel('Female Age Group', fontsize=20, color='Brown')
# Hide the y ticks and labels on the right panel only; with a shared axis,
# set_yticks([]) would also clear the genre labels on the left panel.
ax2.tick_params(axis='y', left=False, labelleft=False)

f.set_figheight(10)
f.set_figwidth(15)

plt.show()

**`Inferences:`** Sci-Fi appears to be the highest rated genre in the age group of U18 for both males and females. Also, females in this age group have rated it a bit higher than the males in the same age group. What more can you infer from the two heatmaps that you have plotted? Write your three inferences/observations below:
- Inference 1: **In most cases the rating decreases as age increases (with some exceptions such as Crime, Romance, Sci-Fi and Thriller in the female age groups) - voters under 18 have given higher ratings than the other age groups.**  
- Inference 2: **The Crime genre seems to be rated low or least watched by all female age groups. Similarly, the Romance genre seems to be rated low or least watched by both genders aged 30 and above.** 
- Inference 3: **Males between 18 and 29 have given similar ratings to all genres (around 8). There is not much fluctuation in rating compared to the corresponding female group.**

-  ###  Subtask 3.4: US vs non-US Cross Analysis

In [None]:
# Flag each movie as 'USA' when its Country is exactly 'USA', otherwise 'non-USA'

is_usa_movie = movies['Country'] == 'USA'
movies['IFUS'] = np.where(is_usa_movie, 'USA', 'non-USA')
movies.head()

In [None]:
# Box plot - 1: vote counts from US voters (left) and non-US voters (right), split by IFUS

fig, axes = plt.subplots(nrows=1, ncols=2)
fig.suptitle('IFUS Vs CVotesUS', fontsize=35, color='Purple')

# (column to plot, panel title), one entry per panel
panels = [
    ('CVotesUS', 'USA People Vote Distribution'),
    ('CVotesnUS', 'Non-USA People Vote Distribution'),
]
for panel_ax, (vote_col, panel_title) in zip(axes, panels):
    sns.boxplot(x='IFUS', y=vote_col, data=movies, ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=22, color='Green')
    panel_ax.set_ylabel(vote_col, fontsize=20, color='Brown')
    panel_ax.set_xlabel('IFUS', fontsize=20, color='Brown')

fig.set_size_inches(15, 10)

plt.show()

**`Inferences:`** Write your two inferences/observations below:
- Inference 1: **For movies of both origins, non-USA people have cast more votes on average than USA people.**
- Inference 2: **People from the USA have voted more for USA movies than for non-USA movies.** 

In [None]:
# Box plot - 2: ratings from US voters (left) and non-US voters (right), split by IFUS

fig, axes = plt.subplots(nrows=1, ncols=2)
fig.suptitle('IFUS Vs VotesUS', fontsize=35, color='Purple')

# (column to plot, panel title), one entry per panel
panels = [
    ('VotesUS', 'USA People Vote Distribution'),
    ('VotesnUS', 'Non-USA People Vote Distribution'),
]
for panel_ax, (vote_col, panel_title) in zip(axes, panels):
    sns.boxplot(x='IFUS', y=vote_col, data=movies, ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=22, color='Green')
    panel_ax.set_ylabel(vote_col, fontsize=20, color='Brown')
    panel_ax.set_xlabel('IFUS', fontsize=20, color='Brown')

fig.set_size_inches(15, 10)

plt.show()

**`Inferences:`** Write your two inferences/observations below:
- Inference 1: **Movies of USA origin are rated higher on average by both USA and non-USA people, whereas non-USA movies are rated lower on average in both plots.**
- Inference 2: **Comparing the medians, people from the USA generally gave higher ratings to both movie types; their 25th percentile is 7.8, whereas the 25th percentile of non-USA people's votes is 7.6.**

-  ###  Subtask 3.5:  Top 1000 Voters Vs Genres

In [None]:
# Order the genres from most to fewest CVotes1000

genre_top10 = genre_top10.sort_values(by='CVotes1000', ascending=False)
genre_top10

In [None]:
# Bar chart of CVotes1000 per genre

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=genre_top10.index, y=genre_top10['CVotes1000'], ax=ax)
ax.set_title('Top 1000 Voters Vs Genres', fontsize=30, color='Green')

ax.set_xlabel('Movies genres', fontsize=20, color='Brown')
ax.set_ylabel('CVotes1000', fontsize=20, color='Brown')

plt.show()

**`Inferences:`** Write your inferences/observations here.
##### 1) The top 1000 voters liked Sci-Fi movies the most; the least preferred genre is Romance.
##### 2) Despite there being more Drama movies in the top 100, Drama has fewer votes than most other genres.
##### 3) Sci-Fi movies are the most preferred among the top 1000 voters, among males of all age groups and among females of all age groups.
##### 4) Drama, Animation and Romance have very few votes compared to Sci-Fi.