In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, fname) for fname in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import altair as alt
import re
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
# Copy wordcloud's built-in STOPWORDS into a set; it is handed to WordCloud
# further down so those words are left out of the clouds.
stopwords = set(STOPWORDS)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Load the raw Bollywood movies table from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='../input/bollywood-movies-19502019/bollywood_full.csv',
)

The dataset has some columns that are not useful for this analysis. We will drop them and rename the remaining columns for ease of use later on.

In [None]:
# Discard the columns that play no part in the analysis, then give the
# remaining ones shorter names that are easier to reference later.
df = (
    df
    .drop(columns=[
        'title_x', 'title_y', 'imdb_id', 'poster_path',
        'wiki_link', 'is_adult', 'tagline', 'release_date',
    ])
    .rename(columns={
        'original_title': 'title',
        'year_of_release': 'year',
        'imdb_rating': 'rating',
        'imdb_votes': 'votes',
        'wins_nominations': 'awards',
    })
)

**Cleaning process**

- The awards column contains both wins and nominations. We will separate it into wins and nominations, then clean and strip the values so they are usable for analysis later on.
- The runtime column is missing the duration for some movies; we will forward-fill those values. This is not the right way to fill them, but looking up the duration for each missing entry is not worth the effort. There are better ways to impute missing values; please try them if you feel like it.
- The actors column lists all the actors in the movie. We will take the first actor from each entry, as we will only explore the dataset with regard to the lead actor.
- The genres column lists, pipe-separated, all the genres a movie might belong to. We will take the first genre as the movie's main genre and discard the rest. Also, a few genres have a very low count; we will not use those in the analysis.
- Lastly, we will drop duplicates from the dataset.

In [None]:
# Data cleaning

# --- wins -------------------------------------------------------------------
# `awards` is free text that mixes wins and nominations. Read the number that
# directly precedes "win"/"wins" rather than the first number in the text, so
# an entry that lists only nominations is not mistaken for wins. Entries
# without a win count (including missing values) become 0.
# NOTE(review): assumes wording like "3 wins & 2 nominations" / "1 win" /
# "2 nominations" -- confirm against the raw CSV.
df['wins'] = (
    pd.to_numeric(
        df['awards'].astype(str).str.extract(r'(\d+)\s*wins?\b', flags=re.IGNORECASE, expand=False),
        errors='coerce',
    )
    .fillna(0)
    .astype(int)
)

# --- runtime ----------------------------------------------------------------
# '\N' marks a missing runtime. Forward-fill the gaps (crude, but good enough
# here); the extra back-fill only matters when the very first rows are missing,
# which would otherwise leave NaN behind and make the int conversion fail.
df['runtime'] = df['runtime'].replace('\\N', np.nan)
df['runtime'] = df['runtime'].ffill().bfill()
df['runtime'] = df['runtime'].astype(int)
# Keep movies of at least 60 minutes. `.copy()` detaches the filtered frame so
# the column assignments below do not write into a slice of the original.
df = df[df['runtime'] >= 60].copy()

# --- lead actor and main genre -----------------------------------------------
# Both columns are pipe-separated lists; only the first entry is kept. Taking
# `.str[0]` works however many entries a row has, unlike unpacking the split
# into a fixed number of columns.
df['lead_actor'] = df['actors'].str.split('|').str[0]
df['genre'] = df['genres'].str.split('|').str[0]
df = df.drop(columns=['genres', 'awards', 'actors'])

# Genres excluded from the analysis because they occur too rarely.
minor_genre_list = ['Animation','Documentary','History','Music','War','Sci-Fi','Sport']
df = df[~df['genre'].isin(minor_genre_list)]

df = df.drop_duplicates()

### Let’s look at the movies released in each year

In [None]:
# One dot per movie. The window transform ranks the movies inside each year and
# stores that rank in `id`, which is used as the (hidden) y position so the dots
# of a year stack into a column. Dots are coloured by genre.
(
    alt.Chart(df, title='Bollywood movies released over the years')
    .mark_circle(opacity=0.8)
    .transform_window(id='rank()', groupby=['year'])
    .encode(
        x=alt.X('year', scale=alt.Scale(type='linear', domain=[1950, 2020])),
        y=alt.Y('id:O', axis=None, sort='descending'),
        color='genre',
        tooltip=['title', 'year', 'rating'],
    )
    .properties(height=600, width=800)
)

It looks like drama has dominated Bollywood. But action movies have gained momentum since 1980, and comedy since the early 2000s. Also, it looks like thrillers are in vogue right now.

### Let’s look at how many movies of each genre were made each year

In [None]:
# Number of movies for every (genre, year) pair.
temp1 = df.groupby(['genre', 'year']).size().reset_index(name='count')

# Stacked bars: one bar per year, split by genre.
alt.Chart(temp1, title='Bollywood movies genre over the years').mark_bar(opacity=0.8).encode(
    # Use the same 1950-2020 window as the other year axes in this notebook, so
    # the earliest and latest years are not pushed outside the plotted range.
    alt.X('year', scale=alt.Scale(type='linear', domain=[1950, 2020])),
    alt.Y('count', stack='zero'),
    color='genre',
    # Show the year and the count too, so a hovered segment can be read off directly.
    tooltip=['genre', 'year', 'count']
).properties(height=500, width=800)

We can see from the chart above that drama dominated Bollywood until the 1970s. After that, action movies started gaining momentum, thanks to the "angry young man" Amitabh Bachchan. Action movies dominated Bollywood from 1980 until 2000; I feel the 80s through 2000 was the action era of Bollywood. Actors such as Akshay Kumar, Ajay Devgan, Sunil Shetty, Sunny Deol and Sanjay Dutt were dominating the Bollywood scene. From the early 2000s, comedy movies started catching up with drama and action. From 2000 to 2010 some of the best comedy movies, like Hera Pheri, were made. The count of romantic movies has been low compared to drama, action or comedy, and for a few brief periods hardly any romantic movies were made. Romantic movies, although low in count, have been big successes in Bollywood.

### Let’s explore the movies that won awards in bollywood

In [None]:
# Movies whose `wins` value is non-zero; later cells reuse `winners`.
winners = df.loc[df['wins'].ne(0)]

# Same stacked-dot layout as the release chart: the per-year rank stored in `id`
# is the hidden y position, and colour encodes the genre.
(
    alt.Chart(winners, title='Award winning movies over the years')
    .mark_circle(opacity=0.8)
    .transform_window(id='rank()', groupby=['year'])
    .encode(
        x=alt.X('year', scale=alt.Scale(type='linear', domain=[1950, 2020])),
        y=alt.Y('id:O', axis=None, sort='descending'),
        color='genre',
        tooltip=['title', 'year'],
    )
    .properties(height=600, width=800)
)

Wow, drama movies dominated the award scene until the late 80s. But since the 1990s we are seeing a healthy mix of genres among the award winners. We can also clearly see that Bollywood has started giving too many awards since 2000.

### Do movies that win multiple awards also have high ratings?

We will only look at the movies that have won more than 5 awards

In [None]:
# Rating against release year for every movie with more than 5 wins.
# No window transform is needed here: y is the rating itself, so a per-year
# rank would be computed but never read by any channel.
alt.Chart(winners[winners['wins']>5], title='Multiple award winning movies over the years').mark_circle(opacity=0.8).encode(
    alt.X('year', scale=alt.Scale(type='linear', domain=[1950, 2020])),
    alt.Y('rating', scale=alt.Scale(type='linear', domain=[0, 10])),
    # Include the plotted rating so the exact value can be read on hover.
    tooltip=['title', 'year', 'rating'],
    color='genre'
).properties(height=600, width=800)

But what about votes? Do the movies that have higher ratings also have the most votes?

### Let’s look at the movies that have won more than 5 awards with their rating and votes

In [None]:
# Vote count (log scale) against release year for movies with MORE than 5 wins.
# The filter is strict (> 5) so that it matches the chart title and selects the
# same movies as the rating chart for multiple award winners.
# No window transform is needed: y is the vote count, so a per-year rank would
# be computed but never read by any channel.
alt.Chart(winners[winners['wins']>5], title='Movies that won more than 5 award with rating and votes').mark_circle(opacity=0.8).encode(
    alt.X('year', scale=alt.Scale(type='linear', domain=[1950, 2020])),
    alt.Y('votes', scale=alt.Scale(type='log', domain=[10, 1000000])),
    tooltip=['title', 'year', 'votes'],
    color='genre'
).properties(height=600, width=800)

It seems that a lot of movies from the early Bollywood era haven’t been rated by many users. Only a handful of Bollywood movies have more than 100,000 votes, with 3 Idiots outclassing every other movie with a whopping 300K votes.

### Who is the actor with the most appearances in award-winning movies? Guess?

I am a big fan of Big B and King Khan. Here we will find out which lead actors appeared in the most award-winning movies.

In [None]:
# The ten lead actors that occur most often among the award-winning movies,
# as a two-column frame: `actor` and `appearences`.
temp2 = (
    winners['lead_actor']
    .value_counts()
    .head(10)
    .rename_axis('actor')
    .reset_index(name='appearences')
)

# One bar per actor; labels are tilted so the names do not overlap.
(
    alt.Chart(temp2, title='Lead actor with most appearnces in award winning movies')
    .mark_bar()
    .encode(
        x=alt.X('actor', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y("appearences"),
    )
    .properties(width=600)
)

Great, Big B would have been the obvious choice, considering that Big B’s career spans over five decades. Rajesh Khanna, aka Kaka, came second with 40 award-winning movies in a short career span. For a brief period, Kaka was the biggest star in Bollywood. King Khan came fourth, right after Ajay Devgan, in delivering award-winning movies.

### How good are movies of lead actors of bollywood?
We will look at the award-winning movies of leading Bollywood actors across time and see how they were rated by the public.

In [None]:
# Award-winning movies whose lead actor is one of the top actors in `temp2`.
temp3 = winners[winners['lead_actor'].isin(temp2.actor.tolist())]

# Rating against release year, coloured by lead actor.
# No window transform is needed: y is the rating itself, so a per-year rank
# would be computed but never read by any channel.
alt.Chart(temp3, title='Award winning Movies of Top actors').mark_circle(opacity=0.8).encode(
    alt.X('year', scale=alt.Scale(type='linear', domain=[1950, 2020])),
    alt.Y('rating', scale=alt.Scale(type='linear', domain=[0, 10])),
    tooltip=['title', 'year', 'rating','genre'],
    color='lead_actor'
).properties(height=600, width=800)

Interesting! A lot of bad movies have started winning awards in Bollywood in recent years, going as low as Humshakals and Himmatwala, each of which has a rating close to two and still won awards.

### Who is the most successful actor in bollywood?

The above analysis raises the question of who is the most successful lead actor in Bollywood. We will calculate the percentage of award-winning movies out of the total movies done by the leading Bollywood actors.

In [None]:
# Every movie (award-winning or not) led by one of the top actors.
top_actor_mask = df['lead_actor'].isin(temp3['lead_actor'].tolist())
movies_done_by_top_actors = df[top_actor_mask]

# Share of each actor's movies that won an award, in percent. The two count
# Series are aligned on the actor name before dividing.
awarded_per_actor = temp3['lead_actor'].value_counts()
total_per_actor = movies_done_by_top_actors['lead_actor'].value_counts()
success_percentage = (
    awarded_per_actor.div(total_per_actor).mul(100)
    .rename_axis('actor')
    .reset_index(name='% success')
)

(
    alt.Chart(success_percentage, title='What is success percentage of top actors')
    .mark_bar()
    .encode(
        x=alt.X('actor', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y("% success"),
    )
    .properties(width=600)
)

Expected. Shah Rukh Khan is known as King Khan for a reason, with the highest success rate at 80%.

### Are movies becoming shorter?

Let’s look at how the average movie length in Bollywood has changed over time.

In [None]:
# Mean runtime of the movies released in each year.
temp4 = df.groupby(['year'])['runtime'].mean().reset_index(name='avg_runtime')

# Area chart with a dark-blue outline and a vertical white-to-dark-blue gradient fill.
alt.Chart(temp4, title='Average runtime of movie over the years').mark_area(
    line={'color':'darkblue'},
    color=alt.Gradient(
        gradient='linear',
        stops=[alt.GradientStop(color='white', offset=0),
               alt.GradientStop(color='darkblue', offset=1)],
        x1=1,
        x2=1,
        y1=1,
        y2=0
    )
).encode(
    # Pin the year axis to the same 1950-2020 window as the other charts.
    # NOTE(review): without an explicit domain a quantitative axis may be
    # extended to include zero, squeezing all years into one corner -- confirm
    # against the Vega-Lite scale `zero` default.
    alt.X('year', scale=alt.Scale(type='linear', domain=[1950, 2020])),
    alt.Y('avg_runtime')
).properties(height=300, width=800)

OK. It seems movie length has decreased by a sweet 20–30 minutes over five decades.



### Does movie length impact its rating?
Is there a sweet spot of movie runtime that impacts the ratings? Do the directors know it?

In [None]:
# Binned scatter: runtime and rating are both bucketed, and the size of each
# circle is the number of movies that fall into that bucket pair.
(
    alt.Chart(df, title='How rating varies with runtime')
    .mark_circle()
    .encode(
        x=alt.X(
            'runtime',
            bin=alt.Bin(maxbins=20),
            scale=alt.Scale(type='linear', domain=[60, 220]),
        ),
        y=alt.Y('rating', bin=alt.Bin(maxbins=10)),
        size='count()',
    )
    .properties(height=300, width=800)
)

It seems Bollywood has a consensus on movie length: most movies fall between 120 and 140 minutes.

### Can you guess the highest rated Bollywood movies from 1950 to 2019?
Let’s look at highest rated movie each year by the genre.

In [None]:
# Highest rated movie of each year.
# A plain groupby(...).max() takes the maximum of every column independently,
# which pairs the top rating with the alphabetically last title and genre of
# that year. Instead, sort so that the best rated movie comes first within each
# year and keep that whole row, so title, genre and rating belong together.
# Movies without a rating cannot be ranked and are left out.
temp5 = (
    df.dropna(subset=['rating'])
    .sort_values(['year', 'rating'], ascending=[True, False], kind='mergesort')
    .drop_duplicates(subset='year', keep='first')
    .loc[:, ['year', 'title', 'genre', 'rating']]
    .reset_index(drop=True)
)

# One point per year, so no per-year window rank is needed here.
alt.Chart(temp5, title='Highest rates movies of all years').mark_circle(opacity=0.8).encode(
    alt.X('year', scale=alt.Scale(type='linear', domain=[1950, 2020])),
    alt.Y('rating', scale=alt.Scale(type='linear', domain=[0, 10])),
    tooltip=['title', 'year', 'rating','genre'],
    color='genre'
).properties(height=200, width=800)

Wow! Although drama dominated movies until 1980, the highest rated movies for those years are romantic. I guess some really good romantic gems are hidden there. Also, although action has dominated Bollywood since the 1990s, the ratings show that people love dramas more now.

### Can we guess movie genre from word cloud of summary?
Let’s see what the word cloud of the movie summaries in each genre tells us.

In [None]:
# Summaries grouped by genre. Grouping by the column name (not a one-element
# list) keeps `name` a plain genre string when iterating over the groups.
temp6 = df.groupby('genre')['summary']

# Circular mask on a 300x300 grid: 255 outside a circle of radius 130 centred
# at (150, 150), 0 inside it.
# NOTE(review): wordcloud is expected to draw only where the mask is not white
# and to take the canvas size from the mask rather than width/height -- confirm
# against the WordCloud documentation.
x, y = np.ogrid[:300, :300]

mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

for name, group in temp6:
    # Missing summaries are NaN floats, which str.join rejects; drop them first.
    text = ' '.join(group.dropna().astype(str))
    if not text.strip():
        # No summary text for this genre: nothing to draw.
        continue
    wordcloud = WordCloud(width=500, height=500, margin=0,background_color ='white',
                          stopwords = stopwords, mask=mask)
    wordcloud = wordcloud.generate(text)
    print(name)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()

It seems movie summaries are good descriptors of the genre.

### Can we recommend a similar movie based on its plot?

Let’s try to build a basic content-based recommender that recommends movies with similar storylines. We will create TF-IDF vectors from the movie storylines and calculate the cosine similarities between them.

In [None]:
# we are going to use cosine similarity to find movies with similar plot

# Movies that have a story, with the original row label of `df` kept as `id`
# and a fresh 0..n-1 index that lines up with the rows of the TF-IDF matrix.
movie_recommend_df = (
    df[['title', 'story']]
    .dropna(subset=['story'])
    .assign(id=lambda frame: frame.index)
    .reset_index(drop=True)
)

# TF-IDF over the 200 most frequent non-stop-word terms of the stories.
tf = TfidfVectorizer(analyzer='word', min_df=1, max_features= 200 ,stop_words='english', lowercase=True)
tfidf_matrix = tf.fit_transform(movie_recommend_df['story'])

# Pairwise dot products of the TF-IDF rows.
# NOTE(review): this equals cosine similarity only because TfidfVectorizer
# L2-normalises rows by default -- confirm if the `norm` argument is ever set.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# results[movie id] -> list of (similarity, other movie id), best match first.
results = {}
movie_ids = movie_recommend_df['id'].tolist()
n_candidates = 49  # how many top-scoring rows to consider per movie

for pos, movie_id in enumerate(movie_ids):
    scores = cosine_similarities[pos]
    ranked = scores.argsort()[::-1][:n_candidates]
    # Drop the movie itself by position instead of assuming it is ranked first:
    # with tied scores (e.g. a story containing none of the 200 terms) it may
    # sit anywhere in the ranking, or not be among the candidates at all.
    neighbours = [j for j in ranked if j != pos][:n_candidates - 1]
    results[movie_id] = [(scores[j], movie_ids[j]) for j in neighbours]

In [None]:
def item(id):
    """Look up a movie title by id.

    Returns the `title` entries of `movie_recommend_df` whose `id` column
    equals ``id``, as a pandas Series (empty when nothing matches).
    """
    id_matches = movie_recommend_df['id'] == id
    return movie_recommend_df.loc[id_matches, 'title']

def _movie_title(movie_id):
    """Return the title for ``movie_id`` as a plain string.

    ``item`` returns a pandas Series of matching titles; take its first entry,
    falling back to the id itself when nothing matches.
    """
    matches = item(movie_id)
    if len(matches) == 0:
        return str(movie_id)
    return str(matches.iloc[0])


def recommend(id, num):
    """Print the ``num`` movies most similar to the movie with the given ``id``.

    Parameters
    ----------
    id : key of the movie in ``results``.
    num : number of recommendations to print.

    Raises
    ------
    KeyError
        If ``id`` has no entry in ``results``.
    """
    # Look the recommendations up first so an unknown id fails before anything
    # is printed.
    recs = results[id][:num]
    # Titles are reduced to plain strings: concatenating a str with the Series
    # returned by ``item`` would print the Series repr instead of a sentence.
    print("Recommending " + str(num) + " movie similar to " + _movie_title(id))
    for score, rec_id in recs:
        print("We recommend : " + _movie_title(rec_id) + " (score:" + str(score) + ")")

In [None]:
# Show the three closest matches for the movie stored under id 1787.
recommend(id=1787, num=3)

Wow, even this naive recommender is able to recommend movies with good confidence. I guess I will take help from this recommender to pick a few flicks to watch in the future.

Phew! That was some exploration. I hope you have enjoyed the analysis as much as I did. Now you know who to chat with about movie-related conundrums.

**Don't forget to upvote the notebook** 