 #### Import Necessary Libraries


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### Load Dataset

In [None]:
# Read the three CSV files of the Bollywood movie dataset into DataFrames:
# movie details, director ranking and actor ranking.
movies = pd.read_csv('../input/bollywood-movie-dataset/BollywoodMovieDetail.csv')
dir_rank = pd.read_csv('../input/bollywood-movie-dataset/BollywoodDirectorRanking.csv')
actor_rank = pd.read_csv('../input/bollywood-movie-dataset/BollywoodActorRanking.csv')

### Exploratory Data Analysis and Data Pre-Processing

In [None]:
# Preview the first rows of the actor ranking table.
actor_rank.head()

In [None]:
# Preview the first rows of the director ranking table.
dir_rank.head()

In [None]:
# Preview the first rows of the movie details table.
movies.head()

In [None]:
# Print a summary of the movies DataFrame (columns, non-null counts, dtypes).
movies.info()

#### Dealing with missing values

In [None]:
# Count the missing values in each column of the movies table.
movies.isna().sum()

In [None]:
# Count the missing values in each column of the actor ranking table.
actor_rank.isna().sum()

In [None]:
# Count the missing values in each column of the director ranking table.
dir_rank.isna().sum()

In [None]:
# Clean up missing values in the three tables.
#
# Column fills are assigned back to the DataFrame instead of calling
# fillna(inplace=True) on a selected column. That chained in-place form is
# deprecated in pandas 2.x and, with Copy-on-Write enabled, it updates a
# temporary object and leaves the DataFrame unchanged.

# Replace missing release dates with the most frequent release date (mode).
movies['releaseDate'] = movies['releaseDate'].fillna(movies['releaseDate'].mode()[0])

# Replace missing writers with the placeholder 'Unknown'.
movies['writers'] = movies['writers'].fillna('Unknown')

# Drop the remaining rows with missing values, since there are only a few of them.
movies.dropna(inplace=True)

# dir_rank: drop rows containing missing values.
# NOTE(review): the original author states there is only 1 such row - confirm
# against the isna() counts.
dir_rank.dropna(inplace=True)

# actor_rank: replace missing normalizedGoogleRank values with the mode, then
# drop any rows that still contain missing values in other columns.
actor_rank['normalizedGoogleRank'] = actor_rank['normalizedGoogleRank'].fillna(
    actor_rank['normalizedGoogleRank'].mode()[0]
)

actor_rank.dropna(inplace=True)

#### Check the datasets to verify there are no more missing values

In [None]:
# Print the per-column missing-value counts of the three tables, in the order
# movies, actor_rank, dir_rank, with a line of asterisks between them.
print(
    movies.isna().sum(),
    actor_rank.isna().sum(),
    dir_rank.isna().sum(),
    sep='\n****************************\n',
)

### Split the actors column to get the first and second actor names, and extract the release month

In [None]:
# Derive helper columns from the raw text fields of `movies`.

# 'actors' is a '|'-separated string; split it once and reuse the pieces.
# The first entry becomes First_Lead_Actor and the second Second_Lead_Actor
# (per the original author's note, in most cases these are the actor and the
# actress). A movie listing a single name gets NaN as Second_Lead_Actor.
cast_members = movies['actors'].str.split('|')
movies['First_Lead_Actor'] = cast_members.str[0]
movies['Second_Lead_Actor'] = cast_members.str[1]

# The release month is the second whitespace-separated token of 'releaseDate'
# (assumes a "<day> <month> <year>" layout - TODO confirm against the data).
# Splitting on any run of whitespace ignores leading or repeated spaces, and a
# value with fewer than two tokens yields NaN instead of raising IndexError.
movies['release_month'] = movies['releaseDate'].str.split().str[1]


movies



### Data Visualization

In [None]:
# Bar chart of the 15 most frequent genres, each bar labelled with its count.

top_genres = movies['genre'].value_counts().head(15).index
bold_15 = dict(fontsize=15, fontweight='bold')

plt.figure(figsize=(12, 8))
ax = plt.axes()
ax.set_facecolor('lightgreen')
ax.grid(False)

sns.countplot(x='genre', data=movies, order=top_genres, palette='bone_r', ax=ax)

# Write each bar's height just above the centre of the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., bar_height, str(int(bar_height)),
            fontsize=15, color='red', ha='center', va='bottom')

plt.xticks(rotation=90, fontsize=12)
ax.set_xlabel('Genre', **bold_15)
ax.set_ylabel('Count of Total Movies', **bold_15)
ax.set_title('Top 15 Movie Genre', **bold_15)

plt.show()

In [None]:
# Bar chart of the number of movies per release year, each bar labelled with
# its count. (The stand-alone value_counts() call that used to open this cell
# was removed: its result was neither stored nor displayed.)

plt.figure(figsize = (12,8))
ax = plt.axes()
ax.set(facecolor = 'grey')
ax.grid(False)

# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes".
sns.countplot(data = movies,x = 'releaseYear',palette = 'Set2',ax = ax)

# Write each bar's height just above the centre of the bar.
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%d' % int(p.get_height()),
            fontsize=15, color='black', ha='center', va='bottom')

plt.xticks(rotation = 90, fontsize = 12)
plt.xlabel('Release Year', fontsize = 15,fontweight = 'bold')
plt.ylabel('Count of movies', fontsize = 15,fontweight = 'bold')
plt.title('Total Movies Released Each Year', fontsize = 15,fontweight = 'bold')

plt.show()

In [None]:
# Bar chart of the number of movies per release month (the 'release_month'
# column), each bar labelled with its count.
bold_15 = dict(fontsize=15, fontweight='bold')

plt.figure(figsize=(12, 8))
ax = plt.axes()
ax.set_facecolor('lightyellow')
ax.grid(False)

sns.countplot(x='release_month', data=movies, palette='Paired', ax=ax)

# Write each bar's height just above the centre of the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., bar_height, str(int(bar_height)),
            fontsize=15, color='black', ha='center', va='bottom')

plt.xticks(rotation=90, fontsize=12)
ax.set_xlabel('Release Month', **bold_15)
ax.set_ylabel('Count of movies', **bold_15)
ax.set_title('Total Movies Released Each Month', **bold_15)

plt.show()

In [None]:
# Horizontal bar chart of the 25 names that appear most often as
# First_Lead_Actor, with the number of movies for each.
top_first_leads = movies['First_Lead_Actor'].value_counts().head(25).index
bold_15 = dict(fontsize=15, fontweight='bold')

plt.figure(figsize=(15, 10))
ax = plt.axes()
ax.set_facecolor('lightyellow')
ax.grid(False)

sns.countplot(y='First_Lead_Actor', data=movies, order=top_first_leads,
              palette='inferno', ax=ax)

ax.set_xlabel('Total Movies', **bold_15)
ax.set_ylabel('First Lead Actor', **bold_15)
ax.set_title('Total no of Movies First Lead Actor', **bold_15)

plt.show()


In [None]:
# Horizontal bar chart of the 25 names that appear most often as
# Second_Lead_Actor, with the number of movies for each.
top_second_leads = movies['Second_Lead_Actor'].value_counts().head(25).index
bold_15 = dict(fontsize=15, fontweight='bold')

plt.figure(figsize=(15, 10))
ax = plt.axes()
ax.set_facecolor('lightyellow')
ax.grid(False)

sns.countplot(y='Second_Lead_Actor', data=movies, order=top_second_leads,
              palette='inferno', ax=ax)

plt.xticks(rotation=90, fontsize=12)
ax.set_xlabel('Total Movies', **bold_15)
ax.set_ylabel('Second Lead Actor', **bold_15)
ax.set_title('Total no of Movies Second Lead Actor', **bold_15)

plt.show()

### Actors with the most movies

In [None]:
# Horizontal bar chart of the actors with the highest movieCount in actor_rank.
#
# The number of actors shown is kept in one place so the chart title cannot
# drift from the data: previously 25 rows were selected while the title
# claimed "Top 15".
top_n = 25
totalmovies = (
    actor_rank[['actorName', 'movieCount']]
    .sort_values(by='movieCount', ascending=False)
    .head(top_n)
)

plt.figure(figsize = (15,10))
ax = plt.axes()
ax.set(facecolor = 'lightyellow')
ax.grid(False)


sns.barplot(data = totalmovies,x = 'movieCount',y = 'actorName',palette = 'Paired',ax = ax)


plt.xlabel('Total Movies', fontsize = 15,fontweight = 'bold')
plt.ylabel('Actor Name', fontsize = 15,fontweight = 'bold')
plt.title('Top %d actor with most no of movies' % top_n, fontsize = 15,fontweight = 'bold')

plt.show()


### Actor with High Ratings

In [None]:
# Bar chart of the 15 actors with the highest normalizedRating in actor_rank.
ratings_sorted = actor_rank[['actorName', 'normalizedRating']].sort_values(
    by='normalizedRating', ascending=False
)
movierank = ratings_sorted.head(15)
bold_15 = dict(fontsize=15, fontweight='bold')

plt.figure(figsize=(15, 10))
ax = plt.axes()
ax.set_facecolor('lightyellow')
ax.grid(False)

sns.barplot(x='actorName', y='normalizedRating', data=movierank, palette='flare', ax=ax)
plt.xticks(rotation=90, fontsize=12)
ax.set_xlabel('Actor Name', **bold_15)
ax.set_ylabel('Ratings', **bold_15)
ax.set_title('Actors with High Ratings', **bold_15)
plt.show()

### Most popular actors on the internet

In [None]:
# Horizontal bar chart of the 15 actors with the largest normalizedGoogleRank.
# NOTE(review): this treats a larger normalizedGoogleRank as "more popular";
# confirm the direction of the scale against the dataset description.
googlehits = (
    actor_rank[['actorName', 'normalizedGoogleRank']]
    .sort_values(by='normalizedGoogleRank', ascending=False)
    .head(15)
)

plt.figure(figsize = (15,10))
ax = plt.axes()
ax.set(facecolor = 'lightyellow')
ax.grid(False)

sns.barplot(data = googlehits,x = 'normalizedGoogleRank',y = 'actorName',palette = 'PiYG',ax = ax)
plt.xticks(fontsize = 12)
# The x axis shows normalizedGoogleRank, not a rating; the previous 'Ratings'
# label was left over from the ratings chart.
plt.xlabel('Normalized Google Rank',fontsize = 15,fontweight = 'bold')
plt.ylabel('Actor Name',fontsize = 15,fontweight = 'bold')
plt.title('Most popular Actor on Internet',fontsize = 15,fontweight = 'bold')
plt.show()

### Number of movies released each year

In [None]:

# Line chart of the number of movies released in each year.
#
# The year axis and the count column are named explicitly. The default column
# names produced by value_counts().reset_index() depend on the pandas version
# ('index' / 'releaseYear' before 2.0, 'releaseYear' / 'count' from 2.0), so
# plotting x='index' fails with a missing-column error on current pandas.
movie_each_year = (
    movies['releaseYear']
    .value_counts()
    .rename_axis('releaseYear')
    .reset_index(name='movieCount')
)


plt.figure(figsize = (15,10))
ax = plt.axes()
ax.set(facecolor = 'lightyellow')
ax.grid(False)
# `palette` is omitted: without a `hue` variable seaborn ignores it, and the
# line colour is already fixed by `color`.
sns.lineplot(data = movie_each_year,x = 'releaseYear',y = 'movieCount',marker = 'o',color = 'red', linewidth =3.5, ax = ax)
plt.xlabel('Year',fontsize = 15,fontweight = 'bold')
plt.ylabel('Movie Count',fontsize = 15,fontweight = 'bold')
plt.title('Number of Movie Released Each Year',fontsize = 15,fontweight = 'bold')
plt.show()

## TO BE CONTINUED........