In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Printed after the imports above so the reader can see the setup cell ran to completion.
print('Priyatama is ready!')

In [None]:
# Load the IMDB top-1000 CSV into the working frame, then preview its first two rows.
data = pd.read_csv(
    filepath_or_buffer='../input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv'
)
data.head(n=2)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Per-column dtypes as parsed by read_csv; useful for spotting numeric columns that were read as text.
data.dtypes

In [None]:
# Rows that exactly repeat an earlier row; an empty result means there are no full-row duplicates.
data.loc[data.duplicated(keep='first')]

In [None]:
# Visual map of missing values: each cell of the frame is drawn according to whether it is null.
sns.heatmap(data=data.isna())

In [None]:
# Remove every "," from the values of the Gross column (pattern replacement inside each string).
data['Gross'] = data['Gross'].replace(to_replace=',', value='', regex=True)

In [None]:
# Substitute 0 for every missing value in the Gross column.
data['Gross'] = data['Gross'].fillna(0)

In [None]:
# Inspect the current dtype of the Gross column.
data['Gross'].dtype

In [None]:
# Convert the Gross column to integers so it can be sorted and summed numerically.
data['Gross'] = data['Gross'].astype(dtype=int)

In [None]:
# Top 7 movies by votes, step 1:
# build a copy of the frame ordered from the most-voted title to the least-voted one.
# The bar chart in the next cell takes the first seven rows of this frame.
top_voted = data.sort_values(by='No_of_Votes', ascending=False)

In [None]:
# Bar chart of the seven most-voted titles on a black background.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_facecolor('Black')
sns.barplot(data=top_voted.head(7), x='Series_Title', y='No_of_Votes', palette='hls', ax=ax)
# Title and axis labels are applied after the seaborn call so they are the ones displayed.
ax.set_title('7 Top Voted Movies', fontweight='bold', fontsize=15)
ax.set_xlabel('Movies', fontsize=10, fontweight='bold')
ax.set_ylabel('Votes', fontsize=10, fontweight='bold')
plt.show()

In [None]:
# Top 7 movies by gross, step 1:
# order the frame from the highest Gross value to the lowest; the chart cell uses the first seven rows.
top_gross = data.sort_values(by='Gross', ascending=False)

In [None]:
# Bar chart of the seven highest-grossing titles on a black background.
fig, ax = plt.subplots(figsize=(23, 5))
ax.set(facecolor='black')
sns.barplot(x=top_gross['Series_Title'][:7], y=top_gross['Gross'][:7], palette='hls', ax=ax)
# Title and labels are set AFTER the seaborn call: depending on the seaborn version,
# barplot writes its own axis labels (the Series names) and would overwrite labels set earlier.
plt.title('Top 7 Gross Movies', fontsize=15, fontweight='bold')
plt.xlabel('Movies', fontsize=10, fontweight='bold')
plt.ylabel('Gross', fontsize=10, fontweight='bold')
# Explicit show() keeps the cell output to the figure only (no Axes repr), matching the other chart cells.
plt.show()

In [None]:
# Top 7 grossing directors, step 1:
# total Gross per director, as a two-column frame ['Director', 'Gross'].
# The aggregation must sum the Gross column itself; summing No_of_Votes and relabelling
# the result as 'Gross' would chart vote totals under a gross-revenue heading.
top_gross_dir = data.groupby('Director', as_index=False)['Gross'].sum()
top_gross_dir.columns = ['Director', 'Gross']

In [None]:
# Seven directors with the largest summed Gross, re-indexed 0..6.
# Reads `top_gross_dir` (the per-director totals built in the previous cell); the name
# `top_7_rated_dir` is kept because the plotting cell below refers to it.
top_7_rated_dir = (
    top_gross_dir
    .sort_values(by='Gross', ascending=False)
    .head(7)
    .reset_index(drop=True)
)
top_7_rated_dir

In [None]:
# Bar chart of the seven directors held in `top_7_rated_dir`, drawn on a black background.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_facecolor('black')
sns.barplot(data=top_7_rated_dir, x='Director', y='Gross', palette='hls', ax=ax)
ax.set_xlabel('Director', fontsize=10, fontweight='bold')
ax.set_ylabel('Gross Collection', fontsize=10, fontweight='bold')
ax.set_title('Top 7 Grossing Directors', fontsize=15, fontweight='bold')
plt.show()

In [None]:
# List the column labels to pick the fields used in the next charts.
data.columns

In [None]:
# Number of listed movies per release year, bars ordered from the most frequent year to the least.
fig, ax = plt.subplots(figsize=(30, 7))
ax.set(facecolor='black')
sns.countplot(
    x=data['Released_Year'],
    order=data['Released_Year'].value_counts().index,
    palette='hls',
    ax=ax,
)
plt.xticks(rotation=90)
# The x axis carries the years and the y axis the per-year movie count.
plt.xlabel('Years', fontsize=10, fontweight='bold')
plt.ylabel('Movies', fontsize=10, fontweight='bold')
plt.title('Movies by Year', fontsize=15, fontweight='bold')
plt.show()

In [None]:
# Question: for 'Iron Man', what is the poster link and who directed it?
# Select every row whose title contains the text 'Iron Man' and show all of its columns.
data.loc[data['Series_Title'].str.contains(pat='Iron Man')]

In [None]:
# Count of movies per certificate, bars ordered from the most common certificate to the least.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(facecolor='black')
sns.countplot(
    x=data['Certificate'],
    palette='hls',
    order=data['Certificate'].value_counts().index,
    ax=ax,
)
plt.title('Movies by Certification', fontsize=15, fontweight='bold')
plt.xlabel('Certificate')
plt.xticks(rotation=45)
plt.ylabel('Movies')
# show must be CALLED; a bare `plt.show` only evaluates to the function object.
plt.show()

In [None]:
# The ten directors with the most titles in the list.
# value_counts() already orders by descending frequency, so the first ten entries are the top ten.
data['Director'].value_counts().iloc[:10]

In [None]:
# Show all movies of the top 1 director.
# The director is looked up from the data (the label with the highest title count)
# instead of being typed in by hand, so the cell stays correct if the dataset changes.
top_director = data['Director'].value_counts().idxmax()
data.loc[data['Director'] == top_director, 'Series_Title']

In [None]:
# Re-list the column labels before working with the Star1..Star4 columns below.
data.columns

In [None]:
# Stars with the most occurrences in movies: one bar chart per billing column,
# each showing the ten most frequent names in that column.
stars = ['Star1', 'Star2', 'Star3', 'Star4']
fig, axs = plt.subplots(len(stars), 1, figsize=(20, 7))
for panel, star_col in zip(axs, stars):
    # Count once per column and reuse the result for both the bar positions and the heights.
    top_names = data[star_col].value_counts().head(10)
    panel.bar(top_names.index, top_names.values)
    panel.set_title(star_col)
    panel.set_ylabel("Appearances", weight="bold")
# The layout only needs to be fitted once, after every panel has been drawn.
fig.tight_layout()
plt.show()

In [None]:
# Top 10 genres, step 1:
# each Genre value is a comma-separated string; flatten all of them into one list of
# individual genre labels, trimmed of surrounding whitespace and lower-cased.
genre = [
    label.strip().lower()
    for genre_field in data['Genre']
    for label in genre_field.split(',')
]

In [None]:
# Tally how many times each label occurs in the flattened genre list.
from collections import Counter

count = Counter()
count.update(genre)
count

In [None]:
# Order the genre tallies from most to least frequent as a list of (genre, count) pairs.
# The tally is rebuilt from `genre` rather than read from `count`, because this cell rebinds
# `count` to a list; calling `.most_common()` on that list when the cell is re-run would fail.
# To keep only the top 10, pass the limit: Counter(genre).most_common(10).
count = Counter(genre).most_common()
count

In [None]:
# Turn the list of (genre, count) pairs into a two-column frame and preview the first five rows.
top_genre = pd.DataFrame(data=count, columns=['Genre', 'Count'])
top_genre.head(5)

In [None]:
# Horizontal bar chart of genre frequency: count on the x axis, genre label on the y axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(facecolor='black')
sns.barplot(x=top_genre['Count'], y=top_genre['Genre'], palette='hls', ax=ax)
plt.xlabel('Movies', fontsize=10, fontweight='bold')
plt.ylabel('Genre', fontsize=10, fontweight='bold')
plt.title('Top Genre', fontsize=15, fontweight='bold')
# show must be CALLED; a bare `plt.show` only evaluates to the function object.
plt.show()

In [None]:
# Re-list the column labels before building the rating table below.
data.columns

In [None]:
# Make a dataframe of the top 7 rated movies, with columns ['Movie', 'Rating'].
# Ratings are aggregated per title with max(), not sum(): if a title occurs on more than
# one row, adding its ratings together would produce a value above any real rating and
# push that title to the top of the ranking.
top_rated_movie = (
    data.groupby('Series_Title', as_index=False)['IMDB_Rating']
    .max()
    .rename(columns={'Series_Title': 'Movie', 'IMDB_Rating': 'Rating'})
    .sort_values(by='Rating', ascending=False)
    .head(7)
    .reset_index(drop=True)
)
top_rated_movie