In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import datetime as dt
import warnings
import plotly.graph_objects as go
# Notebook-wide configuration.
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# 1. Extract

In [112]:
# Read the movies CSV into the working DataFrame used by every later cell.
# The path is relative to the directory the notebook is run from.
data = pd.read_csv(filepath_or_buffer='datasets/imbd/imdb_top_1000.csv')

In [113]:
# Schema overview: column names, non-null counts and dtypes ...
data.info()
# ... followed by a preview of the first five rows.
data.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [114]:

# Count the missing values in each column.
# Per the output below, only Certificate, Meta_score and Gross have gaps.
data.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

# 2. Transform

In [115]:
# Impute the three columns that contain missing values
# (Certificate, Meta_score, Gross).

# Certificate is categorical: fill gaps with the most frequent label.
data['Certificate'] = data['Certificate'].fillna(data['Certificate'].mode()[0])

# Meta_score is numeric: fill gaps with the mean of the observed scores.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the selected column: that chained in-place form acts on an intermediate
# object and does not update `data` under pandas Copy-on-Write.
data['Meta_score'] = data['Meta_score'].fillna(data['Meta_score'].mean())

# Gross is loaded as text (object dtype): strip the ',' characters and parse
# it as a number. Missing entries stay NaN at this point.
data['Gross'] = pd.to_numeric(data['Gross'].str.replace(',', '', regex=False))
# Fill gaps with the mean of the *observed* values only. Routing the gaps
# through 0 first would drag the mean down (the zeros would be averaged in)
# and would also overwrite any genuine 0 gross.
data['Gross'] = data['Gross'].fillna(data['Gross'].mean())

# Sanity check: every column should now report 0 missing values.
data.isnull().sum()

Poster_Link      0
Series_Title     0
Released_Year    0
Certificate      0
Runtime          0
Genre            0
IMDB_Rating      0
Overview         0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Gross            0
dtype: int64

In [116]:
# Column clean-up, expressed as a single chain that rebinds `data`:
#   1. Runtime: strip the ' min' suffix and store the value as an integer.
#   2. Drop the columns that are not used by the analysis.
#   3. Remove the rows whose Released_Year holds the stray value 'PG'.
#   4. Rename Series_Title to Movies_Title.
# Released_Year itself is not converted here and keeps its loaded dtype.
data = (
    data
    .assign(Runtime=lambda frame: frame['Runtime'].str.replace(' min', '').astype(int))
    .drop(columns=['Poster_Link', 'Overview'])
    .loc[lambda frame: frame['Released_Year'] != 'PG']
    .rename(columns={"Series_Title": "Movies_Title"})
)
data.head()

Unnamed: 0,Movies_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,The Godfather,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,12 Angry Men,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


# 3. Data statistics

In [117]:
# Data statistics: heatmap of the pairwise correlation between attributes.
# Only numeric columns are passed to corr(). `data` still contains text
# columns (titles, genre, director, ...), and since pandas 2.0 DataFrame.corr()
# no longer drops them silently -- it raises instead. Selecting the numeric
# columns explicitly gives the same matrix on older pandas and works on newer.
fig = px.imshow(
    data.select_dtypes(include='number').corr(),
    text_auto=True,
    height=600,
    width=600,
    template='ggplot2',
    aspect='auto',
    title='Correlation of attributes',
    color_continuous_scale='reds',
)
fig.show()

# 4. Exploratory Analysis

In [121]:
# Bar chart of the ten movies with the highest vote counts, coloured by genre
# and annotated with the vote count on each bar.
top_voted = data.sort_values(by='No_of_Votes', ascending=False).iloc[:10]
fig = px.bar(
    data_frame=top_voted,
    x=top_voted['Movies_Title'],
    y=top_voted['No_of_Votes'],
    color=top_voted['Genre'],
    text=top_voted['No_of_Votes'],
    labels={
        'Movies_Title': 'Movie',
        'No_of_Votes': 'Number of votes',
        'text': 'count',
    },
    template='seaborn',
    title='Top 10 most voted movies',
)
fig.show()

In [123]:
# Scatter of runtime against IMDB rating for the 40 most voted movies,
# with one colour per director.
top_voted = data.sort_values(by='No_of_Votes', ascending=False).iloc[:40]
fig = px.scatter(
    data_frame=top_voted,
    x="Runtime",
    y="IMDB_Rating",
    color="Director",
    title='Runtime by rating',
)
fig.show()

In [120]:
# Grouping by year and counting the corresponding movies per year
movies_by_year = data.groupby('Released_Year').size().reset_index(name='count').sort_values(by='Released_Year', ascending=False)

fig = px.bar(movies_by_year, x='Released_Year', y='count', title='Number of movies by year')
fig.show()