In [1]:
#Importing libraries 
import pandas as pd
import numpy as np

In [2]:
# Importing file
basic = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas = "https://datasets.imdbws.com/title.akas.tsv.gz"
rating = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
# reading the csv file
basics = pd.read_csv(basic,sep='\t', low_memory=False)

In [4]:
# reading the csv file
aka = pd.read_csv(akas,sep='\t', low_memory=False)

In [5]:
# reading the csv file
rate = pd.read_csv(rating,sep='\t', low_memory=False)

## Cleaning and Handeling duplicate values - Title Basics

In [6]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9169391 entries, 0 to 9169390
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 629.6+ MB


In [7]:
# Checking for duplicate values
basics.duplicated().sum()

0

In [8]:
# Checking null values
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      10
originalTitle     10
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [9]:
# Replacing \N with na
basics.replace({'\\N':np.nan}, inplace=True)

In [10]:
# Checking null values
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           10
originalTitle          10
isAdult                 1
startYear         1213610
endYear           9073997
runtimeMinutes    6705858
genres             416842
dtype: int64

In [11]:
# Eliminate movies that are null for runtimeMinutes
basics.dropna(subset=['runtimeMinutes'],inplace=True)

In [12]:
# Checking null values
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear           37657
endYear           2416909
runtimeMinutes          0
genres              67320
dtype: int64

In [13]:
# Eliminate movies that are null for genre
basics.dropna(subset=['genres'],inplace=True)

In [14]:
# Checking null values
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           36314
endYear           2351155
runtimeMinutes          0
genres                  0
dtype: int64

In [15]:
newdf = basics.loc[basics['titleType'] == 'movie']

In [16]:
newdf['startYear'].value_counts()

2017    14186
2018    14131
2016    13819
2019    13798
2015    13324
        ...  
1894        1
1899        1
1904        1
1906        1
1896        1
Name: startYear, Length: 129, dtype: int64

In [17]:
newdf1 = newdf.loc[(newdf['startYear'] == '2000') 
                   | (newdf['startYear'] == '2001')
                   | (newdf['startYear'] == '2002')
                   | (newdf['startYear'] == '2003')
                   | (newdf['startYear'] == '2004')
                   | (newdf['startYear'] == '2005')
                   | (newdf['startYear'] == '2006')
                   | (newdf['startYear'] == '2007')
                   | (newdf['startYear'] == '2008')
                   | (newdf['startYear'] == '2009')
                   | (newdf['startYear'] == '2010')
                   | (newdf['startYear'] == '2011')
                   | (newdf['startYear'] == '2012')
                   | (newdf['startYear'] == '2013')
                   | (newdf['startYear'] == '2014')
                   | (newdf['startYear'] == '2015')
                   | (newdf['startYear'] == '2016')
                   | (newdf['startYear'] == '2017')
                   | (newdf['startYear'] == '2018')
                   | (newdf['startYear'] == '2019')
                   | (newdf['startYear'] == '2020')
                   | (newdf['startYear'] == '2021')
                   | (newdf['startYear'] == '2022')]

In [18]:
newdf1['startYear'].value_counts()

2017    14186
2018    14131
2016    13819
2019    13798
2015    13324
2014    12981
2013    12272
2021    11711
2012    11540
2020    11253
2011    10680
2010    10121
2009     9268
2008     8069
2022     7654
2007     6885
2006     6428
2005     5761
2004     5129
2003     4526
2002     4085
2001     3813
2000     3591
Name: startYear, dtype: int64

In [19]:
newdf1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215025 entries, 13079 to 9169341
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          215025 non-null  object
 1   titleType       215025 non-null  object
 2   primaryTitle    215025 non-null  object
 3   originalTitle   215025 non-null  object
 4   isAdult         215025 non-null  object
 5   startYear       215025 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  215025 non-null  object
 8   genres          215025 non-null  object
dtypes: object(9)
memory usage: 16.4+ MB


In [20]:
# Now we can change the startYear column to int
newdf1['startYear'] = newdf1['startYear'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf1['startYear'] = newdf1['startYear'].astype(int)


In [21]:
newdf1['genres'].value_counts()

Documentary                    50766
Drama                          34722
Comedy                         13098
Comedy,Drama                    6250
Horror                          5598
                               ...  
Family,Musical,Sport               1
Comedy,Game-Show                   1
Horror,Music,Mystery               1
Documentary,Sci-Fi,Thriller        1
Crime,Fantasy,Sci-Fi               1
Name: genres, Length: 1186, dtype: int64

In [22]:
newdf1.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear          int32
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [23]:
# Exclude movies that are included in the documentary category.
is_documentary = newdf1['genres'].str.contains('documentary',case=False)
newdf1 = newdf1[~is_documentary]


In [24]:
newdf1['genres'].value_counts()

Drama                        34722
Comedy                       13098
Comedy,Drama                  6250
Horror                        5598
Drama,Romance                 4163
                             ...  
Biography,Family,Mystery         1
Biography,Music,Mystery          1
Crime,Music,Mystery              1
Comedy,Reality-TV,Romance        1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 967, dtype: int64

## Cleaning and Handeling duplicate values - AKA

In [25]:
aka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32923519 entries, 0 to 32923518
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


In [26]:
akadf = aka.loc[aka['region'] == 'US']

In [27]:
akadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1344052 entries, 5 to 32923263
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1344052 non-null  object
 1   ordering         1344052 non-null  int64 
 2   title            1344052 non-null  object
 3   region           1344052 non-null  object
 4   language         1344052 non-null  object
 5   types            1344052 non-null  object
 6   attributes       1344052 non-null  object
 7   isOriginalTitle  1344052 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.3+ MB


In [28]:
# Checking null values
akadf.isna().sum()

titleId            0
ordering           0
title              0
region             0
language           0
types              0
attributes         0
isOriginalTitle    0
dtype: int64

In [29]:
# Replacing \N with na
akadf.replace({'\\N':np.nan}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  akadf.replace({'\\N':np.nan}, inplace=True)


In [30]:
# Checking null values
akadf.isna().sum()

titleId                  0
ordering                 0
title                    0
region                   0
language           1340366
types               380487
attributes         1299257
isOriginalTitle       1375
dtype: int64

## Cleaning and Handeling duplicate values - Ratings

In [31]:
# Replacing \N with na
rate.replace({'\\N':np.nan}, inplace=True)

In [32]:
rate.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

## Filtering one dataframe based on another

In [33]:
## Filtering basic dataframe based on aka dataframe

keepers =newdf1['tconst'].isin(akadf['titleId'])
keepers

34790       True
61090       True
67636       True
77930      False
86767       True
           ...  
9169063     True
9169072     True
9169111    False
9169156     True
9169240    False
Name: tconst, Length: 142337, dtype: bool

In [34]:
## Filtering rate dataframe based on aka dataframe

keepers1 =rate['tconst'].isin(akadf['titleId'])
keepers1

0           True
1           True
2          False
3          False
4           True
           ...  
1256529    False
1256530     True
1256531    False
1256532    False
1256533    False
Name: tconst, Length: 1256534, dtype: bool

In [35]:
# Filtered basic dataframe
newdf1 = newdf1[keepers]
newdf1


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61090,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67636,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86767,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
92732,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy
...,...,...,...,...,...,...,...,...,...
9168527,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9168923,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9169063,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9169072,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [36]:
# Filtered basic dataframe
rate1 = rate[keepers1]
rate1


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256
4,tt0000005,6.2,2519
5,tt0000006,5.1,173
6,tt0000007,5.4,783
...,...,...,...
1256504,tt9916204,8.1,242
1256511,tt9916348,8.5,17
1256512,tt9916362,6.4,4835
1256516,tt9916428,3.8,14


In [39]:
# basics table info
newdf1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82250 entries, 34790 to 9169156
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          82250 non-null  object
 1   titleType       82250 non-null  object
 2   primaryTitle    82250 non-null  object
 3   originalTitle   82250 non-null  object
 4   isAdult         82250 non-null  object
 5   startYear       82250 non-null  int32 
 6   endYear         0 non-null      object
 7   runtimeMinutes  82250 non-null  object
 8   genres          82250 non-null  object
dtypes: int32(1), object(8)
memory usage: 6.0+ MB


In [40]:
# Lets look at datatypes
newdf1.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear          int32
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [41]:
# aka table info

akadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1344052 entries, 5 to 32923263
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1344052 non-null  object
 1   ordering         1344052 non-null  int64 
 2   title            1344052 non-null  object
 3   region           1344052 non-null  object
 4   language         3686 non-null     object
 5   types            963565 non-null   object
 6   attributes       44795 non-null    object
 7   isOriginalTitle  1342677 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.3+ MB


In [42]:
# Lets look at datatypes
akadf.dtypes

titleId            object
ordering            int64
title              object
region             object
language           object
types              object
attributes         object
isOriginalTitle    object
dtype: object

In [43]:
# rate table info
rate1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474688 entries, 0 to 1256530
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         474688 non-null  object 
 1   averageRating  474688 non-null  float64
 2   numVotes       474688 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ MB


In [47]:
# Lets look at datatypes
rate1.dtypes

tconst            object
averageRating    float64
numVotes           int64
dtype: object

In [45]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")


[]

In [46]:
## Save current dataframe to file.
newdf1.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [48]:
## Save current dataframe to file.
akadf.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [49]:
## Save current dataframe to file.
rate1.to_csv("Data/title_rating.csv.gz",compression='gzip',index=False)

In [50]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy
