<a href="https://colab.research.google.com/github/starkjones/Project-3/blob/main/Project_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import libraries:

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [14]:
# Load Data:

basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
# akas_url ="https://datasets.imdbws.com/title.akas.tsv.gz"

# DataFrame: 

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# **Preprocessing**

## **Filtering/Cleaning Steps:**

###**Title Basics:**

1. Replace "\N" with np.nan

2. Eliminate movies that are null for runtimeMinutes

3. Eliminate movies that are null for genre keep only titleType==Movie

4. keep startYear 2000-2022

5. Eliminate movies that include "Documentary" in genre (see tip below)

In [32]:
# Replace "\N" with np.nan

basics.replace({'\\N':np.nan}, inplace =True) 

In [33]:
basics['runtimeMinutes'].values

array(['1', '5', '4', ..., nan, '27', '10'], dtype=object)

In [34]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9018777 entries, 0 to 9018776
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 688.1+ MB


In [35]:
# Eliminate movies that are null for runtimeMinutes

basics.dropna(subset=['runtimeMinutes'], inplace=True)
basics['runtimeMinutes'].value_counts()

30      127627
60      101452
22       91823
44       68622
45       57304
         ...  
1845         1
1445         1
1174         1
497          1
2088         1
Name: runtimeMinutes, Length: 869, dtype: int64

In [36]:
basics['runtimeMinutes'].isna().sum()

0

In [37]:
# Eliminate movies that are null for genre:

basics.dropna(subset=['genres'], inplace=True)

basics['genres'].value_counts()

Drama                      165194
Documentary                162347
Comedy                     136561
Drama,Short                107197
Short                      101265
                            ...  
Crime,Sport                     1
Adventure,Sci-Fi,Sport          1
Family,Short,Talk-Show          1
Action,Family,History           1
Drama,Horror,Reality-TV         1
Name: genres, Length: 2194, dtype: int64

In [38]:
# Keep only titleType==Movie: 

movie = basics['titleType'] == 'movie'

basics = basics[movie]

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,,58,"Adventure,Drama"
1273,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"


In [39]:
basics['startYear'].value_counts()

2017    14169
2018    14075
2016    13803
2019    13659
2015    13316
        ...  
1906        1
1903        1
1899        1
1904        1
1896        1
Name: startYear, Length: 128, dtype: int64

In [42]:
# keep startYear 2000-2022:

# Conversion to date time 

basics['startYear'] =pd.to_datetime(basics['startYear'])


start_date = '2000'
end_date = '2023'

daterange = (basics['startYear'] >= start_date) & (basics['startYear'] <= end_date)

basics = basics.loc[daterange]

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021-01-01,,133,Documentary
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001-01-01,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020-01-01,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018-01-01,,122,Drama
77968,tt0079644,movie,November 1828,November 1828,0,2001-01-01,,140,"Drama,War"


In [44]:
# Eliminate movies that include "Documentary" in genre (see tip below):

documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~documentary]

basics.head()

basics['genres'].value_counts()

Drama                      34287
Comedy                     12954
Comedy,Drama                6194
Horror                      5525
Drama,Romance               4113
                           ...  
Family,Musical,Sport           1
Comedy,Game-Show               1
Horror,Music,Mystery           1
Adventure,History,Music        1
Action,History,Western         1
Name: genres, Length: 961, dtype: int64

## **Filtering/Cleaning Steps:**

###**AKAs:**

1. keep only US entries.

2. Replace "\N" with np.nan

In [3]:
# Load Data:

akas_url ="https://datasets.imdbws.com/title.akas.tsv.gz"

# DataFrame: 

akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

akas.head(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [6]:
# Keep only US entires:

us = akas['region'] == 'US'

akas = akas[us]

akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [8]:
# Replace "\N" with np.nan: 

akas.replace({'\\N':np.nan}, inplace =True) 

akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


## **Filtering/Cleaning Steps:**

###**Ratings:**

1. Replace "\N" with np.nan (if any)

In [9]:
# Load Data:

rating_url ="https://datasets.imdbws.com/title.ratings.tsv.gz"

# DataFrame: 
ratings = pd.read_csv(rating_url, sep='\t', low_memory=False)

ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1892
1,tt0000002,5.9,252
2,tt0000003,6.5,1682
3,tt0000004,5.7,165
4,tt0000005,6.2,2499


In [12]:
# Replace "\N" with np.nan: 

ratings.replace({'\\N':np.nan}, inplace =True) 

ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1892
1,tt0000002,5.9,252
2,tt0000003,6.5,1682
3,tt0000004,5.7,165
4,tt0000005,6.2,2499


# **Filtering one DataFrame based on another**

Filter the basics df to only include the movies that are present in your filter akas dataframe. This is how you will ultimately be able to filter the movies by region being in the US.  

In [21]:
# Filter the basics table down to only include the US by using the filter akas dataframe:

ustitles = basics['tconst'].isin(akas['titleId'])
basics[ustitles]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
9018666,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
9018704,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
9018713,tt9916720,short,The Nun 2,The Nun 2,0,2019,\N,10,"Comedy,Horror,Mystery"
9018728,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


# **Creating a "Data" folder**

In [22]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

[]

In [23]:
# Save current dataframe to file:

basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [24]:
# Save current dataframe to file:

akas.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [25]:
# Save current dataframe to file:

ratings.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)