# ***Project 3***
## Jonathan Jones
### 22.06.23

In [2]:
# Import libraries:

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [3]:
# Source of the IMDb "title basics" dump (gzipped, tab-separated).
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

# Read the whole file in one pass (low_memory=False) into a DataFrame.
basics = pd.read_csv(
    basics_url,
    sep='\t',
    low_memory=False,
)

# Preview the first five rows.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# ***Preprocessing***

## Title Basics Filtering/Cleaning Steps:

1. Replace "\N" with np.nan

2. Eliminate movies that are null for runtimeMinutes

3. Eliminate movies that are null for genres, and keep only titleType == "movie"

4. Keep startYear 2000-2022 (inclusive)

5. Eliminate movies that include "Documentary" in genre (see tip below)


In [4]:
# Swap the literal "\N" placeholder for a real missing value (np.nan)
# everywhere in the frame, rebinding `basics` to the cleaned result.
basics = basics.replace(to_replace='\\N', value=np.nan)

In [5]:
# Inspect the raw runtimeMinutes values as an array.
basics.loc[:, 'runtimeMinutes'].values

array(['1', '5', '4', ..., nan, '27', '10'], dtype=object)

In [None]:
# Print the frame summary (columns, non-null counts, dtypes, memory usage).
basics.info()

In [None]:
# Drop every title whose runtimeMinutes is missing,
# then show how often each remaining runtime value occurs.
basics = basics.dropna(subset=['runtimeMinutes'])
basics.loc[:, 'runtimeMinutes'].value_counts()

In [None]:
# Count the missing runtimeMinutes values that remain.
basics.loc[:, 'runtimeMinutes'].isna().sum()

In [None]:
# Drop every title whose genres value is missing,
# then show how often each remaining genre combination occurs.
basics = basics.dropna(subset=['genres'])

basics.loc[:, 'genres'].value_counts()

In [None]:
# Keep only rows whose titleType is exactly 'movie'.

# Boolean mask: True where the row is a movie.
movie = basics['titleType'].isin(['movie'])

basics = basics.loc[movie]

# Preview the filtered frame.
basics.head()

In [None]:
# Show how many titles fall in each startYear value.
basics.loc[:, 'startYear'].value_counts()

In [None]:
# Keep startYear 2000-2022:

# Convert startYear to datetime. With format='%Y' each value is parsed purely as
# a year, so every parsed timestamp lands on January 1st of that year
# (missing values become NaT and are excluded by the comparisons below).
basics['startYear'] = pd.to_datetime(basics['startYear'], format='%Y')

start_date = '2000'
end_date = '2023'  # exclusive upper bound

# Use `< end_date`, not `<=`: titles from 2023 parse to 2023-01-01, so an
# inclusive comparison against '2023' would keep them and break the
# 2000-2022 requirement.
daterange = (basics['startYear'] >= start_date) & (basics['startYear'] < end_date)

basics = basics.loc[daterange]

basics.head()

In [None]:
# Eliminate movies that include "Documentary" in genre (see tip below):

# Case-insensitive, literal (non-regex) substring match on the genres string.
# na=False treats a missing genres value as "not a documentary", so the mask is
# always boolean and can be safely inverted with `~`.
documentary = basics['genres'].str.contains('documentary', case=False, regex=False, na=False)
basics = basics[~documentary]

# Show the remaining genre combinations (only the last expression of a cell is
# displayed, so the earlier mid-cell `basics.head()` was a no-op and is removed).
basics['genres'].value_counts()

## AKAs Filtering/Cleaning Steps:

1. Keep only US entries.

2. Replace "\N" with np.nan

In [None]:
# Source of the IMDb "title akas" dump (gzipped, tab-separated).
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

# Read the whole file in one pass (low_memory=False) into a DataFrame.
akas = pd.read_csv(
    akas_url,
    sep='\t',
    low_memory=False,
)

# Preview the first five rows.
akas.head()

In [None]:
# Keep only US entries.

# Boolean mask: True where the row's region is exactly 'US'.
us = akas['region'].isin(['US'])

akas = akas.loc[us]

# Preview the filtered frame.
akas.head()

In [None]:
# Swap the literal "\N" placeholder for a real missing value (np.nan)
# everywhere in the frame, rebinding `akas` to the cleaned result.
akas = akas.replace(to_replace='\\N', value=np.nan)

# Preview the cleaned frame.
akas.head()

## Ratings Filtering/Cleaning Steps:

In [None]:
# Source of the IMDb "title ratings" dump (gzipped, tab-separated).
rating_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

# Read the whole file in one pass (low_memory=False) into a DataFrame.
ratings = pd.read_csv(
    rating_url,
    sep='\t',
    low_memory=False,
)

# Preview the first five rows.
ratings.head(5)

In [None]:
# Swap the literal "\N" placeholder for a real missing value (np.nan)
# everywhere in the frame, rebinding `ratings` to the cleaned result.
ratings = ratings.replace(to_replace='\\N', value=np.nan)

# Preview the cleaned frame.
ratings.head()

# Filtering one DataFrame based on another

In [None]:
# Filter the basics table down to titles that also appear in the (US-only) akas dataframe.

# Boolean mask: True where the title's tconst occurs among akas' titleId values.
ustitles = basics['tconst'].isin(akas['titleId'])

# Assign the filtered frame back to `basics`. Indexing without assignment only
# displays the filtered view and leaves `basics` itself unfiltered.
basics = basics[ustitles]

# Preview the filtered frame.
basics.head()

## Creating a "Data" folder

In [None]:
# Create the output folder; exist_ok=True means no error is raised if it is already there.
import os

os.makedirs('Data/', exist_ok=True)

# List the folder's contents to confirm it exists.
os.listdir("Data/")

In [None]:
# Save each dataframe to its own gzip-compressed CSV (row index omitted).
# Every frame needs a distinct file name: writing all three to
# "Data/title_basics.csv.gz" would leave only the last one written on disk.

basics.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)

akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)

ratings.to_csv("Data/title_ratings.csv.gz", compression='gzip', index=False)