# *Preliminary Work*

---

## 0.1 - Importing the Required Modules

In [1]:
import pandas as pd
import numpy as np

##
---

## 0.2 - Importing and Cleaning the Dataset in `imdb.title.basics.csv`

### 0.2.1 - Import the Dataset in `imbd.title.basics`

In [2]:
imdb_titles_df = pd.read_csv("data/imdb.title.basics.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'data/imdb.title.basics.csv'

### 0.2.2 - Drop the Unnecessary Columns in `imdb_titles_df`

In [None]:
imdb_titles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [None]:
imdb_titles_df.drop(["original_title", "runtime_minutes"], axis=1, inplace=True)

In [None]:
imdb_titles_df.columns

Index(['tconst', 'primary_title', 'start_year', 'genres'], dtype='object')

### 0.2.3 - Rename the Relevant Columns in `imdb_titles_df`

In [None]:
imdb_titles_df.rename(columns={"tconst": "movie_ID", "primary_title": "title", "start_year": "year"}, inplace=True)

In [None]:
imdb_titles_df.columns

Index(['movie_ID', 'title', 'year', 'genres'], dtype='object')

### 0.2.4 - Drop Rows Containing Null Values in `imdb_titles_df`

In [None]:
imdb_titles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   movie_ID  146144 non-null  object
 1   title     146144 non-null  object
 2   year      146144 non-null  int64 
 3   genres    140736 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.5+ MB


In [None]:
imdb_titles_df.dropna(inplace=True)

In [None]:
imdb_titles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140736 entries, 0 to 146143
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   movie_ID  140736 non-null  object
 1   title     140736 non-null  object
 2   year      140736 non-null  int64 
 3   genres    140736 non-null  object
dtypes: int64(1), object(3)
memory usage: 5.4+ MB


### 0.2.5 - Convert Entries in the `genre` Column from Strings to Arrays

In [None]:
imdb_titles_df["genres"]

0            Action,Crime,Drama
1               Biography,Drama
2                         Drama
3                  Comedy,Drama
4          Comedy,Drama,Fantasy
                  ...          
146138    Adventure,History,War
146139                    Drama
146140              Documentary
146141                   Comedy
146143              Documentary
Name: genres, Length: 140736, dtype: object

In [None]:
imdb_titles_df["genres"] = imdb_titles_df["genres"].str.split(',')

In [None]:
imdb_titles_df["genres"]

0            [Action, Crime, Drama]
1                [Biography, Drama]
2                           [Drama]
3                   [Comedy, Drama]
4          [Comedy, Drama, Fantasy]
                    ...            
146138    [Adventure, History, War]
146139                      [Drama]
146140                [Documentary]
146141                     [Comedy]
146143                [Documentary]
Name: genres, Length: 140736, dtype: object

### 0.2.6 - View the Cleaned `imdb_titles_df` DataFrame

In [None]:
imdb_titles_df.head()

Unnamed: 0,movie_ID,title,year,genres
0,tt0063540,Sunghursh,2013,"[Action, Crime, Drama]"
1,tt0066787,One Day Before the Rainy Season,2019,"[Biography, Drama]"
2,tt0069049,The Other Side of the Wind,2018,[Drama]
3,tt0069204,Sabse Bada Sukh,2018,"[Comedy, Drama]"
4,tt0100275,The Wandering Soap Opera,2017,"[Comedy, Drama, Fantasy]"


##
---

## 0.3 - Creating a DataFrame (`id_to_genre_map`) that Maps *Movie ID* to *Genres*

### 0.3.1 - Initialize `id_to_genre_map` by Slicing from the `imbd_titles_df` DataFrame

In [None]:
id_to_genre_map = imdb_titles_df.loc[:, ["movie_ID", "genres"]]

In [None]:
id_to_genre_map.head()

Unnamed: 0,movie_ID,genres
0,tt0063540,"[Action, Crime, Drama]"
1,tt0066787,"[Biography, Drama]"
2,tt0069049,[Drama]
3,tt0069204,"[Comedy, Drama]"
4,tt0100275,"[Comedy, Drama, Fantasy]"


### 0.3.2 - Set the `movie_ID` Column as the Index of `id_to_genre_map`

In [None]:
id_to_genre_map.set_index("movie_ID", inplace=True)

In [None]:
id_to_genre_map.head()

Unnamed: 0_level_0,genres
movie_ID,Unnamed: 1_level_1
tt0063540,"[Action, Crime, Drama]"
tt0066787,"[Biography, Drama]"
tt0069049,[Drama]
tt0069204,"[Comedy, Drama]"
tt0100275,"[Comedy, Drama, Fantasy]"


### 0.3.3 - Extract the Individual Genres in `id_to_genre_map[genres]` into Their Own Dedicated Columns

In [None]:
id_to_genre_map["genre_1"] = id_to_genre_map["genres"].map(lambda x: x[0])
id_to_genre_map["genre_2"] = id_to_genre_map["genres"].map(lambda x: x[1] if len(x) > 1 else None)
id_to_genre_map["genre_3"] = id_to_genre_map["genres"].map(lambda x: x[2] if len(x) > 2 else None)

In [None]:
id_to_genre_map.head()

Unnamed: 0_level_0,genres,genre_1,genre_2,genre_3
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0063540,"[Action, Crime, Drama]",Action,Crime,Drama
tt0066787,"[Biography, Drama]",Biography,Drama,
tt0069049,[Drama],Drama,,
tt0069204,"[Comedy, Drama]",Comedy,Drama,
tt0100275,"[Comedy, Drama, Fantasy]",Comedy,Drama,Fantasy


### 0.3.4 - Drop the `genres` Column in `id_to_genre_map`

In [None]:
id_to_genre_map.drop(columns="genres", inplace=True)

In [None]:
id_to_genre_map.head()

Unnamed: 0_level_0,genre_1,genre_2,genre_3
movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0063540,Action,Crime,Drama
tt0066787,Biography,Drama,
tt0069049,Drama,,
tt0069204,Comedy,Drama,
tt0100275,Comedy,Drama,Fantasy


##
---

## 0.4 - Creating a DataFrame (`titleyear_to_id_map`) that Maps *Movie Title* and *Year of Release* to *Movie ID* 

### 0.4.1 - Initialize `titleyear_to_id_map` by Slicing from the `imbd_titles_df` DataFrame

In [None]:
titleyear_to_id_map = imdb_titles_df.loc[:, ["movie_ID", "title", "year"]]

In [None]:
titleyear_to_id_map.head()

Unnamed: 0,movie_ID,title,year
0,tt0063540,Sunghursh,2013
1,tt0066787,One Day Before the Rainy Season,2019
2,tt0069049,The Other Side of the Wind,2018
3,tt0069204,Sabse Bada Sukh,2018
4,tt0100275,The Wandering Soap Opera,2017


### 0.4.2 - Set the `title` and `year` Columns as the Index of `titleyear_to_id_map`

In [None]:
titleyear_to_id_map.set_index(["title", "year"], inplace=True)

In [None]:
titleyear_to_id_map.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,movie_ID
title,year,Unnamed: 2_level_1
Sunghursh,2013,tt0063540
One Day Before the Rainy Season,2019,tt0066787
The Other Side of the Wind,2018,tt0069049
Sabse Bada Sukh,2018,tt0069204
The Wandering Soap Opera,2017,tt0100275


##
---

## 0.5 - Exporting the Mapping DataFrames

### 0.5.1 - Create `maps` Directory 

In [None]:
mkdir maps

A subdirectory or file maps already exists.


### 0.5.2 - Export Mapping DataFrames to `map` Directory

In [None]:
id_to_genre_map.to_csv("maps/id_to_genre_map.csv")
titleyear_to_id_map.to_csv("maps/titleyear_to_id_map.csv")

##
---