In [1]:
import pandas as pd

## 1. Data Cleaning

### TV Series Data

In [2]:
series_df = pd.read_csv('data/tv_data.csv')
series_df.head(3)

Unnamed: 0,adult,backdrop_path,created_by,episode_run_time,first_air_date,genres,homepage,id,in_production,languages,...,poster_path,production_companies,production_countries,seasons,spoken_languages,status,tagline,type,vote_average,vote_count
0,False,/etj8E2o0Bud0HkONVQPjyCkIvpv.jpg,"[{'id': 237053, 'credit_id': '5db8d867a1d33200...",[],2022-08-21,"[{'id': 10765, 'name': 'Sci-Fi & Fantasy'}, {'...",https://www.hbo.com/house-of-the-dragon,94997,True,['en'],...,/7QMsOTMUswlwxJP0rTTZfmz2tX2.jpg,"[{'id': 3268, 'logo_path': '/tuomPhY2UtuPTqqFn...","[{'iso_3166_1': 'US', 'name': 'United States o...","[{'air_date': '2022-08-21', 'episode_count': 5...","[{'english_name': 'English', 'iso_639_1': 'en'...",Returning Series,All must choose.,Scripted,8.422,4417
1,False,/mZCq3ldk7hUIyDvfZIOvTrxPWYS.jpg,[],[],2022-08-09,"[{'id': 10764, 'name': 'Reality'}]",,209374,True,['es'],...,/6Gy1ReRZ9sK9g8TPXZGz7CcQvrV.jpg,"[{'id': 59028, 'logo_path': '/lV4ZJIG0MrYgqFZz...","[{'iso_3166_1': 'CO', 'name': 'Colombia'}]","[{'air_date': '2022-08-08', 'episode_count': 3...","[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Returning Series,,Reality,5.6,8
2,False,/2Bkqjc9tNWo3KLr1aRYyJ7vMQtM.jpg,[],[],2023-10-22,"[{'id': 10764, 'name': 'Reality'}]",https://www.kijk.nl/programmas/de-bondgenoten,235493,True,['nl'],...,/wsipinibYxrZhpkQ6MKxIMgL0hv.jpg,"[{'id': 65119, 'logo_path': '/kejyjdzZg04SMRkR...","[{'iso_3166_1': 'NL', 'name': 'Netherlands'}]","[{'air_date': '2023-10-22', 'episode_count': 2...","[{'english_name': 'Dutch', 'iso_639_1': 'nl', ...",Returning Series,,Reality,6.875,16


In [3]:
### Unncessary column, that are not required in a recommendation engine can be dropped.

drop_columns = [
    'adult', 'backdrop_path', 'created_by', 'episode_run_time', 'homepage',
    'id', 'languages', 'last_air_date', 'last_episode_to_air', 'next_episode_to_air',
    'networks', 'number_of_episodes', 'number_of_seasons', 'origin_country',
    'original_language', 'original_name', 'poster_path', 'production_companies',
    'production_countries', 'seasons', 'spoken_languages', 'status', 'tagline', 'type'
]


series_df = series_df.drop(columns=drop_columns)

In [4]:
series_df.isna().value_counts()

first_air_date  genres  in_production  name   overview  popularity  vote_average  vote_count
False           False   False          False  False     False       False         False         8191
                                              True      False       False         False         1000
True            False   False          False  False     False       False         False           20
                                              True      False       False         False            1
Name: count, dtype: int64

In [5]:
### Missing data can be filled with empty strings

series_df.fillna('', inplace=True)

### Movies Data

In [6]:
movies_df = pd.read_csv("data/movies_data.csv")
movies_df.head(3)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/xg27NrXi7VXCGUr7MG75UqLl6Vg.jpg,"{'id': 1022790, 'name': 'Inside Out Collection...",200000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",https://movies.disney.com/inside-out-2,1022789,tt22022452,['US'],en,...,2024-06-11,1350092077,97,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Make room for new emotions.,Inside Out 2,False,7.695,1847
1,False,/fDmci71SMkfZM8RnCuXJVDPaSdE.jpg,"{'id': 86066, 'name': 'Despicable Me Collectio...",100000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",https://www.despicable.me,519182,tt7510222,['US'],en,...,2024-06-20,437809340,95,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Things just got a little more despicable.,Despicable Me 4,False,7.467,288
2,False,/Akv9GlCCMrzcDkVz4ad8MdLl9DK.jpg,,60000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",https://www.thegarfield-movie.com/,748783,tt5779228,"['US', 'GB']",en,...,2024-04-30,244865603,101,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Indoor cat. Outdoor adventure.,The Garfield Movie,False,7.253,434


In [7]:
drop_columns = [
    'adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'homepage',
    'id', 'imdb_id', 'origin_country', 'poster_path', 'production_companies',
    'production_countries', 'revenue', 'spoken_languages', 'status', 'tagline', 'video'
]


movies_df = movies_df.drop(columns=drop_columns)

In [8]:
movies_df.fillna('', inplace=True)

## 2. Data Preprocessing

In [9]:
### Helper function to parse genres
import ast

def parse_genres(genres):
    """
    Parses the genres string and returns a list of genre names.

    Args:
        genres (str): A string representation of genres.

    Returns:
        list: A list of genre names.

    """
    if isinstance(genres, str):
        genres = ast.literal_eval(genres)
        return [genre['name'] for genre in genres]

    return []

In [10]:
movies_df['genres'] = movies_df['genres'].apply(parse_genres)

In [11]:
series_df['genres'] = series_df['genres'].apply(parse_genres)

In [12]:
### Creating a multi label binarizer
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

# One-hot encode genres for movies
movie_genres_encoded = mlb.fit_transform(movies_df['genres'])
movie_genres_df = pd.DataFrame(movie_genres_encoded, columns=mlb.classes_)

# One-hot encode genres for TV series using the same MultiLabelBinarizer
series_genres_encoded = mlb.transform(series_df['genres'])
series_genres_df = pd.DataFrame(series_genres_encoded, columns=mlb.classes_)



In [13]:
### Ensure both dataframes have the same genre columns
movie_genres_df = movie_genres_df.reindex(columns=mlb.classes_, fill_value=0)
series_genres_df = series_genres_df.reindex(columns=mlb.classes_, fill_value=0)

In [14]:
### Concatenating the encoded genres with the original dataframes
movies_df = pd.concat([movies_df.drop(columns=['genres']), movie_genres_df], axis=1)
series_df = pd.concat([series_df.drop(columns=['genres']), series_genres_df], axis=1)

In [15]:
movies_df.head(3)

Unnamed: 0,original_language,original_title,overview,popularity,release_date,runtime,title,vote_average,vote_count,Action,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,en,Inside Out 2,Teenager Riley's mind headquarters is undergoi...,5696.178,2024-06-11,97,Inside Out 2,7.695,1847,0,...,0,0,0,0,0,0,0,0,0,0
1,en,Despicable Me 4,"Gru and Lucy and their girls — Margo, Edith an...",4379.882,2024-06-20,95,Despicable Me 4,7.467,288,1,...,0,0,0,0,0,0,0,0,0,0
2,en,The Garfield Movie,"Garfield, the world-famous, Monday-hating, las...",2381.187,2024-04-30,101,The Garfield Movie,7.253,434,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
series_df.head(3)

Unnamed: 0,first_air_date,in_production,name,overview,popularity,vote_average,vote_count,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,2022-08-21,True,House of the Dragon,The Targaryen dynasty is at the absolute apex ...,4461.815,8.422,4417,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2022-08-09,True,Top Chef VIP,,3335.632,5.6,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2023-10-22,True,De Bondgenoten,,2350.447,6.875,16,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 3. Normalizing the data

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [19]:
movies_df[['popularity', 'vote_average', 'vote_count']] = scaler.fit_transform(movies_df[['popularity', 'vote_average', 'vote_count']])
series_df[['popularity', 'vote_average', 'vote_count']] = scaler.fit_transform(series_df[['popularity', 'vote_average', 'vote_count']])

In [20]:
### Extract year from release_date for movies
movies_df['release_year'] = pd.to_datetime(movies_df['release_date']).dt.year
movies_df.drop(columns=['release_date'], inplace=True)

series_df['first_air_year'] = pd.to_datetime(series_df['first_air_date']).dt.year
series_df.drop(columns=['first_air_date'], inplace=True)

In [21]:
### Create dummy variables for original_language in movies
movies_df = pd.get_dummies(movies_df, columns=['original_language'], drop_first=True)

### Create dummy variables for in_production in TV series
seried_df = pd.get_dummies(series_df, columns=['in_production'], drop_first=True)


In [22]:
movies_df.head(3)

Unnamed: 0,original_title,overview,popularity,runtime,title,vote_average,vote_count,Action,Adventure,Animation,...,original_language_te,original_language_th,original_language_tl,original_language_tn,original_language_tr,original_language_uk,original_language_ur,original_language_vi,original_language_zh,original_language_zu
0,Inside Out 2,Teenager Riley's mind headquarters is undergoi...,1.0,97,Inside Out 2,0.7695,0.051304,0,1,1,...,False,False,False,False,False,False,False,False,False,False
1,Despicable Me 4,"Gru and Lucy and their girls — Margo, Edith an...",0.768915,95,Despicable Me 4,0.7467,0.008,1,0,1,...,False,False,False,False,False,False,False,False,False,False
2,The Garfield Movie,"Garfield, the world-famous, Monday-hating, las...",0.41803,101,The Garfield Movie,0.7253,0.012055,0,1,1,...,False,False,False,False,False,False,False,False,False,False


In [23]:
seried_df.head(3)

Unnamed: 0,name,overview,popularity,vote_average,vote_count,Action,Adventure,Animation,Comedy,Crime,...,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,first_air_year,in_production_True
0,House of the Dragon,The Targaryen dynasty is at the absolute apex ...,1.0,0.8422,0.188398,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2022.0,True
1,Top Chef VIP,,0.747172,0.56,0.000341,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2022.0,True
2,De Bondgenoten,,0.525997,0.6875,0.000682,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023.0,True


In [24]:
movies_df.to_csv('data/cleaned_movies_data.csv', index=False)
seried_df.to_csv('data/cleaned_series_data.csv', index=False)