## Cleaning and Merging the datasets

In [280]:
# Import necessary libraries
import pandas as pd

In [281]:
# Load the datasets using raw string notation to avoid escape character issues
data1 = pd.read_csv('https://raw.githubusercontent.com/r41ss4/rennes_ba/refs/heads/main/data/cleaned/clean_datan1.csv')
data2 = pd.read_csv('https://raw.githubusercontent.com/r41ss4/rennes_ba/refs/heads/main/data/cleaned/clean_datan2.csv')
data3 = pd.read_csv('https://raw.githubusercontent.com/r41ss4/rennes_ba/refs/heads/main/data/cleaned/clean_datan3.csv')

In [282]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

In [283]:
# Function to determine if the entry is a movie or a show
def determine_type(duration_min):
    if duration_min > 60:
        return 'Movie'
    else:
        return 'Show'

In [284]:
# Apply the function to create the 'type' column
data1['type'] = data1['duration_min'].apply(determine_type)
# Review changes
data1

Unnamed: 0,title,year,certificate,duration_min,genre,rating,description,stars,votes,type
0,Better Call Saul,2015,TV-MA,46.00000,"Crime, Drama",8.900000,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384.000000,Show
1,Cyberpunk: Edgerunners,2022,TV-MA,24.00000,"Animation, Action, Adventure",8.600000,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413.000000,Show
2,1899,2022,TV-MA,60.00000,"Drama, History, Horror",9.600000,Multinational immigrants traveling from the ol...,"['Ben Ashenden, ', 'Aneurin Barnard, ', 'Emily...",853.000000,Show
3,Grey's Anatomy,2005,TV-14,41.00000,"Drama, Romance",7.600000,A drama centered on the personal and professio...,"['Ellen Pompeo, ', 'Chandra Wilson, ', 'James ...",303617.000000,Show
4,The Accidental Narco,2022,TV-MA,50.00000,"Action, Biography, Crime",7.400000,A civilian businessman who has no choice but t...,"['Ha Jung-woo, ', 'Hwang Jung-min, ', 'Park Ha...",2056.000000,Show
...,...,...,...,...,...,...,...,...,...,...
7535,Salmo ft Coez: Sparare alla luna,2018,TV-MA,80.57364,Music,6.500751,Add a Plot,['Younuts'],19937.375498,Movie
7536,American Masters,1985,Not Rated,87.00000,"Documentary, Biography, Music",7.600000,"Archival footage, live performances, and inter...","['Bruce Ricker', '| ', ' Stars:', 'Tony Ben...",165.000000,Movie
7537,The Weekly with Wendy Mesley,2018,TV-MA,80.57364,"News, Reality-TV",6.500751,How did Ontario's popular new Premier get so u...,"['Wendy Mesley, ', 'Billy Porter, ', 'Jacob To...",19937.375498,Movie
7538,The Drew Barrymore Show,2020,TV-PG,44.00000,Talk-Show,6.000000,In the inaugural episode of The Drew Barrymore...,"['Adam Heydt', '| ', ' Stars:', 'Drew Barry...",19.000000,Show


In [285]:
# Rename the column 'runtime' to 'duration_min'
data3.rename(columns={'runtime': 'duration_min'}, inplace=True)

In [286]:
# Add a decimal place to the 'duration_min' column
data3['duration_min'] = data3['duration_min'].astype(float).round(1)

In [287]:
# Apply the function to create the 'type' column
data3['type'] = data3['duration_min'].apply(determine_type)
# Review changes
data3

Unnamed: 0,title,genre,language,imdb_score,premiere,duration_min,year,type
0,Notes for My Son,Drama,Spanish,6.3,11/24/2020,83.0,2020-01-01,Movie
1,"To Each, Her Own",Romantic comedy,French,5.3,6/24/2018,95.0,2018-01-01,Movie
2,The Lovebirds,Romantic comedy,English,6.1,5/22/2020,87.0,2020-01-01,Movie
3,Happy Anniversary,Romantic comedy,English,5.8,3/30/2018,78.0,2018-01-01,Movie
4,Why Did You Kill Me?,Documentary,English,5.6,4/14/2021,83.0,2021-01-01,Movie
...,...,...,...,...,...,...,...,...
520,Rolling Thunder Revue: A bob Dylan Story by Ma...,Documentary,English,7.6,6/12/2019,144.0,2019-01-01,Movie
521,Freaks: You're One of Us,Supernatural drama,German,5.4,9/2/2020,92.0,2020-01-01,Movie
522,Squared Love,Romantic comedy,Polish,5.0,2/11/2021,102.0,2021-01-01,Movie
523,My Own Man,Documentary,English,6.4,12/13/2014,81.0,2014-01-01,Movie


In [288]:
# Convert the 'year' column to a datetime object if it's not already for data3
data3['year'] = pd.to_datetime(data3['year'], errors='coerce')  # Coerce invalid formats to NaT

In [289]:
# Extract only the year part
data3['year'] = data3['year'].dt.year.astype(str)

In [290]:
# Preview the cleaned year column
print(data3[['title', 'year']].head())

                  title  year
0      Notes for My Son  2020
1      To Each, Her Own  2018
2         The Lovebirds  2020
3     Happy Anniversary  2018
4  Why Did You Kill Me?  2021


In [291]:
# Transform the 'rating score' column for data2
data2['rating'] = (data2['user rating score'] / 10).round(1)

In [292]:
# Drop the old 'user rating score' column if no longer needed
data2 = data2.drop(columns=['user rating score'])

In [293]:
# Preview the new rate column
print(data2[['title', 'rating']].head())

                   title  rating
0           White Chicks     8.2
1             Death Note     7.7
2                 Naruto     8.8
3             The Hunter     7.7
4  Lottie Dottie Chicken     7.7


In [294]:
# Rename the column 'imdb_score' to 'rating' from data 3
data3.rename(columns={'imdb_score': 'rating'}, inplace=True)

In [295]:
# Preview the new rate column
print(data3.head())

                  title            genre language  rating    premiere  \
0      Notes for My Son            Drama  Spanish     6.3  11/24/2020   
1      To Each, Her Own  Romantic comedy   French     5.3   6/24/2018   
2         The Lovebirds  Romantic comedy  English     6.1   5/22/2020   
3     Happy Anniversary  Romantic comedy  English     5.8   3/30/2018   
4  Why Did You Kill Me?      Documentary  English     5.6   4/14/2021   

   duration_min  year   type  
0          83.0  2020  Movie  
1          95.0  2018  Movie  
2          87.0  2020  Movie  
3          78.0  2018  Movie  
4          83.0  2021  Movie  


In [296]:
# Rename the column 'release year' to 'year' from data 2
data2.rename(columns={'release year': 'year'}, inplace=True)

In [297]:
# Preview the new year column
data2.head()

Unnamed: 0,title,rating,ratingLevel,ratingDescription,year,user rating size
0,White Chicks,8.2,"crude and sexual humor, language and some drug...",80,2004,80
1,Death Note,7.7,Parents strongly cautioned. May be unsuitable ...,90,2006,80
2,Naruto,8.8,Parental guidance suggested. May not be suitab...,70,2008,80
3,The Hunter,7.7,language and brief violence,100,2011,82
4,Lottie Dottie Chicken,7.7,Suitable for all ages.,10,2009,82


In [298]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

In [299]:
# Check for missing values
print("Data 1:")
print(data1.isnull().sum())

print("\nData 2:")
print(data2.isnull().sum())

print("\nData 3:")
print(data3.isnull().sum())

Data 1:
title           0
year            0
certificate     0
duration_min    0
genre           0
rating          0
description     0
stars           0
votes           0
type            0
dtype: int64

Data 2:
title                0
rating               0
ratingLevel          0
ratingDescription    0
year                 0
user rating size     0
dtype: int64

Data 3:
title           0
genre           0
language        0
rating          0
premiere        0
duration_min    0
year            0
type            0
dtype: int64


In [300]:
# Step 1: Merge Dataset 1 and Dataset 2 on 'title' , 'year' and 'rating'
merged_1_2 = pd.merge(data1, data2, on=['title', 'year' , 'rating'], how='outer', suffixes=('_data1', '_data2'))

In [301]:
# Convert 'year' column to int in both DataFrames to ensure they are of the same type
merged_1_2['year'] = merged_1_2['year'].astype(int)  # Convert to int
data3['year'] = data3['year'].astype(int)  # Convert to int

In [302]:
# Step 2: Merge the result with Dataset 3 on 'title' , 'year' , 'rating' and 'genre'
final_merged_data = pd.merge(merged_1_2, data3, on=['title', 'year', 'rating', 'genre'], how='outer', suffixes=('', '_data3'))

In [303]:
# Preview the new dataset
print(final_merged_data.head())

                           title  year certificate  duration_min  \
0                       #ABtalks  2018       TV-MA      80.57364   
1                         #Alive  2020       TV-MA      98.00000   
2  #AnneFrank - Parallel Stories  2019       TV-MA      92.00000   
3                       #BlackAF  2020       TV-MA      36.00000   
4            #FriendButMarried 2  2020       TV-14     100.00000   

                      genre  rating  \
0                    Comedy     9.1   
1     Action, Drama, Horror     6.3   
2            Drama, History     6.5   
3                    Comedy     6.7   
4  Biography, Comedy, Drama     6.9   

                                         description  \
0  #ABtalks is an interview show hosted by Anas B...   
1  The rapid spread of an unknown infection has l...   
2  It is based on five women who did survive the ...   
3  A father takes an irreverent and honest approa...   
4  Ayudia (Mawar De Jongh) is not satisfied enoug...   

                   

In [304]:
# Check for missing values
print(final_merged_data.isnull().sum())

title                    0
year                     0
certificate            734
duration_min           734
genre                  249
rating                   0
description            734
stars                  734
votes                  734
type                   734
ratingLevel           8023
ratingDescription     8023
user rating size      8023
language              7749
premiere              7749
duration_min_data3    7749
type_data3            7749
dtype: int64


In [305]:
# Remove '#' from the 'title' column in the merged dataset
final_merged_data['title'] = final_merged_data['title'].str.replace('#', '', regex=False)

In [306]:
# Preview the new dataset
print(final_merged_data.head())

                          title  year certificate  duration_min  \
0                       ABtalks  2018       TV-MA      80.57364   
1                         Alive  2020       TV-MA      98.00000   
2  AnneFrank - Parallel Stories  2019       TV-MA      92.00000   
3                       BlackAF  2020       TV-MA      36.00000   
4            FriendButMarried 2  2020       TV-14     100.00000   

                      genre  rating  \
0                    Comedy     9.1   
1     Action, Drama, Horror     6.3   
2            Drama, History     6.5   
3                    Comedy     6.7   
4  Biography, Comedy, Drama     6.9   

                                         description  \
0  #ABtalks is an interview show hosted by Anas B...   
1  The rapid spread of an unknown infection has l...   
2  It is based on five women who did survive the ...   
3  A father takes an irreverent and honest approa...   
4  Ayudia (Mawar De Jongh) is not satisfied enoug...   

                         

In [307]:
# Remove '#' from the 'description' column in the merged dataset
final_merged_data['description'] = final_merged_data['description'].str.replace('#', '', regex=False)

In [308]:
# Preview the new dataset
print(final_merged_data.head())

                          title  year certificate  duration_min  \
0                       ABtalks  2018       TV-MA      80.57364   
1                         Alive  2020       TV-MA      98.00000   
2  AnneFrank - Parallel Stories  2019       TV-MA      92.00000   
3                       BlackAF  2020       TV-MA      36.00000   
4            FriendButMarried 2  2020       TV-14     100.00000   

                      genre  rating  \
0                    Comedy     9.1   
1     Action, Drama, Horror     6.3   
2            Drama, History     6.5   
3                    Comedy     6.7   
4  Biography, Comedy, Drama     6.9   

                                         description  \
0  ABtalks is an interview show hosted by Anas Bu...   
1  The rapid spread of an unknown infection has l...   
2  It is based on five women who did survive the ...   
3  A father takes an irreverent and honest approa...   
4  Ayudia (Mawar De Jongh) is not satisfied enoug...   

                         

## First Exploratory Data Analysis (EDA)

In [310]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [311]:
# Check the first few rows of the data
print(final_merged_data.head())

                          title  year certificate  duration_min  \
0                       ABtalks  2018       TV-MA      80.57364   
1                         Alive  2020       TV-MA      98.00000   
2  AnneFrank - Parallel Stories  2019       TV-MA      92.00000   
3                       BlackAF  2020       TV-MA      36.00000   
4            FriendButMarried 2  2020       TV-14     100.00000   

                      genre  rating  \
0                    Comedy     9.1   
1     Action, Drama, Horror     6.3   
2            Drama, History     6.5   
3                    Comedy     6.7   
4  Biography, Comedy, Drama     6.9   

                                         description  \
0  ABtalks is an interview show hosted by Anas Bu...   
1  The rapid spread of an unknown infection has l...   
2  It is based on five women who did survive the ...   
3  A father takes an irreverent and honest approa...   
4  Ayudia (Mawar De Jongh) is not satisfied enoug...   

                         

In [312]:
# Get a summary of the dataframe
print(final_merged_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8274 entries, 0 to 8273
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               8274 non-null   object 
 1   year                8274 non-null   int64  
 2   certificate         7540 non-null   object 
 3   duration_min        7540 non-null   float64
 4   genre               8025 non-null   object 
 5   rating              8274 non-null   float64
 6   description         7540 non-null   object 
 7   stars               7540 non-null   object 
 8   votes               7540 non-null   float64
 9   type                7540 non-null   object 
 10  ratingLevel         251 non-null    object 
 11  ratingDescription   251 non-null    float64
 12  user rating size    251 non-null    float64
 13  language            525 non-null    object 
 14  premiere            525 non-null    object 
 15  duration_min_data3  525 non-null    float64
 16  type_d

In [313]:
# Describe the numerical columns
print(final_merged_data.describe())

              year  duration_min       rating         votes  \
count  8274.000000   7540.000000  8274.000000  7.540000e+03   
mean   1873.032149     80.573640     6.523805  1.993738e+04   
std     517.216710     45.060463     1.116449  7.318313e+04   
min       0.000000      1.000000     1.700000  5.000000e+00   
25%    2014.000000     59.000000     5.900000  5.100000e+02   
50%    2018.000000     80.573640     6.500751  2.421000e+03   
75%    2020.000000     98.000000     7.300000  1.993738e+04   
max    2025.000000    990.000000     9.800000  1.819157e+06   

       ratingDescription  user rating size  duration_min_data3  
count         251.000000        251.000000          525.000000  
mean           71.254980         81.215139           93.396190  
std            32.681841          0.976489           26.941185  
min            10.000000         80.000000            4.000000  
25%            41.000000         80.000000           86.000000  
50%            70.000000         82.000000

In [314]:
# Check for missing values
print(final_merged_data.isnull().sum())

title                    0
year                     0
certificate            734
duration_min           734
genre                  249
rating                   0
description            734
stars                  734
votes                  734
type                   734
ratingLevel           8023
ratingDescription     8023
user rating size      8023
language              7749
premiere              7749
duration_min_data3    7749
type_data3            7749
dtype: int64


In [315]:
# Load the new dataset using raw string notation to avoid escape character issues
data4 = pd.read_csv('https://raw.githubusercontent.com/r41ss4/rennes_ba/refs/heads/main/data/cleaned/clean_datan4.csv')

In [316]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

print("\nData 4:")
print(data4.head())

print("\nMerge Data:")
print(final_merged_data.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

In [317]:
# Rename the column 'release_year' to 'year' from data 4
data4.rename(columns={'release_year': 'year'}, inplace=True)

In [318]:
# Drop the old 'id' and 'imdb_id'column if no longer needed
data4 = data4.drop(columns=['id', 'imdb_id'])

In [319]:
# Create the 'rating' column as the average of 'imdb_score' and 'tmdb_score', rounded to 1 decimal place
data4['rating'] = data4[['imdb_score', 'tmdb_score']].mean(axis=1).round(1)

In [320]:
# Drop the old 'id' and 'imdb_id'column if no longer needed
data4 = data4.drop(columns=['imdb_score', 'tmdb_score'])

In [321]:
# Display the updated DataFrame 4
print(data4.head())

                                 title   type  \
0  Five Came Back: The Reference Films   SHOW   
1                                Rocky  MOVIE   
2                               Grease  MOVIE   
3                            The Sting  MOVIE   
4                             Rocky II  MOVIE   

                                         description  year age_certification  \
0  This collection includes 12 World War II-era p...  1945             TV-MA   
1  When world heavyweight boxing champion, Apollo...  1976                PG   
2  Australian good girl Sandy and greaser Danny f...  1978                PG   
3  A novice con man teams up with an acknowledged...  1973                PG   
4  After Rocky goes the distance with champ Apoll...  1979                PG   

   runtime                                 genres production_countries  \
0     51.0                      ['documentation']               ['US']   
1    119.0                     ['drama', 'sport']               ['US']   
2 

In [322]:
# Remove '[' and ']' from the 'genres' column
data4['genre'] = data4['genres'].str.strip("[]").str.replace("'", "")

In [323]:
# Drop the old 'genres' column (optional)
data4 = data4.drop(columns=['genres'])

In [324]:
# Display the updated DataFrame 4
print(data4.head())

                                 title   type  \
0  Five Came Back: The Reference Films   SHOW   
1                                Rocky  MOVIE   
2                               Grease  MOVIE   
3                            The Sting  MOVIE   
4                             Rocky II  MOVIE   

                                         description  year age_certification  \
0  This collection includes 12 World War II-era p...  1945             TV-MA   
1  When world heavyweight boxing champion, Apollo...  1976                PG   
2  Australian good girl Sandy and greaser Danny f...  1978                PG   
3  A novice con man teams up with an acknowledged...  1973                PG   
4  After Rocky goes the distance with champ Apoll...  1979                PG   

   runtime production_countries  seasons  imdb_votes  tmdb_popularity  \
0     51.0               ['US']        1      2095.0            0.601   
1    119.0               ['US']        0    588100.0          106.361   
2    

In [325]:
# Rename the column 'age_certification' to 'certification' from data 4
data4.rename(columns={'age_certification' : 'certification'}, inplace=True)

In [326]:
# Remove '[' and ']' from the 'production_countries' column
data4['production_countries'] = data4['production_countries'].str.strip("[]").str.replace("'", "")

In [327]:
# Display the updated DataFrame 4
print(data4.head())

                                 title   type  \
0  Five Came Back: The Reference Films   SHOW   
1                                Rocky  MOVIE   
2                               Grease  MOVIE   
3                            The Sting  MOVIE   
4                             Rocky II  MOVIE   

                                         description  year certification  \
0  This collection includes 12 World War II-era p...  1945         TV-MA   
1  When world heavyweight boxing champion, Apollo...  1976            PG   
2  Australian good girl Sandy and greaser Danny f...  1978            PG   
3  A novice con man teams up with an acknowledged...  1973            PG   
4  After Rocky goes the distance with champ Apoll...  1979            PG   

   runtime production_countries  seasons  imdb_votes  tmdb_popularity  \
0     51.0                   US        1      2095.0            0.601   
1    119.0                   US        0    588100.0          106.361   
2    110.0                   

In [328]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

print("\nData 4:")
print(data4.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

In [329]:
# Remove '[' and ']' from the 'stars' column
data1['stars'] = data1['stars'].str.strip("[]").str.replace("'", "")

In [330]:
# Rename the column 'ratingLevel' to 'warnings' from data 2
data2.rename(columns={'ratingLevel' : 'warnings'}, inplace=True)

In [331]:
# Drop the old 'genres' column (optional) from data 3
data3 = data3.drop(columns=['premiere'])

In [332]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

print("\nData 4:")
print(data4.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

In [333]:
# Drop the old 'tmdb_popularity' column from data 4
data4 = data4.drop(columns=['tmdb_popularity'])

In [334]:
# Rename the column 'imdb_votes' to 'votes' from data 4
data4.rename(columns={'imdb_votes' : 'votes'}, inplace=True)

In [335]:
# Drop the old 'ratingDescription' column from data 2
data2 = data2.drop(columns=['ratingDescription'])

In [336]:
# Drop the old 'title_age' column from data 4
data4 = data4.drop(columns=['title_age'])

In [337]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

print("\nData 4:")
print(data4.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

In [338]:
# Clean up double commas and extra spaces in the 'stars' column on data 1
data1['stars'] = data1['stars'].str.replace(', ,', ',', regex=True).str.strip(', ').str.replace(', ,', ',')

In [339]:
# Rename the column 'certification' to 'certificate' from data 4
data4.rename(columns={'certification' : 'certificate'}, inplace=True)

In [340]:
# Rename the column 'runtime' to 'duration_min' from data 4
data4.rename(columns={'runtime' : 'duration_min'}, inplace=True)

In [341]:
# Rename the column 'user rating size' to 'votes' from data 2
data2.rename(columns={'user rating size' : 'votes'}, inplace=True)

In [342]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

print("\nData 4:")
print(data4.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

In [343]:
# Check unique values in the 'language' column
print(data3['language'].unique())

['Spanish' 'French' 'English' 'Portuguese' 'English/Mandarin'
 'English/Spanish' 'German' 'Italian' 'Korean' 'Thia/English' 'Hindi'
 'Japanese' 'Marathi' 'Swedish' 'Indonesian' 'Dutch' 'Filipino'
 'Spanish/English' 'English/Taiwanese/Mandarin' 'Georgian'
 'English/Russian' 'Spanish/Catalan' 'English/Ukranian/Russian' 'Tamil'
 'Norwegian' 'Turkish' 'Polish' 'English/Swedish' 'Bengali' 'Thai'
 'English/Korean' 'English/Akan' 'Spanish/Basque' 'English/Japanese'
 'English/Hindi']


In [344]:
# Count the occurrences of each unique value in 'language'
print(data3['language'].value_counts())

language
English                       363
Spanish                        30
Hindi                          27
French                         20
Italian                        12
Portuguese                     11
Indonesian                      7
Japanese                        6
English/Spanish                 5
German                          5
Turkish                         5
Marathi                         3
Korean                          3
Polish                          3
Dutch                           2
Filipino                        2
English/Mandarin                2
Thai                            2
English/Japanese                1
Spanish/Basque                  1
English/Akan                    1
English/Korean                  1
Bengali                         1
English/Swedish                 1
Spanish/English                 1
Norwegian                       1
Tamil                           1
English/Ukranian/Russian        1
Spanish/Catalan                 1
Engli

In [345]:
# Get a summary of the dataframe 3
print(data3.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525 entries, 0 to 524
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         525 non-null    object 
 1   genre         525 non-null    object 
 2   language      525 non-null    object 
 3   rating        525 non-null    float64
 4   duration_min  525 non-null    float64
 5   year          525 non-null    int64  
 6   type          525 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 28.8+ KB
None


In [346]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

print("\nData 4:")
print(data4.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

In [347]:
# Check unique values in the 'type' column
print(data4['type'].unique())

['SHOW' 'MOVIE']


In [348]:
# Change values in the 'type' column to have only the first letter capitalized
data4['type'] = data4['type'].str.capitalize()

In [349]:
# Display the first few rows of each dataset to understand their structure
print("Data 1:")
print(data1.head())

print("\nData 2:")
print(data2.head())

print("\nData 3:")
print(data3.head())

print("\nData 4:")
print(data4.head())

Data 1:
                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

                             

## Merge

In [351]:
# List of datasets to merge
datasets = [data1, data2, data3, data4]

In [352]:
# Concatenate the datasets
merged_data = pd.concat(datasets, ignore_index=True)

In [353]:
# Fill missing numeric values with 0
numeric_columns = merged_data.select_dtypes(include=['number']).columns
merged_data[numeric_columns] = merged_data[numeric_columns].fillna(0)

In [354]:
# Fill missing non-numeric values with 'Unknown'
non_numeric_columns = merged_data.select_dtypes(exclude=['number']).columns
merged_data[non_numeric_columns] = merged_data[non_numeric_columns].fillna('Unknown')

In [355]:
# Display the merged dataset
print(merged_data.head())

                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

0  Bob Odenkirk, Rhea Seehorn, Jonath

## Second EDA

In [357]:
# Show the first few rows of the dataset
print(merged_data.head())

                    title  year certificate  duration_min  \
0        Better Call Saul  2015       TV-MA          46.0   
1  Cyberpunk: Edgerunners  2022       TV-MA          24.0   
2                    1899  2022       TV-MA          60.0   
3          Grey's Anatomy  2005       TV-14          41.0   
4    The Accidental Narco  2022       TV-MA          50.0   

                          genre  rating  \
0                  Crime, Drama     8.9   
1  Animation, Action, Adventure     8.6   
2        Drama, History, Horror     9.6   
3                Drama, Romance     7.6   
4      Action, Biography, Crime     7.4   

                                         description  \
0  The trials and tribulations of criminal lawyer...   
1  A Street Kid trying to survive in a technology...   
2  Multinational immigrants traveling from the ol...   
3  A drama centered on the personal and professio...   
4  A civilian businessman who has no choice but t...   

0  Bob Odenkirk, Rhea Seehorn, Jonath

In [358]:
# Get a summary of the dataset, including data types and non-null counts
print(merged_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14357 entries, 0 to 14356
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 14357 non-null  object 
 1   year                  14357 non-null  int64  
 2   certificate           14357 non-null  object 
 3   duration_min          14357 non-null  float64
 4   genre                 14357 non-null  object 
 5   rating                14357 non-null  float64
 6   description           14357 non-null  object 
 7   stars                 14357 non-null  object 
 8   votes                 14357 non-null  float64
 9   type                  14357 non-null  object 
 11  language              14357 non-null  object 
 12  production_countries  14357 non-null  object 
 13  seasons               14357 non-null  float64
dtypes: float64(4), int64(1), object(9)
memory usage: 1.5+ MB
None


In [359]:
# Show basic statistics for numeric columns
print(merged_data.describe())

               year  duration_min        rating         votes       seasons
count  14357.000000  14357.000000  14357.000000  1.435700e+04  14357.000000
mean    1934.206032     77.977728      6.551478  1.875788e+04      0.331476
std      399.085793     42.965932      1.069030  7.876891e+04      1.328092
min        0.000000      0.000000      1.700000  0.000000e+00      0.000000
25%     2015.000000     47.000000      6.000000  3.940000e+02      0.000000
50%     2018.000000     80.573640      6.600000  2.095000e+03      0.000000
75%     2020.000000    101.000000      7.300000  1.077500e+04      0.000000
max     2025.000000    990.000000      9.800000  2.684317e+06     44.000000


In [360]:
# Check for missing values in the dataset
print(merged_data.isnull().sum())

title                   0
year                    0
certificate             0
duration_min            0
genre                   0
rating                  0
description             0
stars                   0
votes                   0
type                    0
language                0
production_countries    0
seasons                 0
dtype: int64


In [361]:
# Find and remove duplicate rows
duplicates = merged_data[merged_data.duplicated()]

In [362]:
# Count duplicates
duplicate_count = merged_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


In [363]:
# Display the duplicated rows
duplicates = merged_data[merged_data.duplicated()]
print(duplicates)

Empty DataFrame
Index: []


In [364]:
# Display all rows that are duplicates (including the first occurrence)
duplicates_inclusive = merged_data[merged_data.duplicated(keep=False)]
print(duplicates_inclusive)

Empty DataFrame
Index: []


In [365]:
# Check duplicate rows based on specific columns (if you suspect certain columns)
duplicates_by_columns = merged_data[merged_data.duplicated(subset=['title'])]
print(duplicates_by_columns)

                 title  year certificate  duration_min  \
7542            Naruto  2008     Unknown           0.0   
7550       The Do-Over  2016     Unknown           0.0   
7552                3%  2016     Unknown           0.0   
7556        White Girl  2016     Unknown           0.0   
7563           Pandora  2016     Unknown           0.0   
...                ...   ...         ...           ...   
14308         Infiesto  2023     Unknown          96.0   
14326       Dear David  2023       PG-13         118.0   
14327  Call Me Chihiro  2023       PG-13         131.0   
14337       Rana Naidu  2023       TV-MA          48.0   
14342   Princess Power  2023        TV-Y          15.0   

                                    genre  rating  \
7542                              Unknown     8.8   
7550                              Unknown     8.4   
7552                              Unknown     7.7   
7556                              Unknown     7.7   
7563                              Unkn

In [366]:
# Define a function to count the non-missing values
def count_non_missing(row):
    # Replace 'Unknown' and 0 with NaN for easier counting of valid values
    return row.replace(['Unknown', 0], [pd.NA, pd.NA]).notna().sum()

In [367]:
# Apply the function to each row and create a new column 'non_missing_count'
merged_data['non_missing_count'] = merged_data.apply(count_non_missing, axis=1)

In [368]:
# Sort the data first by 'non_missing_count' in descending order, then by 'votes' in descending order
merged_data = merged_data.sort_values(by=['non_missing_count', 'votes'], ascending=[False, False])

In [369]:
# Remove duplicates based on the 'title' and 'year' columns, keeping the first (best) row
merged_data_cleaned = merged_data.drop_duplicates(subset=['title', 'year'], keep='first')

In [370]:
# Drop the helper 'non_missing_count' column
merged_data_cleaned = merged_data_cleaned.drop(columns=['non_missing_count'])

In [371]:
# Display the cleaned dataset
print(merged_data_cleaned)

                         title  year certificate  duration_min  \
8494              Breaking Bad  2008       TV-MA          48.0   
9215           Stranger Things  2016       TV-14          61.0   
8495          The Walking Dead  2010       TV-MA          46.0   
8978            Peaky Blinders  2013       TV-MA          58.0   
8831              Black Mirror  2011       TV-MA          59.0   
...                        ...   ...         ...           ...   
7773               Blank Check  1994     Unknown           0.0   
7774              Heavyweights  1995     Unknown           0.0   
7777      D2: The Mighty Ducks  1994     Unknown           0.0   
7778  Honey, I Shrunk the Kids  1989     Unknown           0.0   
7782       H2O: Just Add Water  2009     Unknown           0.0   

                                        genre  rating  \
8494           drama, comedy, crime, thriller     9.2   
9215  drama, scifi, thriller, fantasy, horror     8.6   
8495          action, drama, horror,

In [372]:
# Check for missing values in the dataset
print(merged_data_cleaned.isnull().sum())

title                   0
year                    0
certificate             0
duration_min            0
genre                   0
rating                  0
description             0
stars                   0
votes                   0
type                    0
language                0
production_countries    0
seasons                 0
dtype: int64


In [373]:
# Check unique values in the 'genre' column
print(merged_data_cleaned['genre'].unique())

['drama, comedy, crime, thriller'
 'drama, scifi, thriller, fantasy, horror'
 'action, drama, horror, thriller' ... 'Drama/Horror' 'Making-of'
 'Unknown']


In [374]:
# Capitalize the first letter of all string values in the dataset
for col in merged_data_cleaned.select_dtypes(include=['object']).columns:
    merged_data_cleaned[col] = merged_data_cleaned[col].str.title()

In [375]:
# Display the first few rows to verify the changes
print(merged_data_cleaned.head())

                 title  year certificate  duration_min  \
8494      Breaking Bad  2008       Tv-Ma          48.0   
9215   Stranger Things  2016       Tv-14          61.0   
8495  The Walking Dead  2010       Tv-Ma          46.0   
8978    Peaky Blinders  2013       Tv-Ma          58.0   
8831      Black Mirror  2011       Tv-Ma          59.0   

                                        genre  rating  \
8494           Drama, Comedy, Crime, Thriller     9.2   
9215  Drama, Scifi, Thriller, Fantasy, Horror     8.6   
8495          Action, Drama, Horror, Thriller     8.1   
8978                   Drama, Crime, European     8.7   
8831         Scifi, Drama, Thriller, European     8.6   

                                            description    stars      votes  \
8494  When Walter White, A New Mexico Chemistry Teac...  Unknown  1936461.0   
9215  When A Young Boy Vanishes, A Small Town Uncove...  Unknown  1220079.0   
8495  Sheriff'S Deputy Rick Grimes Awakens From A Co...  Unknown  10132

In [376]:
# Get the frequency distribution of the 'genre' column
genre_frequency = merged_data_cleaned['genre'].value_counts()

# Display the frequency distribution
print(genre_frequency)

genre
Comedy                                               994
Drama                                                696
Documentary                                          388
Drama, Romance                                       291
Comedy, Drama                                        260
                                                    ... 
Drama, Family, Animation, Comedy, Action               1
Scifi, Thriller                                        1
Action, Animation, Drama, Fantasy, Family, Comedy      1
Drama, Thriller, Crime, Romance                        1
Family, Animation, Action, Comedy, Crime, Scifi        1
Name: count, Length: 1917, dtype: int64


In [377]:
# Calculate percentage for each genre
genre_percentages = (genre_frequency / genre_frequency.sum()) * 100

# Combine frequency and percentage into a single DataFrame
genre_distribution = pd.DataFrame({
    'Frequency': genre_frequency,
    'Percentage (%)': genre_percentages.round(2)
})

In [378]:
# Display the frequency distribution with percentage
print(genre_distribution)

                                                   Frequency  Percentage (%)
genre                                                                       
Comedy                                                   994            9.10
Drama                                                    696            6.37
Documentary                                              388            3.55
Drama, Romance                                           291            2.66
Comedy, Drama                                            260            2.38
...                                                      ...             ...
Drama, Family, Animation, Comedy, Action                   1            0.01
Scifi, Thriller                                            1            0.01
Action, Animation, Drama, Fantasy, Family, Comedy          1            0.01
Drama, Thriller, Crime, Romance                            1            0.01
Family, Animation, Action, Comedy, Crime, Scifi            1            0.01

In [379]:
# Convert values in 'production_countries' and 'certificate' columns to uppercase
merged_data_cleaned['production_countries'] = merged_data_cleaned['production_countries'].str.upper()
merged_data_cleaned['certificate'] = merged_data_cleaned['certificate'].str.upper()

In [380]:
# Display the first few rows to verify the changes
print(merged_data_cleaned[['production_countries', 'certificate']].head())

     production_countries certificate
8494                   US       TV-MA
9215                   US       TV-14
8495                   US       TV-MA
8978                   GB       TV-MA
8831                   GB       TV-MA


In [381]:
# Display the first few rows to verify the changes
print(merged_data_cleaned.head())

                 title  year certificate  duration_min  \
8494      Breaking Bad  2008       TV-MA          48.0   
9215   Stranger Things  2016       TV-14          61.0   
8495  The Walking Dead  2010       TV-MA          46.0   
8978    Peaky Blinders  2013       TV-MA          58.0   
8831      Black Mirror  2011       TV-MA          59.0   

                                        genre  rating  \
8494           Drama, Comedy, Crime, Thriller     9.2   
9215  Drama, Scifi, Thriller, Fantasy, Horror     8.6   
8495          Action, Drama, Horror, Thriller     8.1   
8978                   Drama, Crime, European     8.7   
8831         Scifi, Drama, Thriller, European     8.6   

                                            description    stars      votes  \
8494  When Walter White, A New Mexico Chemistry Teac...  Unknown  1936461.0   
9215  When A Young Boy Vanishes, A Small Town Uncove...  Unknown  1220079.0   
8495  Sheriff'S Deputy Rick Grimes Awakens From A Co...  Unknown  10132

In [382]:
# Update the 'description' column to follow standard paragraph capitalization
merged_data_cleaned['description'] = merged_data_cleaned['description'].apply(
    lambda x: x.capitalize() if isinstance(x, str) else x
)

In [383]:
# Display the first few rows to verify the changes
print(merged_data_cleaned['description'].head())

8494    When walter white, a new mexico chemistry teac...
9215    When a young boy vanishes, a small town uncove...
8495    Sheriff's deputy rick grimes awakens from a co...
8978    A gangster family epic set in 1919 birmingham,...
8831    A contemporary british re-working of the twili...
Name: description, dtype: object


In [384]:
# Display the first few rows to verify the changes
print(merged_data_cleaned.head())

                 title  year certificate  duration_min  \
8494      Breaking Bad  2008       TV-MA          48.0   
9215   Stranger Things  2016       TV-14          61.0   
8495  The Walking Dead  2010       TV-MA          46.0   
8978    Peaky Blinders  2013       TV-MA          58.0   
8831      Black Mirror  2011       TV-MA          59.0   

                                        genre  rating  \
8494           Drama, Comedy, Crime, Thriller     9.2   
9215  Drama, Scifi, Thriller, Fantasy, Horror     8.6   
8495          Action, Drama, Horror, Thriller     8.1   
8978                   Drama, Crime, European     8.7   
8831         Scifi, Drama, Thriller, European     8.6   

                                            description    stars      votes  \
8494  When walter white, a new mexico chemistry teac...  Unknown  1936461.0   
9215  When a young boy vanishes, a small town uncove...  Unknown  1220079.0   
8495  Sheriff's deputy rick grimes awakens from a co...  Unknown  10132

In [385]:
# Display the info of the data
print(merged_data_cleaned.info())

<class 'pandas.core.frame.DataFrame'>
Index: 10925 entries, 8494 to 7782
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 10925 non-null  object 
 1   year                  10925 non-null  int64  
 2   certificate           10925 non-null  object 
 3   duration_min          10925 non-null  float64
 4   genre                 10925 non-null  object 
 5   rating                10925 non-null  float64
 6   description           10925 non-null  object 
 7   stars                 10925 non-null  object 
 8   votes                 10925 non-null  float64
 9   type                  10925 non-null  object 
 11  language              10925 non-null  object 
 12  production_countries  10925 non-null  object 
 13  seasons               10925 non-null  float64
dtypes: float64(4), int64(1), object(9)
memory usage: 1.3+ MB
None


In [489]:
merged_data_cleaned

Unnamed: 0,title,year,certificate,duration_min,genre,rating,description,stars,votes,type,warnings,language,production_countries,seasons
8494,Breaking Bad,2008,TV-MA,48.0,"Drama, Comedy, Crime, Thriller",9.2,"When walter white, a new mexico chemistry teac...",Unknown,1936461.0,Show,Unknown,Unknown,US,5.0
9215,Stranger Things,2016,TV-14,61.0,"Drama, Scifi, Thriller, Fantasy, Horror",8.6,"When a young boy vanishes, a small town uncove...",Unknown,1220079.0,Show,Unknown,Unknown,US,5.0
8495,The Walking Dead,2010,TV-MA,46.0,"Action, Drama, Horror, Thriller",8.1,Sheriff's deputy rick grimes awakens from a co...,Unknown,1013253.0,Show,Unknown,Unknown,US,11.0
8978,Peaky Blinders,2013,TV-MA,58.0,"Drama, Crime, European",8.7,"A gangster family epic set in 1919 birmingham,...",Unknown,567949.0,Show,Unknown,Unknown,GB,6.0
8831,Black Mirror,2011,TV-MA,59.0,"Scifi, Drama, Thriller, European",8.6,A contemporary british re-working of the twili...,Unknown,560631.0,Show,Unknown,Unknown,GB,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7773,Blank Check,1994,UNKNOWN,0.0,Unknown,9.3,Unknown,Unknown,80.0,Unknown,Parental Guidance Suggested. May Not Be Suitab...,Unknown,UNKNOWN,0.0
7774,Heavyweights,1995,UNKNOWN,0.0,Unknown,7.4,Unknown,Unknown,80.0,Unknown,Some Rude Language And Pranks,Unknown,UNKNOWN,0.0
7777,D2: The Mighty Ducks,1994,UNKNOWN,0.0,Unknown,7.0,Unknown,Unknown,80.0,Unknown,Some Mild Language And Rough Hockey Action,Unknown,UNKNOWN,0.0
7778,"Honey, I Shrunk The Kids",1989,UNKNOWN,0.0,Unknown,8.0,Unknown,Unknown,80.0,Unknown,Parental Guidance Suggested. May Not Be Suitab...,Unknown,UNKNOWN,0.0


In [491]:
# Save the cleaned dataset to a CSV file
merged_data_cleaned.to_csv('merged_data.csv', index=False)