## Cleaning and Merging the datasets

In [3]:
# Import necessary libraries
import pandas as pd

In [8]:
# Read the three pre-cleaned CSV exports into DataFrames.
# Raw string literals keep the Windows backslashes from being read as escapes.
# NOTE(review): these are absolute paths on a single machine; the notebook
# cannot run elsewhere without editing them.
data1 = pd.read_csv(filepath_or_buffer=r'C:\Users\saman\Downloads\FP_BA_Netflix\clean_datan1.csv')
data2 = pd.read_csv(filepath_or_buffer=r'C:\Users\saman\Downloads\FP_BA_Netflix\clean_datan2.csv')
data3 = pd.read_csv(filepath_or_buffer=r'C:\Users\saman\Downloads\FP_BA_Netflix\clean_datan3.csv')

In [10]:
# Print the leading five rows of every dataset, each under its own label,
# to see which columns the three sources have.
for label, frame in (("Data 1:", data1), ("\nData 2:", data2), ("\nData 3:", data3)):
    print(label)
    print(frame.head())

Data 1:
                    title  year certificate  duration_min  \
0               Cobra Kai  2018       TV-14          30.0   
1               The Crown  2016       TV-MA          58.0   
2        Better Call Saul  2015       TV-MA          46.0   
3           Devil in Ohio  2022       TV-MA         356.0   
4  Cyberpunk: Edgerunners  2022       TV-MA          24.0   

                          genre  rating  \
0         Action, Comedy, Drama     8.5   
1     Biography, Drama, History     8.7   
2                  Crime, Drama     8.9   
3        Drama, Horror, Mystery     5.9   
4  Animation, Action, Adventure     8.6   

                                         description  \
0  Decades after their 1984 All Valley Karate Tou...   
1  Follows the political rivalries and romance of...   
2  The trials and tribulations of criminal lawyer...   
3  When a psychiatrist shelters a mysterious cult...   
4  A Street Kid trying to survive in a technology...   

                             

In [12]:
# Parse data3['year'] into datetimes; anything unparseable becomes NaT.
#
# pd.to_datetime reads bare integers as nanoseconds since the Unix epoch, so a
# column that already holds plain years (e.g. 2020) would silently collapse to
# 1970. A re-run hits exactly that case, because a later cell writes the
# extracted years back to clean_datan3.csv. Numeric years are therefore parsed
# with an explicit '%Y' format; any other dtype keeps the general parser.
if pd.api.types.is_numeric_dtype(data3['year']):
    # Int64 keeps missing values as <NA> and drops a trailing ".0" from floats,
    # so '%Y' only ever sees text like '2020' (or '<NA>', which coerces to NaT).
    year_text = data3['year'].round().astype('Int64').astype(str)
    data3['year'] = pd.to_datetime(year_text, format='%Y', errors='coerce')
else:
    data3['year'] = pd.to_datetime(data3['year'], errors='coerce')

In [14]:
# Reduce each datetime to its calendar year, stored as text (e.g. '2020').
# When the column contains NaT, .dt.year is float, and a direct astype(str)
# would produce '2020.0' and 'nan'. Going through the nullable Int64 dtype
# keeps valid years as whole numbers; missing ones become the text '<NA>'.
data3['year'] = data3['year'].dt.year.astype('Int64').astype(str)

In [16]:
# Persist data3 with its reduced 'year' column, without the index column.
# NOTE(review): this is the same path data3 was loaded from, so the original
# input file is overwritten.
data3.to_csv(
    path_or_buf=r'C:\Users\saman\Downloads\FP_BA_Netflix\clean_datan3.csv',
    index=False,
)

In [18]:
# Spot-check the first five titles alongside their extracted year.
print(data3.loc[:, ['title', 'year']].head(5))

               title  year
0   Notes for My Son  2020
1   To Each, Her Own  2018
2      The Lovebirds  2020
3     The Perfection  2019
4  Happy Anniversary  2018


In [20]:
# Derive data2['rating'] as 'user rating score' divided by 10, rounded to one
# decimal place.
# The guard makes the cell safe to re-run: 'user rating score' is dropped in a
# later cell and the trimmed frame is written back over clean_datan2.csv, so on
# a fresh run the source column may already be gone and 'rating' already there.
if 'user rating score' in data2.columns:
    data2['rating'] = (data2['user rating score'] / 10).round(1)

In [22]:
# Remove the raw 'user rating score' column from data2.
# errors='ignore' turns this into a no-op when the column is already absent
# (for example when the cell is re-run, or when the CSV was saved without it),
# instead of raising KeyError.
data2 = data2.drop(columns=['user rating score'], errors='ignore')

In [24]:
# Persist data2 with the derived 'rating' column, without the index column.
# NOTE(review): this is the same path data2 was loaded from, so the original
# input file is overwritten.
data2.to_csv(
    path_or_buf=r'C:\Users\saman\Downloads\FP_BA_Netflix\clean_datan2.csv',
    index=False,
)

In [26]:
# Spot-check the first five titles alongside their new 'rating' value.
print(data2.loc[:, ['title', 'rating']].head(5))

                   title  rating
0           White Chicks     8.2
1    Lucky Number Slevin     8.1
2         Grey's Anatomy     9.8
3           Prison Break     9.8
4  How I Met Your Mother     9.4


In [28]:
# Give data3's 'imdb_score' column the name 'rating', which the later merge
# uses as one of its join keys.
data3 = data3.rename(columns={'imdb_score': 'rating'})

In [30]:
# Persist data3 with the renamed 'rating' column, without the index column.
# NOTE(review): this writes over the file data3 was originally loaded from.
data3.to_csv(
    path_or_buf=r'C:\Users\saman\Downloads\FP_BA_Netflix\clean_datan3.csv',
    index=False,
)

In [32]:
# Show the first five rows of data3 to confirm the column is now 'rating'.
print(data3.head(n=5))

               title            genre language  rating    premiere  runtime  \
0   Notes for My Son            Drama  Spanish     6.3  11/24/2020       83   
1   To Each, Her Own  Romantic comedy   French     5.3   6/24/2018       95   
2      The Lovebirds  Romantic comedy  English     6.1   5/22/2020       87   
3     The Perfection  Horror-thriller  English     6.1   5/24/2019       90   
4  Happy Anniversary  Romantic comedy  English     5.8   3/30/2018       78   

   year  
0  2020  
1  2018  
2  2020  
3  2019  
4  2018  


In [40]:
# Give data2's 'release year' column the name 'year', which the later merge
# uses as one of its join keys.
data2 = data2.rename(columns={'release year': 'year'})

In [42]:
# Persist data2 with the renamed 'year' column, without the index column.
# NOTE(review): this writes over the file data2 was originally loaded from.
data2.to_csv(
    path_or_buf=r'C:\Users\saman\Downloads\FP_BA_Netflix\clean_datan2.csv',
    index=False,
)

In [44]:
# Show the first five rows of data2 to confirm the column is now 'year'.
print(data2.head(n=5))

                   title  rating  \
0           White Chicks     8.2   
1    Lucky Number Slevin     8.1   
2         Grey's Anatomy     9.8   
3           Prison Break     9.8   
4  How I Met Your Mother     9.4   

                                         ratingLevel  ratingDescription  year  \
0  crude and sexual humor, language and some drug...                 80  2004   
1  strong violence, sexual content and adult lang...                100  2006   
2  Parents strongly cautioned. May be unsuitable ...                 90  2016   
3  Parents strongly cautioned. May be unsuitable ...                 90  2008   
4  Parental guidance suggested. May not be suitab...                 70  2014   

   user rating size  
0                80  
1                82  
2                80  
3                80  
4                80  


In [46]:
# Re-inspect the leading five rows of all three datasets after the renames,
# to check that the shared columns now line up.
labelled_frames = (
    ("Data 1:", data1),
    ("\nData 2:", data2),
    ("\nData 3:", data3),
)
for label, frame in labelled_frames:
    print(label)
    print(frame.head())

Data 1:
                    title  year certificate  duration_min  \
0               Cobra Kai  2018       TV-14          30.0   
1               The Crown  2016       TV-MA          58.0   
2        Better Call Saul  2015       TV-MA          46.0   
3           Devil in Ohio  2022       TV-MA         356.0   
4  Cyberpunk: Edgerunners  2022       TV-MA          24.0   

                          genre  rating  \
0         Action, Comedy, Drama     8.5   
1     Biography, Drama, History     8.7   
2                  Crime, Drama     8.9   
3        Drama, Horror, Mystery     5.9   
4  Animation, Action, Adventure     8.6   

                                         description  \
0  Decades after their 1984 All Valley Karate Tou...   
1  Follows the political rivalries and romance of...   
2  The trials and tribulations of criminal lawyer...   
3  When a psychiatrist shelters a mysterious cult...   
4  A Street Kid trying to survive in a technology...   

                             

In [50]:
# Report, for each dataset, how many missing values every column holds.
for label, frame in (("Data 1:", data1), ("\nData 2:", data2), ("\nData 3:", data3)):
    print(label)
    print(frame.isna().sum())

Data 1:
title           0
year            0
certificate     0
duration_min    0
genre           0
rating          0
description     0
stars           0
votes           0
dtype: int64

Data 2:
title                0
rating               0
ratingLevel          0
ratingDescription    0
year                 0
user rating size     0
dtype: int64

Data 3:
title       0
genre       0
language    0
rating      0
premiere    0
runtime     0
year        0
dtype: int64


In [52]:
# Step 1: outer-join Dataset 1 with Dataset 2, keeping rows from either side.
# Rows pair up only when 'title', 'year' and 'rating' all agree; any other
# column name present in both frames gets a _data1 / _data2 suffix.
# NOTE(review): 'rating' is a float, and data2's values were derived from
# 'user rating score' / 10 - confirm they are comparable to data1's ratings
# before relying on this column as a join key.
merged_1_2 = data1.merge(
    data2,
    how='outer',
    on=['title', 'year', 'rating'],
    suffixes=('_data1', '_data2'),
)

In [54]:
# Cast 'year' to a plain integer in both frames taking part in the next merge,
# so the join key has the same dtype on each side.
for frame in (merged_1_2, data3):
    frame['year'] = frame['year'].astype(int)

In [56]:
# Step 2: outer-join the Step 1 result with Dataset 3.
# Rows pair up only when 'title', 'year', 'rating' and 'genre' all agree.
# Overlapping non-key columns keep their name on the left side and get a
# _data3 suffix on the right.
final_merged_data = merged_1_2.merge(
    data3,
    how='outer',
    on=['title', 'year', 'rating', 'genre'],
    suffixes=('', '_data3'),
)

In [58]:
# Write the fully merged frame to disk, without the index column.
output_path = r'C:\Users\saman\Downloads\FP_BA_Netflix\final_merged_data.csv'
final_merged_data.to_csv(path_or_buf=output_path, index=False)

In [60]:
# Show the first five rows of the merged result.
print(final_merged_data.head(n=5))

                           title  year certificate  duration_min  \
0                       #ABtalks  2018       TV-MA      73.77162   
1                         #Alive  2020       TV-MA      98.00000   
2  #AnneFrank - Parallel Stories  2019       TV-MA      92.00000   
3                       #BlackAF  2020       TV-MA      36.00000   
4            #FriendButMarried 2  2020       TV-14     100.00000   

                      genre  rating  \
0                    Comedy     9.1   
1     Action, Drama, Horror     6.3   
2            Drama, History     6.5   
3                    Comedy     6.7   
4  Biography, Comedy, Drama     6.9   

                                         description  \
0  #ABtalks is an interview show hosted by Anas B...   
1  The rapid spread of an unknown infection has l...   
2  It is based on five women who did survive the ...   
3  A father takes an irreverent and honest approa...   
4  Ayudia (Mawar De Jongh) is not satisfied enoug...   

                   

In [62]:
# Count the missing values per column of the merged frame. An outer join
# leaves gaps wherever a row came from only one of the sources.
print(final_merged_data.isna().sum())

title                    0
year                     0
certificate           1092
duration_min          1092
genre                  498
rating                   0
description           1092
stars                 1092
votes                 1092
ratingLevel          10550
ratingDescription    10550
user rating size     10550
language             10412
premiere             10412
runtime              10412
dtype: int64


In [64]:
# Strip every literal '#' character out of the titles. regex=False treats the
# pattern as plain text rather than a regular expression.
title_without_hash = final_merged_data['title'].str.replace(pat='#', repl='', regex=False)
final_merged_data['title'] = title_without_hash

In [66]:
# Re-write the merged CSV so it carries the cleaned titles (no index column).
output_path = r'C:\Users\saman\Downloads\FP_BA_Netflix\final_merged_data.csv'
final_merged_data.to_csv(path_or_buf=output_path, index=False)

In [68]:
# Show the first five rows to confirm the '#' is gone from the titles.
print(final_merged_data.iloc[:5])

                          title  year certificate  duration_min  \
0                       ABtalks  2018       TV-MA      73.77162   
1                         Alive  2020       TV-MA      98.00000   
2  AnneFrank - Parallel Stories  2019       TV-MA      92.00000   
3                       BlackAF  2020       TV-MA      36.00000   
4            FriendButMarried 2  2020       TV-14     100.00000   

                      genre  rating  \
0                    Comedy     9.1   
1     Action, Drama, Horror     6.3   
2            Drama, History     6.5   
3                    Comedy     6.7   
4  Biography, Comedy, Drama     6.9   

                                         description  \
0  #ABtalks is an interview show hosted by Anas B...   
1  The rapid spread of an unknown infection has l...   
2  It is based on five women who did survive the ...   
3  A father takes an irreverent and honest approa...   
4  Ayudia (Mawar De Jongh) is not satisfied enoug...   

                         

In [72]:
# Strip every literal '#' character out of the descriptions. regex=False
# treats the pattern as plain text rather than a regular expression.
description_col = final_merged_data['description']
final_merged_data['description'] = description_col.str.replace('#', '', regex=False)

In [74]:
# Re-write the merged CSV so it carries the cleaned descriptions (no index column).
output_path = r'C:\Users\saman\Downloads\FP_BA_Netflix\final_merged_data.csv'
final_merged_data.to_csv(path_or_buf=output_path, index=False)

In [76]:
# Show the first five rows to confirm the '#' is gone from the descriptions.
print(final_merged_data.iloc[:5])

                          title  year certificate  duration_min  \
0                       ABtalks  2018       TV-MA      73.77162   
1                         Alive  2020       TV-MA      98.00000   
2  AnneFrank - Parallel Stories  2019       TV-MA      92.00000   
3                       BlackAF  2020       TV-MA      36.00000   
4            FriendButMarried 2  2020       TV-14     100.00000   

                      genre  rating  \
0                    Comedy     9.1   
1     Action, Drama, Horror     6.3   
2            Drama, History     6.5   
3                    Comedy     6.7   
4  Biography, Comedy, Drama     6.9   

                                         description  \
0  ABtalks is an interview show hosted by Anas Bu...   
1  The rapid spread of an unknown infection has l...   
2  It is based on five women who did survive the ...   
3  A father takes an irreverent and honest approa...   
4  Ayudia (Mawar De Jongh) is not satisfied enoug...   

                         

## Exploratory Data Analysis (EDA)

In [79]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [81]:
# Start the EDA with a look at the first five rows of the merged frame.
print(final_merged_data.head(n=5))

                          title  year certificate  duration_min  \
0                       ABtalks  2018       TV-MA      73.77162   
1                         Alive  2020       TV-MA      98.00000   
2  AnneFrank - Parallel Stories  2019       TV-MA      92.00000   
3                       BlackAF  2020       TV-MA      36.00000   
4            FriendButMarried 2  2020       TV-14     100.00000   

                      genre  rating  \
0                    Comedy     9.1   
1     Action, Drama, Horror     6.3   
2            Drama, History     6.5   
3                    Comedy     6.7   
4  Biography, Comedy, Drama     6.9   

                                         description  \
0  ABtalks is an interview show hosted by Anas Bu...   
1  The rapid spread of an unknown infection has l...   
2  It is based on five women who did survive the ...   
3  A father takes an irreverent and honest approa...   
4  Ayudia (Mawar De Jongh) is not satisfied enoug...   

                         

In [83]:
# Summarise the merged frame: column dtypes, non-null counts and memory use.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
final_merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11053 entries, 0 to 11052
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              11053 non-null  object 
 1   year               11053 non-null  int32  
 2   certificate        9961 non-null   object 
 3   duration_min       9961 non-null   float64
 4   genre              10555 non-null  object 
 5   rating             11053 non-null  float64
 6   description        9961 non-null   object 
 7   stars              9961 non-null   object 
 8   votes              9961 non-null   float64
 9   ratingLevel        503 non-null    object 
 10  ratingDescription  503 non-null    float64
 11  user rating size   503 non-null    float64
 12  language           641 non-null    object 
 13  premiere           641 non-null    object 
 14  runtime            641 non-null    float64
dtypes: float64(6), int32(1), object(8)
memory usage: 1.2+ MB
None


In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the merged frame.
summary_stats = final_merged_data.describe()
print(summary_stats)

               year  duration_min        rating         votes  \
count  11053.000000   9961.000000  11053.000000  9.961000e+03   
mean    1900.670678     73.784461      6.799129  1.951831e+04   
std      467.768037     44.843359      1.165370  8.224157e+04   
min        0.000000      1.000000      1.700000  5.000000e+00   
25%     2014.000000     46.000000      6.200000  3.580000e+02   
50%     2018.000000     73.771620      6.764515  1.745000e+03   
75%     2020.000000     93.000000      7.600000  1.506200e+04   
max     2025.000000    990.000000      9.900000  1.844075e+06   

       ratingDescription  user rating size     runtime  
count         503.000000        503.000000  641.000000  
mean           68.880716         80.966203   93.371295  
std            31.453464          0.998430   28.071160  
min            10.000000         80.000000    4.000000  
25%            41.000000         80.000000   86.000000  
50%            70.000000         80.000000   97.000000  
75%            

In [87]:
# Count the missing values in each column of the merged frame.
print(final_merged_data.isna().sum())

title                    0
year                     0
certificate           1092
duration_min          1092
genre                  498
rating                   0
description           1092
stars                 1092
votes                 1092
ratingLevel          10550
ratingDescription    10550
user rating size     10550
language             10412
premiere             10412
runtime              10412
dtype: int64


In [89]:
# Count the rows that repeat an earlier row exactly (every column equal).
duplicate_count = final_merged_data.duplicated().sum()
print(duplicate_count)

58


In [93]:
# Keep only the rows flagged as exact repeats of an earlier row.
is_repeat = final_merged_data.duplicated()
duplicates = final_merged_data.loc[is_repeat]

In [95]:
# Print the repeated rows so they can be inspected before deciding what to drop.
print(str(duplicates))

                                                  title  year certificate  \
46                                                 13th  2016         NaN   
63                                                 1922  2017         NaN   
122                                    5 Star Christmas  2018         NaN   
249                                       A Secret Love  2020         NaN   
298                                            AK vs AK  2020         NaN   
665                                    Army of the Dead  2021         NaN   
979                                 Beasts of No Nation  2015         NaN   
1997                                    Crazy About Her  2021         NaN   
2040                            Cuba and the Cameraman   2017         NaN   
2099             Dance Dreams: Hot Chocolate Nutcracker  2020         NaN   
2189           David Attenborough: A Life on Our Planet  2020         NaN   
2288                         Deidra & Laney Rob a Train  2017         NaN   