In [4]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi

# Kaggle dataset identifier and the local folder that will receive its files
dataset_name = "octopusteam/full-netflix-dataset"  # Swap in a different Kaggle dataset identifier if needed
save_path = r"C:\Users\Shashank Reddy\Desktop\Project\Data"

# Create the Kaggle API client and authenticate it
api = KaggleApi()
api.authenticate()

# Create the destination folder if needed (exist_ok=True: no error when it is already there)
os.makedirs(save_path, exist_ok=True)

# Fetch the dataset into the folder, unzipping the downloaded archive
api.dataset_download_files(dataset_name, path=save_path, unzip=True)

print(f"Dataset downloaded and saved in {save_path}")


Dataset URL: https://www.kaggle.com/datasets/octopusteam/full-netflix-dataset
Dataset downloaded and saved in C:\Users\Shashank Reddy\Desktop\Project\Data


In [6]:
# Importing essential packages
import pandas as pd  # For data cleaning and manipulation
import numpy as np  # For numerical computations

# Visualization libraries
import matplotlib.pyplot as plt  # For creating visualizations
import seaborn as sns  # For enhanced statistical visualizations

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Global plotting defaults, applied once for the whole notebook.
# NOTE(review): sns.set_theme also updates matplotlib's rcParams, so it may
# override parts of the 'ggplot' style set just before it -- confirm which
# look is actually intended.
plt.style.use("ggplot")  # matplotlib base style
sns.set_theme(style='whitegrid')  # seaborn theme

In [7]:
# Read the downloaded CSV into a DataFrame
raw_csv_path = r"C:\Users\Shashank Reddy\Desktop\Project\Data\data.csv"
df = pd.read_csv(raw_csv_path)

# Show the loaded frame as the cell output
df

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,519811.0,"AT, CH, DE"
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,tt0266697,8.2,1230147.0,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CI, CM, CZ..."
2,Jarhead,movie,"Biography, Drama, War",2005.0,tt0418763,7.0,212975.0,"AD, AE, AG, AO, BH, BM, BR, BS, BZ, CI, CM, CO..."
3,Unforgiven,movie,"Drama, Western",1992.0,tt0105695,8.2,447093.0,"AU, BA, BG, CZ, HR, HU, MD, ME, MK, NZ, PL, RO..."
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004.0,tt0338013,8.3,1115716.0,"AD, AE, AG, AL, AO, AR, AU, AZ, BA, BB, BE, BG..."
...,...,...,...,...,...,...,...,...
20598,,tv,,2024.0,,,,KR
20599,,tv,,,,,,KR
20600,,tv,"Drama, Sci-Fi & Fantasy",2024.0,,,,TH
20601,,tv,,2022.0,,,,"PH, SG"


Data Cleaning

In [11]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

title                  577
type                     0
genres                 318
releaseYear             24
imdbId                1438
imdbAverageRating     1616
imdbNumVotes          1616
availableCountries       0
dtype: int64


In [13]:
# Remove every row that has a missing value in any column
df = df.dropna()

# Confirm the result. DataFrame.info() prints its summary itself and returns
# None, so it is called directly -- wrapping it in print() emitted a stray
# "None" line after the report.
df.info()

# Show the first few rows of the cleaned data
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 18987 entries, 0 to 20585
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               18987 non-null  object 
 1   type                18987 non-null  object 
 2   genres              18987 non-null  object 
 3   releaseYear         18987 non-null  float64
 4   imdbId              18987 non-null  object 
 5   imdbAverageRating   18987 non-null  float64
 6   imdbNumVotes        18987 non-null  float64
 7   availableCountries  18987 non-null  object 
dtypes: float64(3), object(5)
memory usage: 1.3+ MB
None
                                   title   type                     genres  \
0                      The Fifth Element  movie  Action, Adventure, Sci-Fi   
1                      Kill Bill: Vol. 1  movie    Action, Crime, Thriller   
2                                Jarhead  movie      Biography, Drama, War   
3                             Unforgiv

In [15]:
# Re-count the missing values in each column of the current DataFrame
remaining_missing = df.isna().sum()
print(remaining_missing)

title                 0
type                  0
genres                0
releaseYear           0
imdbId                0
imdbAverageRating     0
imdbNumVotes          0
availableCountries    0
dtype: int64


In [17]:
# Report how many rows are exact duplicates of an earlier row
duplicate_report = f"Number of duplicate rows: {df.duplicated().sum()}"
print(duplicate_report)

Number of duplicate rows: 0


In [19]:
# Display the current DataFrame as the cell output
df

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,519811.0,"AT, CH, DE"
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,tt0266697,8.2,1230147.0,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CI, CM, CZ..."
2,Jarhead,movie,"Biography, Drama, War",2005.0,tt0418763,7.0,212975.0,"AD, AE, AG, AO, BH, BM, BR, BS, BZ, CI, CM, CO..."
3,Unforgiven,movie,"Drama, Western",1992.0,tt0105695,8.2,447093.0,"AU, BA, BG, CZ, HR, HU, MD, ME, MK, NZ, PL, RO..."
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004.0,tt0338013,8.3,1115716.0,"AD, AE, AG, AL, AO, AR, AU, AZ, BA, BB, BE, BG..."
...,...,...,...,...,...,...,...,...
20570,S.E.R.E.G.,tv,Action,2024.0,tt31242061,4.7,5525.0,HU
20573,The Later Daters,tv,"Reality-TV, Romance",2024.0,tt34599590,6.9,320.0,"AD, AE, AG, AL, AO, AR, AT, AU, AZ, BA, BB, BE..."
20577,The Kings of Tupelo: A Southern Crime Saga,tv,"Crime, Documentary",2024.0,tt34682275,6.6,1279.0,"AD, AE, AG, AL, AO, AR, AT, AU, AZ, BA, BB, BE..."
20582,UniverXO Dabiz,tv,Documentary,2024.0,tt34682889,5.8,80.0,"AD, AE, AG, AL, AO, AR, AT, AU, AZ, BA, BB, BE..."


In [21]:
# Remove the country-availability and IMDb-id columns
columns_to_remove = ['availableCountries', 'imdbId']
df = df.drop(columns=columns_to_remove)

# Preview the first rows to confirm the columns are gone
df.head()

Unnamed: 0,title,type,genres,releaseYear,imdbAverageRating,imdbNumVotes
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,7.6,519811.0
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,8.2,1230147.0
2,Jarhead,movie,"Biography, Drama, War",2005.0,7.0,212975.0
3,Unforgiven,movie,"Drama, Western",1992.0,8.2,447093.0
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004.0,8.3,1115716.0


In [23]:
# Rename the columns for better readability.
# An explicit old-name -> new-name mapping is used instead of assigning a
# positional list to df.columns: the mapping cannot mislabel data if the
# column order differs, and it does not raise a length-mismatch error when
# the frame holds a different number of columns (e.g. when this cell is
# re-run later). Names that are not present are simply left untouched.
df = df.rename(columns={
    'title': 'Title',
    'type': 'Type',
    'genres': 'Genres',
    'releaseYear': 'Year',
    'imdbAverageRating': 'Rating',
    'imdbNumVotes': 'Votes',
})

# Display the updated DataFrame
df


Unnamed: 0,Title,Type,Genres,Year,Rating,Votes
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,7.6,519811.0
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,8.2,1230147.0
2,Jarhead,movie,"Biography, Drama, War",2005.0,7.0,212975.0
3,Unforgiven,movie,"Drama, Western",1992.0,8.2,447093.0
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004.0,8.3,1115716.0
...,...,...,...,...,...,...
20570,S.E.R.E.G.,tv,Action,2024.0,4.7,5525.0
20573,The Later Daters,tv,"Reality-TV, Romance",2024.0,6.9,320.0
20577,The Kings of Tupelo: A Southern Crime Saga,tv,"Crime, Documentary",2024.0,6.6,1279.0
20582,UniverXO Dabiz,tv,Documentary,2024.0,5.8,80.0


In [25]:
# Remove any Genre_N columns left over from an earlier run of this cell, so
# that re-executing it does not append a second set of genre columns with
# duplicate labels. On a first run the list is empty and nothing is dropped.
stale_genre_cols = [col for col in df.columns if str(col).startswith('Genre_')]
df = df.drop(columns=stale_genre_cols)

# Split the comma-separated 'Genres' string into one column per listed genre;
# rows with fewer genres than the widest row get missing values in the
# trailing columns
genres_split = df['Genres'].str.split(', ', expand=True)

# Rename the new columns for clarity: Genre_1, Genre_2, ...
genres_split.columns = [f'Genre_{i+1}' for i in range(genres_split.shape[1])]

# Combine the original DataFrame with the new genre columns (aligned on the
# shared index); the original 'Genres' column is kept alongside them
df = pd.concat([df, genres_split], axis=1)

# Display the updated DataFrame
df.head()


Unnamed: 0,Title,Type,Genres,Year,Rating,Votes,Genre_1,Genre_2,Genre_3,Genre_4
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,7.6,519811.0,Action,Adventure,Sci-Fi,
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,8.2,1230147.0,Action,Crime,Thriller,
2,Jarhead,movie,"Biography, Drama, War",2005.0,7.0,212975.0,Biography,Drama,War,
3,Unforgiven,movie,"Drama, Western",1992.0,8.2,447093.0,Drama,Western,,
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004.0,8.3,1115716.0,Drama,Romance,Sci-Fi,


In [27]:
# Cast 'Year' to pandas' nullable integer dtype ('Int64')
df = df.astype({'Year': 'Int64'})

# Preview the first rows to check the conversion
df.head()


Unnamed: 0,Title,Type,Genres,Year,Rating,Votes,Genre_1,Genre_2,Genre_3,Genre_4
0,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997,7.6,519811.0,Action,Adventure,Sci-Fi,
1,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003,8.2,1230147.0,Action,Crime,Thriller,
2,Jarhead,movie,"Biography, Drama, War",2005,7.0,212975.0,Biography,Drama,War,
3,Unforgiven,movie,"Drama, Western",1992,8.2,447093.0,Drama,Western,,
4,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004,8.3,1115716.0,Drama,Romance,Sci-Fi,


In [29]:
# Destination file for the transformed dataset
save_path = r"C:\Users\Shashank Reddy\Desktop\Project\Data\transformed_dataset.csv"

# Write the DataFrame as CSV, leaving the row index out of the file.
# (An Excel copy could be written the same way with df.to_excel if needed.)
df.to_csv(path_or_buf=save_path, index=False)

print(f"Transformed dataset saved successfully at {save_path}")



Transformed dataset saved successfully at C:\Users\Shashank Reddy\Desktop\Project\Data\transformed_dataset.csv
