In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
# Load netflix.csv (path is relative to the working directory) into a DataFrame.
netflix_df = pd.read_csv(filepath_or_buffer="netflix.csv")

In [36]:
# Preview the first five rows (head() defaults to n=5).
netflix_df.head()

Unnamed: 0,title,genre,language,imdb_score,premiere,runtime,year
0,Notes for My Son,Drama,Spanish,6.3,11/24/2020,83,2020
1,"To Each, Her Own",Romantic comedy,French,5.3,6/24/2018,95,2018
2,The Lovebirds,Romantic comedy,English,6.1,5/22/2020,87,2020
3,The Perfection,Horror-thriller,English,6.1,5/24/2019,90,2019
4,Happy Anniversary,Romantic comedy,English,5.8,3/30/2018,78,2018


In [None]:
# 1. Indexing with loc (Access by Labels)

In [None]:
# Select the row where the title is "Notes for My Son"

In [11]:
# Build the boolean mask first, then hand it to .loc to keep the matching row(s).
title_matches = netflix_df['title'] == "Notes for My Son"
row = netflix_df.loc[title_matches]
row

Unnamed: 0,title,genre,language,imdb_score,premiere,runtime,year
0,Notes for My Son,Drama,Spanish,6.3,11/24/2020,83,2020


In [None]:
# Select the first 3 rows and columns 'title' and 'genre'

In [17]:
# .loc slices by label and includes the end label, so :2 yields labels 0, 1 and 2.
wanted_columns = ['title', 'genre']
first_three = netflix_df.loc[:2, wanted_columns]
first_three

Unnamed: 0,title,genre
0,Notes for My Son,Drama
1,"To Each, Her Own",Romantic comedy
2,The Lovebirds,Romantic comedy


In [32]:
# Keep only the rows whose genre is exactly "Drama" and whose IMDb score exceeds 7

# Each condition gets its own mask; & combines them element-wise.
is_drama = netflix_df['genre'] == 'Drama'
score_above_seven = netflix_df['imdb_score'] > 7
drama_high = netflix_df.loc[is_drama & score_above_seven]
drama_high.head(2)

Unnamed: 0,title,genre,language,imdb_score,premiere,runtime,year
18,Pieces of a Woman,Drama,English,7.1,1/7/2021,126,2021
52,The Trial of the Chicago 7,Drama,English,7.8,10/16/2020,130,2020


In [None]:
# 2. Indexing with iloc (Access by Integer Position)
# Task 1: Select a specific row by position.


In [35]:
# Select the 5th row (iloc positions are zero-based, so that is position 4)

fifth_row_position = 4
netflix_df.iloc[fifth_row_position]

title         Happy Anniversary
genre           Romantic comedy
language                English
imdb_score                  5.8
premiere              3/30/2018
runtime                      78
year                       2018
Name: 4, dtype: object

In [39]:
# Select the first 5 rows and first 3 columns
# (iloc slices exclude the stop position, unlike .loc label slices)

netflix_df.iloc[:5, 0:3]

Unnamed: 0,title,genre,language
0,Notes for My Son,Drama,Spanish
1,"To Each, Her Own",Romantic comedy,French
2,The Lovebirds,Romantic comedy,English
3,The Perfection,Horror-thriller,English
4,Happy Anniversary,Romantic comedy,English


In [40]:
# Pick the rows at positions 1, 3 and 5, restricted to columns 0 (title) and 2 (language)

row_positions = [1, 3, 5]
column_positions = [0, 2]
selected_rows = netflix_df.iloc[row_positions, column_positions]
selected_rows

Unnamed: 0,title,language
1,"To Each, Her Own",French
3,The Perfection,English
5,Why Did You Kill Me?,English


In [47]:
# 3. Counting with count()

# Task 1: Count non-null entries in each column

# axis=0 (the default) tallies down the rows, giving one figure per column.
nonnull_columns = netflix_df.count(axis=0)
nonnull_columns

title         583
genre         583
language      583
imdb_score    583
premiere      583
runtime       583
year          583
dtype: int64

In [49]:
# Count non-null entries in the 'imdb_score' column

# Series.count() skips NaN, so this is the number of rows that have a score.
# The column counted must be 'imdb_score' itself: counting another column
# would report that column's missing values instead.
imdb_non_null = netflix_df['imdb_score'].count()
imdb_non_null

583

In [52]:
# Count rows where runtime > 100 minutes

# Counting one column of the filtered rows yields a single number rather than
# a per-column Series. A NaN runtime fails the comparison, so every selected
# 'runtime' value is non-null and count() equals the number of matching rows.
run_time = netflix_df.loc[netflix_df['runtime'] > 100, 'runtime'].count()
run_time

232

In [54]:
# 4. Counting with size
# Task 1: Count total entries in the DataFrame

# DataFrame.size is rows x columns, i.e. every cell, missing or not.
total_entries = netflix_df.size
total_entries

4081

In [55]:
# Number of entries in the 'genre' column (Series.size counts every element)

genre_column = netflix_df['genre']
genre_total = genre_column.size
genre_total

583

In [56]:
# 5. Counting with value_counts()
# Task 1: Count unique values in a column

# Tally how often each genre label appears (most frequent first)

genre_column = netflix_df['genre']
each_genre = genre_column.value_counts()
each_genre

genre
Documentary                    159
Drama                           77
Comedy                          49
Romantic comedy                 39
Thriller                        33
                              ... 
Political thriller               1
Fantasy                          1
Romantic comedy-drama            1
Animation/Musical/Adventure      1
Supernatural drama               1
Name: count, Length: 114, dtype: int64

In [None]:
# NULL Handling
# Task 1: Detect missing values using isna() or isnull()
# Check for missing values in the entire dataset

In [58]:
# isnull() is an alias of isna(); either gives a same-shaped frame of True/False flags.
missing_values = netflix_df.isna()
missing_values.head(3)

Unnamed: 0,title,genre,language,imdb_score,premiere,runtime,year
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False


In [60]:
# Flag missing values in a single column ('genre')

genre_column = netflix_df['genre']
missing_genre = genre_column.isnull()
missing_genre.head(3)

0    False
1    False
2    False
Name: genre, dtype: bool

In [62]:
# Task 3: Detect existing (non-missing) values using notnull()
# Boolean mask: True for every row whose 'imdb_score' is present

imdb_score_column = netflix_df['imdb_score']
rows_not_missing = imdb_score_column.notnull()
rows_not_missing

0      True
1      True
2      True
3      True
4      True
       ... 
578    True
579    True
580    True
581    True
582    True
Name: imdb_score, Length: 583, dtype: bool

In [65]:
# Unique Values
# Task 1: Find unique values including NULL using unique()

# Distinct genre labels, in order of first appearance

genre_column = netflix_df['genre']
unique_genres = genre_column.unique()
unique_genres

array(['Drama', 'Romantic comedy', 'Horror-thriller', 'Documentary',
       'Comedy', 'Comedy / Musical', 'Animation / Short', 'Action',
       'Crime thriller', 'Horror', 'Romance', 'Thriller', 'Action/Comedy',
       'War', 'Concert Film', 'Variety show', 'Comedy-drama',
       'Musical/Western/Fantasy', 'Heist', 'Western',
       'Action/Science fiction', 'Dark comedy', 'Family film',
       'Action comedy', 'Romantic teen drama', 'One-man show',
       'Animation', 'Drama / Short', 'Action-thriller', 'Romantic drama',
       'Biopic', 'Crime drama', 'Heist film/Thriller', 'Science fiction',
       'Coming-of-age comedy-drama', 'Comedy/Horror',
       'Science fiction/Thriller', 'Psychological thriller drama',
       'Spy thriller', 'Sports film', 'Comedy horror',
       'Romantic comedy/Holiday', 'Romantic teenage drama',
       'Superhero/Action', 'Adventure', 'Science fiction thriller',
       'Musical / Short', 'Science fiction/Drama', 'Family',
       'Aftershow / Interview', '

In [66]:
# Task 2: Find the number of unique values excluding NULL using nunique()
# Number of unique languages

# unique() lists the distinct labels; nunique() answers the task itself: how
# many distinct labels there are, leaving NaN out by default.
unique_languages = netflix_df['language'].unique()
unique_language_count = netflix_df['language'].nunique()
unique_language_count

38

In [97]:
# Duplicate Handling
# Task 1: Check for duplicates using duplicated()
# Count the rows that repeat an earlier row

# duplicated() marks each repeat as True; summing the flags counts them.
duplicate_flags = netflix_df.duplicated()
duplicated_rows = duplicate_flags.sum()
duplicated_rows

0

In [95]:
# Task 2: Remove duplicate rows using drop_duplicates()
# Returns a new frame; netflix_df itself is not modified

# keep='first' (the default) retains the first occurrence of each repeated row.
drop_duplicated = netflix_df.drop_duplicates(keep='first')
drop_duplicated

Unnamed: 0,title,genre,language,imdb_score,premiere,runtime,year
0,Notes for My Son,Drama,Spanish,6.3,11/24/2020,83,2020
1,"To Each, Her Own",Romantic comedy,French,5.3,6/24/2018,95,2018
2,The Lovebirds,Romantic comedy,English,6.1,5/22/2020,87,2020
3,The Perfection,Horror-thriller,English,6.1,5/24/2019,90,2019
4,Happy Anniversary,Romantic comedy,English,5.8,3/30/2018,78,2018
...,...,...,...,...,...,...,...
578,Rolling Thunder Revue: A bob Dylan Story by Ma...,Documentary,English,7.6,6/12/2019,144,2019
579,Freaks: You're One of Us,Supernatural drama,German,5.4,9/2/2020,92,2020
580,Squared Love,Romantic comedy,Polish,5.0,2/11/2021,102,2021
581,My Own Man,Documentary,English,6.4,12/13/2014,81,2014


In [75]:
# Value Checking
# Task 1: Check if values exist in a collection using isin()


# Keep the rows whose 'genre' is exactly "Drama" or exactly "Comedy"

wanted_genres = ['Drama', 'Comedy']
genre_filter = netflix_df.loc[netflix_df['genre'].isin(wanted_genres)]
genre_filter.head(2)

Unnamed: 0,title,genre,language,imdb_score,premiere,runtime,year
0,Notes for My Son,Drama,Spanish,6.3,11/24/2020,83,2020
6,Death to 2020,Comedy,English,6.8,12/27/2020,70,2020


In [79]:
# Task 2: Check if values are within a range using between()
# Keep the rows whose 'imdb_score' lies between 6 and 8 (both ends included by default)

score_in_range = netflix_df['imdb_score'].between(6, 8)
imdb_score_range = netflix_df.loc[score_in_range]
imdb_score_range.head(3)

Unnamed: 0,title,genre,language,imdb_score,premiere,runtime,year
0,Notes for My Son,Drama,Spanish,6.3,11/24/2020,83,2020
2,The Lovebirds,Romantic comedy,English,6.1,5/22/2020,87,2020
3,The Perfection,Horror-thriller,English,6.1,5/24/2019,90,2019


In [84]:
# Type Conversion with astype()
# Task 1: Convert a column to a specific data type
# Convert the 'year' column to string type

netflix_df['year'] = netflix_df['year'].astype(int)
netflix_df.dtypes

title          object
genre          object
language       object
imdb_score    float64
premiere       object
runtime         int64
year            int64
dtype: object

In [None]:
# Convert 'runtime' to float and 'imdb_score' to integer (for demonstration)

# netflix_df = netflix_df.astype({'runtime': 'float', 'imdb_score': 'int'})

In [None]:
# Value Replacement with replace()
# Task 1: Replace specific values in a column.
# Replace "Drama" with "Dramatic Movie" in the 'genre' column

In [86]:
# A one-entry mapping does the same job as replace('Drama', 'Dramatic Movie').
genre_renamed = netflix_df['genre'].replace({'Drama': 'Dramatic Movie'})
netflix_df['genre'] = genre_renamed
netflix_df['genre']

0          Dramatic Movie
1         Romantic comedy
2         Romantic comedy
3         Horror-thriller
4         Romantic comedy
              ...        
578           Documentary
579    Supernatural drama
580       Romantic comedy
581           Documentary
582           Documentary
Name: genre, Length: 583, dtype: object

In [87]:
# Reverse the earlier replacement, putting the 'Drama' label back in the column.
genre_restored = netflix_df['genre'].replace({'Dramatic Movie': 'Drama'})
netflix_df['genre'] = genre_restored
netflix_df['genre']

0                   Drama
1         Romantic comedy
2         Romantic comedy
3         Horror-thriller
4         Romantic comedy
              ...        
578           Documentary
579    Supernatural drama
580       Romantic comedy
581           Documentary
582           Documentary
Name: genre, Length: 583, dtype: object

In [None]:
# Task 2: Replace multiple values at once.
    # Replace "Drama" with "Dramatic Movie" and "Comedy" with "Funny Movie"

In [91]:
# Replace "Drama" with "Dramatic Movie" and "Comedy" with "Funny Movie"
# netflix_df['genre'] = netflix_df['genre'].replace({'Drama': 'Dramatic Movie', 'Comedy': 'Funny Movie'})

In [93]:
# Replace IMDb scores below 5 with "Low" and above 8 with "High"