# NETFLIX DATAFRAME

1. Data Cleaning and Formatting
2. Data Aggregation and Filtering
3. Data Structuring and Combining Data

## Import library & import csv

In [2]:
# Import all libraries. 

import pandas as pd
import numpy as np
import re
import seaborn as sns 
import matplotlib.pyplot as plt

In [3]:
# Load the raw Netflix titles dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists. Consider a project-relative path.
netflix_csv_path = "/Users/roraimachavez/Downloads/7.IRONHACK/Projects/data-wrangling-project/src/netflix_titles.csv"
netflix = pd.read_csv(netflix_csv_path)

## General information

In [4]:
netflix  # Preview the raw DataFrame (the display shows the first and last rows)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [5]:
# Summarize the raw data: column names, non-null counts and dtypes.
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


`Rows: 8807`

`Columns: 12`

## Data cleaning & formatting

1. Edit column names.
2. Delete columns I won't use.
3. Filter to the last 10 years.
4. Delete duplicate rows.
5. Replace null values.
6. Transform column types if necessary.
7. Add a column for "platform".

`Edit column names & Delete columns I won't use.`

In [6]:
# Tidy the schema in a single chain:
#   1. replace underscores in the column names with spaces,
#   2. call the 'listed in' column 'genres',
#   3. discard the columns this analysis does not use.
netflix = (
    netflix
    .rename(columns=lambda name: name.replace('_', ' '))
    .rename(columns={'listed in': 'genres'})
    .drop(columns=["show id", "date added", "rating", "duration", "description"])
)

In [7]:
# Confirm the schema after renaming and dropping columns.
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8807 non-null   object
 1   title         8807 non-null   object
 2   director      6173 non-null   object
 3   cast          7982 non-null   object
 4   country       7976 non-null   object
 5   release year  8807 non-null   int64 
 6   genres        8807 non-null   object
dtypes: int64(1), object(6)
memory usage: 481.8+ KB


`Delete rows & filter data to release years 2014 through 2024 (inclusive).`

In [8]:
# Keep only titles released after 2013 and no later than 2024,
# i.e. release years 2014 through 2024 inclusive.
release_year = netflix['release year']
in_window = release_year.gt(2013) & release_year.le(2024)
netflix_filter = netflix.loc[in_window]
netflix = netflix_filter.copy()  # rebind the working name to an independent copy

In [9]:
# (rows, columns) remaining after the release-year filter.
netflix.shape

(6568, 7)

`Delete duplicate data.`

In [10]:
# Remove rows that exactly repeat an earlier row; the first occurrence is kept.
is_repeat = netflix.duplicated()
netflix_drop_duplicates = netflix.loc[~is_repeat]
netflix = netflix_drop_duplicates.copy()

In [11]:
netflix.shape  # Shape is unchanged, so there were no duplicate rows

(6568, 7)

`Check null values.`

In [12]:
# Count the null values in each column.
num_nans = netflix.isna().sum() 
num_nans

type               0
title              0
director        2241
cast             746
country          729
release year       0
genres             0
dtype: int64

In [13]:
# Replace every null value with the placeholder "not found".
#
# A single DataFrame-level fillna is used instead of calling
# `netflix[col].fillna(..., inplace=True)` column by column. That form is an
# inplace operation on a chained selection: pandas 2.x emits a FutureWarning
# for it, and with Copy-on-Write enabled it no longer updates the parent
# DataFrame, so the nulls would silently remain.
netflix = netflix.fillna("not found")

In [14]:
# Re-count the nulls to verify that none remain after the fill.
num_nans = netflix.isna().sum() 
num_nans

type            0
title           0
director        0
cast            0
country         0
release year    0
genres          0
dtype: int64

`Add a column naming the platform.`

In [15]:
# Tag every row with its source platform.
netflix = netflix.assign(platform='Netflix')

In [16]:
# Inspect the schema after filtering, null replacement and adding 'platform'.
netflix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6568 entries, 0 to 8806
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          6568 non-null   object
 1   title         6568 non-null   object
 2   director      6568 non-null   object
 3   cast          6568 non-null   object
 4   country       6568 non-null   object
 5   release year  6568 non-null   int64 
 6   genres        6568 non-null   object
 7   platform      6568 non-null   object
dtypes: int64(1), object(7)
memory usage: 461.8+ KB


In [17]:
# Preview the first rows of the cleaned data.
netflix.head()

Unnamed: 0,type,title,director,cast,country,release year,genres,platform
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,not found,United States,2020,Documentaries,Netflix
1,TV Show,Blood & Water,not found,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021,"International TV Shows, TV Dramas, TV Mysteries",Netflix
2,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",not found,2021,"Crime TV Shows, International TV Shows, TV Act...",Netflix
3,TV Show,Jailbirds New Orleans,not found,not found,not found,2021,"Docuseries, Reality TV",Netflix
4,TV Show,Kota Factory,not found,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021,"International TV Shows, Romantic TV Shows, TV ...",Netflix


## Genre filter

In [18]:
# Build one 0/1 indicator column per genre of interest and attach them to the data.
#
# The 'genres' strings separate entries with ", " (comma + space). Splitting on a
# bare "," leaves a leading space on every genre after the first (" TV Dramas"),
# so those entries end up in differently named columns and are lost by the name
# selection below. The separator is normalised first so that a genre is flagged
# regardless of its position in the list.
genre_columns = [
    'Romantic TV Shows', 'TV Comedies', 'TV Dramas', 'TV Horror', 'TV Sci-Fi & Fantasy',
    'TV Action & Adventure', 'Anime Features', 'Children & Family Movies', 'Reality TV',
    'Comedies', 'Documentaries', 'Dramas', 'Romantic Movies', 'Thrillers',
    'Horror Movies', 'Action & Adventure', 'Music & Musicals', 'Sci-Fi & Fantasy',
]

genre_flags = (
    netflix['genres']
    .str.strip()
    .str.replace(r'\s*,\s*', ',', regex=True)  # ", " -> "," so tokens carry no padding
    .str.get_dummies(sep=',')
)

# Selecting by name raises KeyError if one of the expected genres never occurs.
dfnet = pd.concat([netflix, genre_flags[genre_columns]], axis=1).drop(columns=['genres'])

In [19]:
# Lower-case every column label.
dfnet.columns = dfnet.columns.map(str.lower)

In [20]:
# Inspect the schema after adding the genre indicator columns.
dfnet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6568 entries, 0 to 8806
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   type                      6568 non-null   object
 1   title                     6568 non-null   object
 2   director                  6568 non-null   object
 3   cast                      6568 non-null   object
 4   country                   6568 non-null   object
 5   release year              6568 non-null   int64 
 6   platform                  6568 non-null   object
 7   romantic tv shows         6568 non-null   int64 
 8   tv comedies               6568 non-null   int64 
 9   tv dramas                 6568 non-null   int64 
 10  tv horror                 6568 non-null   int64 
 11  tv sci-fi & fantasy       6568 non-null   int64 
 12  tv action & adventure     6568 non-null   int64 
 13  anime features            6568 non-null   int64 
 14  children & family movies  656

In [21]:
# Collapse each TV/movie genre pair into a single combined flag.
#
# Each entry maps the new column to the two source columns it replaces. The
# names ending in "2" are temporary: the plain name is still occupied by one of
# the source columns at the moment the combined column is created.
merged_genres = {
    'romance': ('romantic tv shows', 'romantic movies'),
    'comedy': ('tv comedies', 'comedies'),
    'drama': ('tv dramas', 'dramas'),
    'horror': ('tv horror', 'horror movies'),
    'sci-fi & fantasy2': ('tv sci-fi & fantasy', 'sci-fi & fantasy'),
    'action & adventure2': ('tv action & adventure', 'action & adventure'),
}

for combined, (tv_flag, movie_flag) in merged_genres.items():
    # Bitwise OR of the two indicator columns: set when either source flag is set.
    dfnet[combined] = dfnet[tv_flag] | dfnet[movie_flag]
    dfnet.drop(columns=[tv_flag, movie_flag], inplace=True)

In [22]:
# Give the genre flags their final names in a single pass.
final_genre_names = {
    'sci-fi & fantasy2': 'sci-fi & fantasy',
    'action & adventure2': 'action & adventure',
    'anime features': 'animation',
    'documentaries': 'documentary',
    'children & family movies': 'children & family',
    'reality tv': 'entertainment',
}
dfnet = dfnet.rename(columns=final_genre_names)

In [23]:
# Inspect the schema after merging and renaming the genre columns.
dfnet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6568 entries, 0 to 8806
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   type                6568 non-null   object
 1   title               6568 non-null   object
 2   director            6568 non-null   object
 3   cast                6568 non-null   object
 4   country             6568 non-null   object
 5   release year        6568 non-null   int64 
 6   platform            6568 non-null   object
 7   animation           6568 non-null   int64 
 8   children & family   6568 non-null   int64 
 9   entertainment       6568 non-null   int64 
 10  documentary         6568 non-null   int64 
 11  thrillers           6568 non-null   int64 
 12  music & musicals    6568 non-null   int64 
 13  romance             6568 non-null   int64 
 14  comedy              6568 non-null   int64 
 15  drama               6568 non-null   int64 
 16  horror              6568 non-

In [24]:
# Final schema check for the genre flags.
#
# The rename that used to be here was a no-op: every label it targeted had
# already been dropped or renamed by the preceding cells, and DataFrame.rename
# silently ignores labels that are not present (it also listed
# 'anime features' twice). It is replaced by an explicit check so that a schema
# regression fails loudly instead of passing unnoticed.
expected_genre_columns = [
    'animation', 'children & family', 'entertainment', 'documentary',
    'thrillers', 'music & musicals', 'romance', 'comedy', 'drama', 'horror',
    'sci-fi & fantasy', 'action & adventure',
]
missing_genre_columns = [
    label for label in expected_genre_columns if label not in dfnet.columns
]
assert not missing_genre_columns, f"missing genre columns: {missing_genre_columns}"

In [25]:
# Count the null values per column of the final DataFrame.
num_nans = dfnet.isna().sum() 
num_nans

type                  0
title                 0
director              0
cast                  0
country               0
release year          0
platform              0
animation             0
children & family     0
entertainment         0
documentary           0
thrillers             0
music & musicals      0
romance               0
comedy                0
drama                 0
horror                0
sci-fi & fantasy      0
action & adventure    0
dtype: int64

In [26]:
# Save the final DataFrame as 'netflix.csv' (relative path, so it is written to
# the current working directory).
# index=True also writes the row index as the first, unnamed column.
dfnet.to_csv('netflix.csv', index=True)