# Netflix Userbase Data Exploration

### This Script Contains the Following Points:
#### 1. Importing Libraries & Data
#### 2. Data Exploration
#### 3. Data Wrangling

## 1. Importing Libraries & Data

In [5]:
#Import libraries
import pandas as pd
import numpy as np
import os

#Create Folder path to data
path = r'/Users/C SaiVishwanath/Desktop/Pet Project/Netflix'

#Import Airbnb data as 'df'
df=pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'Netflix Userbase.csv'))

In [7]:
# Check output
df.head(5)

Unnamed: 0,User ID,Subscription Type,Monthly Revenue,Join Date,Last Payment Date,Country,Age,Gender,Device,Plan Duration
0,1,Basic,10,15-01-22,10-06-23,United States,28,Male,Smartphone,1 Month
1,2,Premium,15,05-09-21,22-06-23,Canada,35,Female,Tablet,1 Month
2,3,Standard,12,28-02-23,27-06-23,United Kingdom,42,Male,Smart TV,1 Month
3,4,Standard,12,10-07-22,26-06-23,Australia,51,Female,Laptop,1 Month
4,5,Basic,10,01-05-23,28-06-23,Germany,33,Male,Smartphone,1 Month


## 2. Data Exploration

In [10]:
# Print dimensions
df.shape

(2500, 10)

In [12]:
# Print basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   User ID            2500 non-null   int64 
 1   Subscription Type  2500 non-null   object
 2   Monthly Revenue    2500 non-null   int64 
 3   Join Date          2500 non-null   object
 4   Last Payment Date  2500 non-null   object
 5   Country            2500 non-null   object
 6   Age                2500 non-null   int64 
 7   Gender             2500 non-null   object
 8   Device             2500 non-null   object
 9   Plan Duration      2500 non-null   object
dtypes: int64(3), object(7)
memory usage: 195.4+ KB


In [14]:
# All columns have 2500 entries that are non-null, meaning they have a valid value and are not missing

In [18]:
# Print descriptive statistics for numeric columns
df.describe()

Unnamed: 0,User ID,Monthly Revenue,Age
count,2500.0,2500.0,2500.0
mean,1250.5,12.5084,38.7956
std,721.83216,1.686851,7.171778
min,1.0,10.0,26.0
25%,625.75,11.0,32.0
50%,1250.5,12.0,39.0
75%,1875.25,14.0,45.0
max,2500.0,15.0,51.0


In [20]:
# User ID column does not need to be included as numeric

## 3. Data Wrangling

In [28]:
# df
df['Plan Duration'].value_counts()

Plan Duration
1 Month    2500
Name: count, dtype: int64

In [34]:
# All entries have the value '1 Month' in the 'Plan Duration' column, so this can be dropped
# Drop 'Plan Duration' column
df=df.drop('Plan Duration', axis=1)

In [40]:
# Check output
df.head(5)

Unnamed: 0,User ID,Subscription Type,Monthly Revenue,Join Date,Last Payment Date,Country,Age,Gender,Device
0,1,Basic,10,15-01-22,10-06-23,United States,28,Male,Smartphone
1,2,Premium,15,05-09-21,22-06-23,Canada,35,Female,Tablet
2,3,Standard,12,28-02-23,27-06-23,United Kingdom,42,Male,Smart TV
3,4,Standard,12,10-07-22,26-06-23,Australia,51,Female,Laptop
4,5,Basic,10,01-05-23,28-06-23,Germany,33,Male,Smartphone


In [50]:
# Check unique values for User IDs
df['User ID'].value_counts()

User ID
1       1
1671    1
1664    1
1665    1
1666    1
       ..
834     1
835     1
836     1
837     1
2500    1
Name: count, Length: 2500, dtype: int64

In [60]:
# Change data type for User ID column
df['User ID'] = df['User ID'].astype(str)

In [62]:
df.dtypes

User ID              object
Subscription Type    object
Monthly Revenue       int64
Join Date            object
Last Payment Date    object
Country              object
Age                   int64
Gender               object
Device               object
dtype: object

In [64]:
from datetime import datetime, date

In [68]:
# Convert 'Date' column to datetime
df['Join Date'] = pd.to_datetime(df['Join Date'])

  df['Join Date'] = pd.to_datetime(df['Join Date'])


In [70]:
df.head(5)

Unnamed: 0,User ID,Subscription Type,Monthly Revenue,Join Date,Last Payment Date,Country,Age,Gender,Device
0,1,Basic,10,2022-01-15,10-06-23,United States,28,Male,Smartphone
1,2,Premium,15,2021-05-09,22-06-23,Canada,35,Female,Tablet
2,3,Standard,12,2023-02-28,27-06-23,United Kingdom,42,Male,Smart TV
3,4,Standard,12,2022-10-07,26-06-23,Australia,51,Female,Laptop
4,5,Basic,10,2023-01-05,28-06-23,Germany,33,Male,Smartphone


In [72]:
df.dtypes

User ID                      object
Subscription Type            object
Monthly Revenue               int64
Join Date            datetime64[ns]
Last Payment Date            object
Country                      object
Age                           int64
Gender                       object
Device                       object
dtype: object

In [74]:
# Convert 'Last Payment Date' column to datetime
df['Last Payment Date'] = pd.to_datetime(df['Last Payment Date'])

  df['Last Payment Date'] = pd.to_datetime(df['Last Payment Date'])


In [76]:
df.head(5)

Unnamed: 0,User ID,Subscription Type,Monthly Revenue,Join Date,Last Payment Date,Country,Age,Gender,Device
0,1,Basic,10,2022-01-15,2023-10-06,United States,28,Male,Smartphone
1,2,Premium,15,2021-05-09,2023-06-22,Canada,35,Female,Tablet
2,3,Standard,12,2023-02-28,2023-06-27,United Kingdom,42,Male,Smart TV
3,4,Standard,12,2022-10-07,2023-06-26,Australia,51,Female,Laptop
4,5,Basic,10,2023-01-05,2023-06-28,Germany,33,Male,Smartphone


In [78]:
df.dtypes

User ID                      object
Subscription Type            object
Monthly Revenue               int64
Join Date            datetime64[ns]
Last Payment Date    datetime64[ns]
Country                      object
Age                           int64
Gender                       object
Device                       object
dtype: object

In [80]:
# Export cleaned dataframe to Prepared Data folder 
df.to_csv(os.path.join(path, '02 Data','Prepared Data', 'neftlix_userbase_clean.csv'))