In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Netflix titles dataset from the read-only Kaggle input directory.
data=pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

In [None]:
data.head() #show the first 5 rows of data

In [None]:
data.columns #show the column names

In [None]:
data.shape #(row count, column count)

In [None]:
data.info() #per-column summary: column names, data types, non-null value counts...

In [None]:
#frequency of each value in the country column
print(data["country"].value_counts(dropna=False))
#dropna=False also counts missing (NaN) values as their own entry

In [None]:
data.describe() #summary statistics of the numeric columns; missing values are ignored

In [None]:
# Box plot of release_year, drawn as one box per value of the 'type' column.
data.boxplot(column='release_year',by='type')
plt.show()  # must be called; a bare `plt.show` only names the function and does nothing
# How to read each box:
# - line inside the box: median (50%)
# - upper / lower edge of the box: 75% / 25% quartiles
# - ends of the whiskers: largest / smallest values that are not outliers
# - circles: outliers

**TIDY DATA**

In [None]:
new_data=data.head() # work on the first 5 rows so the reshaped output stays small

In [None]:
new_data

In [None]:
# Melt to long format: 'title' identifies each row, and the 'type' and
# 'duration' columns are stacked into (variable, value) pairs.
melted=pd.melt(frame=new_data,id_vars='title',value_vars=['type','duration'])

In [None]:
melted

**PIVOTING DATA** 

reverse of melting

In [None]:
# Back to wide form: one row per title, one column per name found in
# 'variable', with the cells taken from 'value'.
melted.pivot(index='title',columns='variable',values='value')

**Concatenating Data**

we can concatenate two different dataframes vertically or horizontally.

In [None]:
data1=data.head()
data2=data.tail() # first and last 5 rows; these two frames are concatenated below

In [None]:
# axis=0 stacks the frames vertically (data2 rows after data1 rows);
# ignore_index=True renumbers the combined rows starting from 0.
conc_data=pd.concat([data1,data2],axis=0,ignore_index=True)

In [None]:
conc_data

In [None]:
# two single columns (Series), each limited to the first 5 rows
d1=data['title'].head()
d2=data['release_year'].head()

In [None]:
# axis=1 places d1 and d2 side by side as columns (horizontal concatenation)
c_data=pd.concat([d1,d2],axis=1)

In [None]:
c_data

**DATA TYPES**

* Object(string)
* Integer
* Float
* Boolean
* Categorical

We can convert these data types to each other under suitable conditions.

In [None]:
data.dtypes # dtype of every column before the conversion

In [None]:
data['type'].unique() # distinct values stored in the 'type' column

In [None]:
# convert the 'type' column to the categorical dtype
data['type']=data['type'].astype('category')

In [None]:
data.dtypes # 'type' is now reported as category

**MISSING DATA AND TESTING WITH ASSERT**

Some records may have features for which no value was recorded. Such an entry is called a missing value.

There are two important questions here.

1) Does this feature genuinely not exist for the record?

2) Or does it exist, but the data is missing?

In this case, we have several options:

* To continue with same data
* Drop them (The dropna() function is used.)
* Fill them with a replacement value (The fillna() function is used.)
* Fill them with a summary statistic such as the mean.

In [None]:
data.info() # the non-null counts show which columns contain missing values

In [None]:
data.isnull().sum() # number of missing values in each column

In [None]:
data["director"].value_counts(dropna=False)
# dropna=False lists NaN as its own entry; the author's run showed 1969 missing directors

In [None]:
# Remove every row whose 'director' value is missing (axis=0 drops rows).
data.dropna(subset=['director'],axis=0,inplace=True)
#inplace=True modifies `data` itself, so the result does not have to be assigned to a variable

In [None]:
#check with an assert statement
assert data["director"].notnull().all() # passes silently when the condition is True
#raises AssertionError when the condition is False

# assert 1==1 passes silently
# assert 1==2 raises AssertionError

In [None]:
data

In [None]:
# Replace missing cast entries with the placeholder string 'empty'.
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column acts on a temporary object and is not guaranteed to update `data`.
data["cast"]=data["cast"].fillna('empty')

In [None]:
#check with an assert statement
assert data["cast"].notnull().all() # passes silently when the condition is True
#raises AssertionError when the condition is False

# assert 1==1 passes silently
# assert 1==2 raises AssertionError

In [None]:
data["cast"].value_counts(dropna=False)
#dropna=False would list NaN as its own entry; none is expected after the fill

* Another way to create a data frame:


In [None]:
# Build a small DataFrame from parallel lists: column labels and column values.
Movie=["Automata","Good People"]
Date=[1995,2015]
list_label=["Movie","Date"]
list_col=[Movie,Date]
# Pair every label with its list of values ...
zipped=[(label,values) for label,values in zip(list_label,list_col)]
# ... and turn the pairs into a {label: values} mapping for the constructor.
data_dict={label:values for label,values in zipped}
df=pd.DataFrame(data=data_dict)
df

In [None]:
df["director"]=["henry","chriss"] # new column from a list: one value per row
df["duration"]=0 #broadcasting: the scalar 0 is repeated for every row
df

**Visual Exploratory Data Analysis**

In [None]:
data.head()

In [None]:
# NOTE(review): at this point data1 is still the 5-row data.head() taken in the
# concatenation section (before any rows were dropped); it is only rebuilt
# further down - confirm this summary is the intended one.
data1.describe()

In [None]:
# Keep the text before the first whitespace of every duration string.
# presumably the values look like "90 min" / "1 Season" - TODO confirm
data["int_duration"]=data["duration"].str.split(n=1,expand=True)[0]


In [None]:
data["int_duration"]=data["int_duration"].astype(int) # digit strings -> integers

In [None]:
# Numeric columns used by the plots below (this rebinds data1).
data1=data.loc[:,["release_year","int_duration"]]
data1.plot() # line plot of both columns against the row index

In [None]:
data1.plot(subplots=True) # one separate axes per column

In [None]:
data1.plot(kind="scatter",x="release_year",y="int_duration")
plt.show()

In [None]:
# histogram of release_year: 20 bins over 1950-2040; density=True normalizes the bar heights
data1.plot(kind = "hist",y = "release_year",bins = 20,range= (1950,2040),density = True)

In [None]:
data1.plot(kind = "hist",y = "release_year",bins = 20,range= (1950,2040),density = True,cumulative=True)
#density normalizes the histogram, cumulative adds up the preceding bins
plt.savefig('graph.png') # write the current figure to graph.png in the working directory

In [None]:
data1.describe()

**Indexing Pandas Time Series**

In [None]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as a column.
data=data.reset_index(drop=True)
data.head()

**Datetime / Time Series**

In [None]:
data["date_added"]

In [None]:
data["date_added"].value_counts(dropna=True) # frequency of each date string, NaN excluded

In [None]:
# we'll convert the date_added column to yyyy-mm-dd format
# the first whitespace-separated token of date_added is taken as the month name
data["Month"]=data["date_added"].str.split(n=1,expand=True)[0]

In [None]:
# Translate month names into two-digit month numbers. Values that are not a
# month name (including missing ones) become NaN - a real missing value that
# fillna() can replace - instead of the literal string "NaN".
month_numbers={
    "January":"01","February":"02","March":"03","April":"04",
    "May":"05","June":"06","July":"07","August":"08",
    "September":"09","October":"10","November":"11","December":"12",
}
data['Month']=data['Month'].map(month_numbers)

In [None]:
# Fall back to month "1" where the month is missing (NaN). The result is
# assigned back to the column: fillna(inplace=True) on a selected column acts on
# a temporary object and is not guaranteed to update `data`.
data['Month']=data['Month'].fillna("1")


In [None]:
# Take what follows the first whitespace-separated token of date_added, then
# keep the part before the first comma.
# presumably this is the day of the month - TODO confirm against the raw strings
data["days"]=data["date_added"].str.split(n=1,expand=True)[1].str.split(',',n=1,expand=True)[0]

In [None]:
# Fall back to day "1" where no day could be extracted. The result is assigned
# back to the column: fillna(inplace=True) on a selected column acts on a
# temporary object and is not guaranteed to update `data`.
data["days"]=data["days"].fillna("1")

In [None]:
# Drop the first whitespace-separated token of date_added, then keep everything
# after the next whitespace; that remainder is used as the year.
data["year"]=data["date_added"].str.split(n=1,expand=True)[1].str.split(n=1,expand=True)[1]
# Fall back to "2000" where no year could be extracted. Assigned back to the
# column because fillna(inplace=True) on a selected column acts on a temporary
# object and is not guaranteed to update `data`.
data["year"]=data["year"].fillna("2000")

In [None]:
# Join the three pieces into a "year-month-day" string.
data["date"]=data["year"]+"-"+data['Month']+"-"+data["days"]

In [None]:
data["date"]

In [None]:
# the helper columns are no longer needed once "date" exists
del data["year"]
del data["Month"]
del data["days"]

In [None]:
#convert the date strings to pandas datetime values
datetime_object=pd.to_datetime(data["date"])
datetime_object

In [None]:
dfm=data.copy() # independent copy; `data` itself keeps its current index

In [None]:
dfm["date"]=datetime_object # replace the string dates with the parsed datetimes

In [None]:
dfm=dfm.set_index("date")  #time series: the frame is now indexed by date


In [None]:
print(dfm.loc["2017-04-15"]) # every row whose index date matches this day

In [None]:
#if the index was unique:
#dfm.loc["2019-08-30":"2017-04-15"]
#but this index is non-unique
# NOTE(review): slicing a datetime index by date strings generally also needs
# the index to be sorted - confirm whether uniqueness is really the obstacle here

**resample**

* Resampling: statistical method over different time intervals
* Downsampling: reduce date time rows to slower frequency like from daily to weekly
* Upsampling: increase date time rows to faster frequency like from daily to hourly
* Interpolate: fill in values according to a chosen method such as ‘linear’, ‘time’ or ‘index’

In [None]:
#A = yearly frequency, M = monthly frequency
# Yearly average. Only the numeric columns are averaged: a mean is undefined for
# the text columns, and recent pandas versions raise an error instead of
# silently skipping them.
dfm.select_dtypes(include="number").resample("A").mean()

In [None]:
# Monthly average. Only the numeric columns are averaged: a mean is undefined
# for the text columns, and recent pandas versions raise an error instead of
# silently skipping them.
dfm.select_dtypes(include="number").resample("M").mean()

In [None]:
# Monthly average of the numeric columns (text columns cannot be averaged and
# make recent pandas versions raise), followed by linear interpolation, which
# fills empty months on a straight line between the neighbouring known values.
dfm.select_dtypes(include="number").resample("M").mean().interpolate("linear")

**Indexing, Slicing, Filtering and Transforming Data Frames**

In [None]:
data # display the frame as it stands before a new index column is added

In [None]:
# Add a 1-based running number for every row. The upper bound is derived from
# the frame's own length instead of a hard-coded row count, so the assignment
# keeps working when the number of rows changes.
data["ind"]=range(1,len(data)+1)

In [None]:
data.head()

In [None]:
data=data.set_index('ind') # use the 'ind' column as the row index

In [None]:
data.head()

In [None]:
data["title"][1] #label-based lookup: the first row now has label 1 (previous index was 0)

In [None]:
data.title[1] # same lookup, using attribute access for the column

In [None]:
data.loc[1,"director"] # .loc[row label, column label]

In [None]:
data[["date","title"]] # a list of names selects several columns

In [None]:
type(data["title"]) #single brackets -> Series

In [None]:
type(data[["title"]]) #double brackets -> DataFrame

In [None]:
data.loc[1:10,"title":"cast"] # label slices include both end points

In [None]:
data.loc[10:1:-1,"title":"cast"] # same rows in reverse order

In [None]:
data.loc[10:1:-1,"cast":] # from column 'cast' through the last column

In [None]:
data[data["release_year"]>2015] # boolean mask: rows with release_year above 2015

In [None]:
first=data["release_year"]>2015 #first filter: release_year above 2015
second=data["type"]=="TV Show" #second filter: type equals "TV Show"

data[first & second] #& keeps the rows where both filters are True

In [None]:
data.title[data.director=="Mariano Barroso"] # titles of the rows with this director

In [None]:
def inc(n):
    """Return ``n`` increased by one."""
    incremented=n+1
    return incremented
data.show_id.apply(inc) #apply() calls the function on every value of the column

In [None]:
data.show_id.apply(lambda n: n+1) # same operation written as an inline lambda
#the result is a new Series; assign it (data.show_id=...) to keep it

In [None]:
# data["new_feature"]=data.int_duration+data.release_year 
#a new feature can be created from other columns like this

In [None]:
print(data.index.name) # current name of the index

In [None]:
data.index.name="index" # rename the index

In [None]:
data.head()

In [None]:
# Work on a copy and relabel its rows consecutively starting at 31. The end of
# the range is derived from the frame's length instead of a hard-coded value,
# so the assignment cannot fail with a length mismatch when the row count changes.
d=data.copy()
d.index=range(31,31+len(d))
d.head()

In [None]:
d=d.set_index(["type","rating"]) # two-level (hierarchical) index: type, then rating
d.head(10)

In [None]:
d.info()

In [None]:
# NOTE: d1 and d2 held two Series earlier in the notebook; they are rebound to new frames here.
d1=data.copy()  
d1=d1.loc[5:10,["release_year","duration","title"]] # rows labelled 5 through 10, three columns
d1.head()

In [None]:
# rows: duration, columns: release_year, cells: title
# NOTE(review): pivot raises when an (index, columns) pair repeats - confirm the
# selected rows have unique (duration, release_year) combinations
d1.pivot(index="duration",columns="release_year",values="title")

In [None]:
d2=data.copy()
d2=d2.loc[60:145,["listed_in","date","title"]] # rows labelled 60 through 145, three columns
d2

In [None]:
d2 = d2.set_index(["title","listed_in"]) # outer level: title, inner level: listed_in
#d2 = d2.set_index(["title","listed_in"], append=True)
d2

In [None]:
# level selects which index level is moved
d2.unstack(level=1) #moves index level 1 into the columns, so one index level fewer remains

In [None]:
d2=d2.swaplevel(0,1) # exchange the two index levels

In [None]:
d2

In [None]:
data.head()

In [None]:
pd.melt(data,id_vars="title",value_vars=["type","date"]) # long format: one row per (title, variable) pair

In [None]:
data.groupby("type").release_year.mean() 

In [None]:
# Mean of the numeric columns for every distinct int_duration value.
# numeric_only=True is passed explicitly: a mean is undefined for the text
# columns, and recent pandas versions raise instead of silently skipping them.
data.groupby("int_duration").mean(numeric_only=True)
#other aggregations work the same way: sum(),min(),max()..

In [None]:
# smallest release_year and int_duration within each type
data.groupby("type")[["release_year","int_duration"]].min() 