## **Analysis Netflix Dataset**

### Exploring the Netflix Originals dataset by posing questions and answering them with visualizations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw CSV. The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
data = pd.read_csv("../input/netflixoriginals/NetflixOriginals.csv", encoding="ISO-8859-1")
# Work on a copy so the frame exactly as loaded stays available under `data`.
df = data.copy()
# Show only the dimensions here; the first rows are previewed in the following cell.
df.shape

In [None]:
# Preview the first five rows (five is also head()'s default).
df.head(n=5)

In [None]:
# Descriptive statistics as computed by DataFrame.describe().
summary_stats = df.describe()
summary_stats

In [None]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Single-column frames picked by column name; the double brackets keep each result a
# DataFrame rather than a Series.
runtime = df[["Runtime"]]
imdb = df[["IMDB Score"]]
premiere = df[["Premiere"]]

### Distributions

In [None]:
# Histogram of film runtimes with a kernel-density curve drawn on top.
# histplot replaces the deprecated sns.distplot. Its default statistic is the raw count per
# bin, so the "Count" y-label below now matches what is actually plotted.
sns.set(style="whitegrid")
plt.figure(figsize=(8, 5))
sns.histplot(data=df, x="Runtime", kde=True)
plt.title("Distribution of Runtime", fontsize=20)
plt.xlabel("Runtime", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.show()

In [None]:
# Number of titles at each distinct IMDB score.
# The column is passed by keyword: recent seaborn versions treat the first positional
# argument as `data`, not `x`, so a positional Series no longer produces one bar per score.
plt.figure(figsize=(20, 10))
sns.countplot(data=df, x="IMDB Score")
plt.title("Count of IMDB Score", fontsize=20)
plt.xlabel("IMDB Score", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.show()

### 1-) Veri setine göre uzun soluklu filmler hangi dilde oluşturulmuştur? Görselleştirme yapınız.
### (In which languages were the long-running films in the dataset made? Visualize the result.)
 

In [None]:
# Titles whose runtime exceeds the "long-running" cut-off, reduced to runtime and language.
long_runtime_threshold = 40
is_long = df["Runtime"] > long_runtime_threshold
first_q = df.loc[is_long, ["Runtime", "Language"]]

# Display the selection with the longest titles first (first_q itself stays unsorted).
first_q.sort_values("Runtime", ascending=False).reset_index(drop=True)

In [None]:
# Visualization: one horizontal bar per language (seaborn aggregates the runtimes of each
# language with its default estimator, the mean).
fig, ax = plt.subplots(figsize=(17, 25))
sns.barplot(data=first_q, x="Runtime", y="Language", ax=ax)
ax.set_title("Languages for Long Runtime", fontsize=20)
ax.set_xlabel("Runtime", fontsize=20)
ax.set_ylabel("Language", fontsize=20)
plt.show()

### 2 -) 2019 Ocak ile 2020 Haziran tarihleri arasında 'Documentary' türünde çekilmiş filmlerin IMDB değerlerini bulup görselleştiriniz.  
### (Find and visualize the IMDB values of the movies shot in the 'Documentary' genre between January 2019 and June 2020.)

In [None]:
# Convert the Premiere column to datetime values so it can be filtered by date range.
premiere_dates = pd.to_datetime(df["Premiere"])
df["Premiere"] = premiere_dates
df["Premiere"]

In [None]:
# Documentaries that premiered from January 2019 through June 2020.
# The window is half-open, [period_start, period_end): the lower bound is inclusive so that
# 1 January 2019 counts, and the upper bound is the first day of July so that every day of
# June 2020 is kept. (A bound of '2020-06' is read as 1 June and would drop the whole month.)
period_start = pd.Timestamp("2019-01-01")
period_end = pd.Timestamp("2020-07-01")
second_q = df[
    (df["Genre"] == "Documentary")
    & (df["Premiere"] >= period_start)
    & (df["Premiere"] < period_end)
]
second_q

In [None]:
# Keep only the score and the title of each selected documentary, best-rated first.
q2 = second_q[["IMDB Score", "Title"]]

ranked_q2 = q2.sort_values("IMDB Score", ascending=False)
new_q2 = ranked_q2.reset_index(drop=True)
new_q2

In [None]:
# Visualization: IMDB score of every selected documentary, one bar per title.
fig, ax = plt.subplots(figsize=(17, 25))
sns.barplot(data=new_q2, x="IMDB Score", y="Title", ax=ax)
ax.set_title("IMDB Score", fontsize=20)
ax.set_xlabel("IMDB Score", fontsize=20)
ax.set_xticks(np.arange(1, 10, 1))  # integer ticks 1..9 on the score axis
ax.set_ylabel("Title", fontsize=20)
plt.show()

### 3-) İngilizce çekilen filmler içerisinde hangi tür en yüksek IMDB puanına sahiptir?
### (Which genre has the highest IMDB rating among movies shot in English?)

In [None]:
# English-language titles only, ordered from the highest IMDB score downwards.
is_english = df["Language"] == "English"
e_q3 = df[is_english]

# Genre plus score (the language column is carried along as a visual check of the filter).
cnct_q3 = e_q3[["Genre", "IMDB Score", "Language"]]
q3 = cnct_q3.sort_values("IMDB Score", ascending=False).reset_index(drop=True)
q3

In [None]:
# First row of the ranking: the best-scoring English title together with its genre.
q3.iloc[0]
# -> Documentary

### 4-) 'Hindi' Dilinde çekilmiş olan filmlerin ortalama 'runtime' suresi nedir?
### (What is the average 'runtime' of movies shot in 'Hindi'?)

In [None]:
# Mean runtime of the Hindi-language titles.
# No sorting is done first: a mean does not depend on the order of the rows.
hindi_q4 = df[df["Language"] == "Hindi"]
q4 = hindi_q4[["Runtime"]]
q4.mean()
# Sanity check:
# q4.describe()

### 5-) Veri setinde bulunan filmlerde en çok kullanılan 3 dili bulunuz.
### (Find the 3 most used languages in the movies in the data set.)

In [None]:
# Count titles per language (value_counts sorts by frequency) and keep the first three.
language_counts = df["Language"].value_counts()
language_counts.iloc[:3]

### 6-) IMDB puanı en yüksek olan ilk 10 film hangileridir?
### (What are the top 10 movies with the highest IMDB rating?)

In [None]:
# Ten best-rated titles: keep title and score, sort by score, take the first ten rows.
cnct_q7 = df[["Title", "IMDB Score"]]
cnct_q7.sort_values("IMDB Score", ascending=False).head(10).reset_index(drop=True)

### 7-) IMDB puanı ile 'Runtime' arasında nasıl bir korelasyon vardır? İnceleyip görselleştiriniz.
### (What is the correlation between IMDB score and 'Runtime'? Examine and visualize.)

In [None]:
# Relationship between IMDB score and runtime.
# The subset gets its own name so it does not overwrite `data`, the raw frame loaded at the
# top of the notebook. A scatter plot is used because every row is an independent title; a
# line plot would connect the points in row order and draw a meaningless zig-zag.
runtime_vs_imdb = df[["Runtime", "IMDB Score"]]
runtime_vs_imdb.plot.scatter(x="IMDB Score", y="Runtime")

# Pearson correlation matrix of the two columns.
correlation = runtime_vs_imdb.corr(method="pearson")
correlation

### 8-) IMDB Puanı en yüksek olan ilk 10 'Genre' hangileridir? Görselleştiriniz.
### (Which 10 genres have the highest IMDB scores? Visualize them.)

In [None]:
# Rank genres rather than individual titles: take the best IMDB score reached within each
# genre, then keep the ten genres with the highest values. Sorting the raw rows instead
# yields the ten best titles, whose genres can repeat, so fewer than ten genres come back.
genre_best = df.groupby("Genre", as_index=False)["IMDB Score"].max()
q9 = genre_best.sort_values("IMDB Score", ascending=False).reset_index(drop=True).head(10)
q9

In [None]:
# Visualization: IMDB score per genre for the rows collected in q9.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=q9, x="IMDB Score", y="Genre", ax=ax)
ax.set_title("Top 10 IMDB Score of Genre", fontsize=20)
ax.set_xlabel("IMDB Score", fontsize=15)
ax.set_ylabel("Genre", fontsize=15)
ax.set_xticks(np.arange(1, 11, 1))  # integer ticks 1..10 on the score axis
plt.show()

### 9-) 'Runtime' değeri en yüksek olan ilk 10 film hangileridir? Görselleştiriniz.
### (What are the top 10 movies with the highest 'runtime'? Visualize it.)

In [None]:
# Select title and runtime by name.
cnct_q10 = df[["Title", "Runtime"]]

# Longest first, fresh 0..n index, then keep the first ten rows.
runtime_ranked = cnct_q10.sort_values("Runtime", ascending=False)
q10 = runtime_ranked.reset_index(drop=True).head(10)
q10


In [None]:
# Visualization: runtime of the ten longest titles, one bar per title.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=q10, x="Runtime", y="Title", ax=ax)
ax.set_title("Top 10 Long Runtime Movies", fontsize=20)
ax.set_xlabel("Runtime", fontsize=15)
ax.set_ylabel("Title", fontsize=15)
plt.show()

### 10-) Hangi yılda en fazla film yayımlanmıştır? Görselleştiriniz
### (In which year were the most movies released? Visualize it.)

In [None]:
# Number of titles premiered per year, busiest year first.
# The years are taken from the data instead of a hard-coded 2014-2021 range. The table is
# built with rename_axis/reset_index, so it does not depend on the name value_counts() gives
# its result; that name differs between pandas versions, which makes a lookup by the
# original column name fail on pandas 2.x.
premiere_year = pd.to_datetime(df["Premiere"]).dt.year

q11 = (
    premiere_year.value_counts()       # counts per year, already sorted descending
    .rename_axis("Years")              # name the index so it becomes the "Years" column
    .reset_index(name="Count")
)
q11

In [None]:
# Visualization: number of premiered titles for each year.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=q11, x="Years", y="Count", ax=ax)
ax.set_title("Most Premiere Movie on Year", fontsize=20)
ax.set_xlabel("Years", fontsize=15)
ax.set_ylabel("Count of Movie", fontsize=15)
plt.show()