In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import klib
from datacleaner import autoclean
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")



In [2]:
df = pd.read_csv(r"C:\Users\Saidabrorkhon\ML_Lectures\netflix_titles.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [5]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [12]:
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [3]:
missing_percentage = df.isnull().mean() * 100
missing_percentage

show_id          0.000000
type             0.000000
title            0.000000
director        29.908028
cast             9.367549
country          9.435676
date_added       0.113546
release_year     0.000000
rating           0.045418
duration         0.034064
listed_in        0.000000
description      0.000000
dtype: float64

In [4]:
df.drop(columns=['show_id'], inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8807 non-null   object
 1   title         8807 non-null   object
 2   director      6173 non-null   object
 3   cast          7982 non-null   object
 4   country       7976 non-null   object
 5   date_added    8797 non-null   object
 6   release_year  8807 non-null   int64 
 7   rating        8803 non-null   object
 8   duration      8804 non-null   object
 9   listed_in     8807 non-null   object
 10  description   8807 non-null   object
dtypes: int64(1), object(10)
memory usage: 757.0+ KB


In [12]:
df.head(1)

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


In [5]:
df['date_added'] = df['date_added'].astype(str)
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

In [6]:
df['year'] = df['date_added'].dt.year
df['month'] = df['date_added'].dt.month 
df['day'] = df['date_added'].dt.day

In [7]:
df.drop(columns=['date_added'], inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   type          8807 non-null   object 
 1   title         8807 non-null   object 
 2   director      6173 non-null   object 
 3   cast          7982 non-null   object 
 4   country       7976 non-null   object 
 5   release_year  8807 non-null   int64  
 6   rating        8803 non-null   object 
 7   duration      8804 non-null   object 
 8   listed_in     8807 non-null   object 
 9   description   8807 non-null   object 
 10  year          8709 non-null   float64
 11  month         8709 non-null   float64
 12  day           8709 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 894.6+ KB


In [24]:
df['rating'].value_counts().nunique

<bound method IndexOpsMixin.nunique of rating
TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: count, dtype: int64>

In [8]:
for col in df.columns:
  if df[col].dtype =='object':
    df[col].fillna(df[col].mode()[0], inplace=True)
  else:
      df[col].fillna(df[col].median(), inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   type          8807 non-null   object 
 1   title         8807 non-null   object 
 2   director      8807 non-null   object 
 3   cast          8807 non-null   object 
 4   country       8807 non-null   object 
 5   release_year  8807 non-null   int64  
 6   rating        8807 non-null   object 
 7   duration      8807 non-null   object 
 8   listed_in     8807 non-null   object 
 9   description   8807 non-null   object 
 10  year          8807 non-null   float64
 11  month         8807 non-null   float64
 12  day           8807 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 894.6+ KB


In [9]:
df['type'] = df['type'].map({'TV Show': 0, 'Movie': 1})

In [11]:
for col in df.columns:
  if df[col].dtype == 'object':
    cardinality = df[col].nunique()
    if cardinality >= 4:
      le = LabelEncoder()
      df[col] = le.fit_transform(df[col])
    else:
      df = pd.get_dummies(df, columns=[col], dtype=int, drop_first=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   type          8807 non-null   int64  
 1   title         8807 non-null   int64  
 2   director      8807 non-null   int64  
 3   cast          8807 non-null   int64  
 4   country       8807 non-null   int64  
 5   release_year  8807 non-null   int64  
 6   rating        8807 non-null   int64  
 7   duration      8807 non-null   int64  
 8   listed_in     8807 non-null   int64  
 9   description   8807 non-null   int64  
 10  year          8807 non-null   float64
 11  month         8807 non-null   float64
 12  day           8807 non-null   float64
dtypes: float64(3), int64(10)
memory usage: 894.6 KB


In [13]:
df.head()

Unnamed: 0,type,title,director,cast,country,release_year,rating,duration,listed_in,description,year,month,day
0,1,1975,2295,1699,603,2020,7,210,274,2577,2021.0,9.0,25.0
1,0,1091,3392,409,426,2021,11,110,414,1762,2021.0,9.0,24.0
2,0,2651,2105,6296,603,2021,11,0,242,7341,2021.0,9.0,24.0
3,0,3506,3392,1699,603,2021,11,0,297,3617,2021.0,9.0,24.0
4,0,3861,3392,4815,251,2021,11,110,393,4416,2021.0,9.0,24.0


In [None]:
x = df.drop(columns=['type'])
y = df['type']

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)
df = pd.concat([x_scaled, y.reset_index(drop=True)], axis=1)
df.head()

Unnamed: 0,title,director,cast,country,release_year,rating,duration,listed_in,description,year,month,day,type
0,-0.955017,-0.285812,-0.882054,0.834435,0.65993,-1.540357,1.319459,0.010344,-0.714182,1.353813,0.686576,1.270434,1
1,-1.302726,0.628071,-1.466946,-0.091648,0.773324,0.503089,0.185089,1.075245,-1.035989,1.353813,0.686576,1.168663,0
2,-0.689123,-0.444096,1.202247,0.834435,0.773324,0.503089,-1.062718,-0.233062,1.166905,1.353813,0.686576,1.168663,0
3,-0.352822,0.628071,-0.882054,0.834435,0.773324,0.503089,-1.062718,0.185292,-0.303533,1.353813,0.686576,1.168663,0
4,-0.213188,0.628071,0.530755,-1.007266,0.773324,0.503089,0.185089,0.91551,0.011956,1.353813,0.686576,1.168663,0


In [16]:
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)


In [20]:
x_train.shape, x_test.shape, x_val.shape 

((7045, 12), (881, 12), (881, 12))

In [21]:
y_train.shape, y_test.shape, y_val.shape


((7045,), (881,), (881,))

In [22]:
model = LogisticRegression()
lr_model = model.fit(x_train, y_train)

In [23]:
y_pred = lr_model.predict(x_test)

In [28]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_score = cross_val_score(lr_model, x_train, y_train, cv=kf, scoring='neg_mean_squared_error')
cv_score = np.sqrt(-cv_score)
cv_score

array([0.44017546, 0.45756786, 0.4552353 , 0.44418808, 0.43693883])

In [30]:
c_report = classification_report(y_test, y_pred)
print(c_report)

              precision    recall  f1-score   support

           0       0.76      0.67      0.72       282
           1       0.85      0.90      0.88       599

    accuracy                           0.83       881
   macro avg       0.81      0.79      0.80       881
weighted avg       0.83      0.83      0.83       881

