In [370]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [371]:
df = pd.read_csv('Netflix Data new.csv')
df.head()

Unnamed: 0,N_id,Title,Main Genre,Sub Genres,Release Year,Maturity Rating,Original Audio,Recommendations
0,215309,Ace Ventura: Pet Detective,Comedy,"Comedy, Mystery, US",1994.0,A,"Hindi, English [Original]","70184054, 60001650, 70112729, 70027007, 115246..."
1,215318,Ace Ventura: When Nature Calls,Comedy,"Comedy, Action & Adventure, US",1995.0,U/A 16+,"Hindi, English [Original]","70184054, 60001650, 70112729, 70027007, 115246..."
2,217258,The Addams Family,Comedy,"Comedy, US",1991.0,U/A 13+,"English [Original], Hindi, English - Audio Des...","81156676, 81231974, 70027007, 80049939, 702179..."
3,217303,Addams Family Values,Comedy,"Comedy, US",1993.0,U/A 13+,"English [Original], Hindi, English - Audio Des...","81156676, 70044593, 81231974, 70027007, 800500..."
4,235527,Agneepath,Drama,"Hindi-Language, Bollywood, Crime, Drama",1990.0,U/A 16+,Hindi [Original],"17517355, 80158546, 80158395, 80074065, 702042..."


In [372]:
df = df.drop(columns=['Recommendations', 'Original Audio'])

In [373]:
df.head()

Unnamed: 0,N_id,Title,Main Genre,Sub Genres,Release Year,Maturity Rating
0,215309,Ace Ventura: Pet Detective,Comedy,"Comedy, Mystery, US",1994.0,A
1,215318,Ace Ventura: When Nature Calls,Comedy,"Comedy, Action & Adventure, US",1995.0,U/A 16+
2,217258,The Addams Family,Comedy,"Comedy, US",1991.0,U/A 13+
3,217303,Addams Family Values,Comedy,"Comedy, US",1993.0,U/A 13+
4,235527,Agneepath,Drama,"Hindi-Language, Bollywood, Crime, Drama",1990.0,U/A 16+


In [374]:
df.isnull().sum()

N_id               0
Title              0
Main Genre         0
Sub Genres         0
Release Year       1
Maturity Rating    0
dtype: int64

In [375]:
df.shape

(6403, 6)

In [376]:
df = df.dropna(subset=['Release Year'])

In [377]:
df.isnull().sum()

N_id               0
Title              0
Main Genre         0
Sub Genres         0
Release Year       0
Maturity Rating    0
dtype: int64

In [378]:
df.head()

Unnamed: 0,N_id,Title,Main Genre,Sub Genres,Release Year,Maturity Rating
0,215309,Ace Ventura: Pet Detective,Comedy,"Comedy, Mystery, US",1994.0,A
1,215318,Ace Ventura: When Nature Calls,Comedy,"Comedy, Action & Adventure, US",1995.0,U/A 16+
2,217258,The Addams Family,Comedy,"Comedy, US",1991.0,U/A 13+
3,217303,Addams Family Values,Comedy,"Comedy, US",1993.0,U/A 13+
4,235527,Agneepath,Drama,"Hindi-Language, Bollywood, Crime, Drama",1990.0,U/A 16+


In [379]:
df['Maturity Rating'].value_counts()

Maturity Rating
U/A 16+    2053
A          1858
U/A 13+    1557
U/A 7+      685
U           249
Name: count, dtype: int64

In [380]:
df['Maturity Rating'] = df['Maturity Rating'].str.replace('U/A', '')

In [381]:
df['Maturity Rating'].value_counts()

Maturity Rating
 16+    2053
A       1858
 13+    1557
 7+      685
U        249
Name: count, dtype: int64

In [382]:
df['Maturity Rating'] = df['Maturity Rating'].str.replace('+', '')

In [383]:
df['Maturity Rating'].value_counts()

Maturity Rating
 16    2053
A      1858
 13    1557
 7      685
U       249
Name: count, dtype: int64

In [384]:
df['Maturity Rating'] = df['Maturity Rating'].str.replace('A', '18')

In [385]:
df['Maturity Rating'].value_counts()

Maturity Rating
 16    2053
18     1858
 13    1557
 7      685
U       249
Name: count, dtype: int64

In [386]:
df['Maturity Rating'] = df['Maturity Rating'].str.replace('U', '0')

In [387]:
df['Maturity Rating'].value_counts()

Maturity Rating
 16    2053
18     1858
 13    1557
 7      685
0       249
Name: count, dtype: int64

In [388]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6402 entries, 0 to 6402
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   N_id             6402 non-null   int64  
 1   Title            6402 non-null   object 
 2   Main Genre       6402 non-null   object 
 3   Sub Genres       6402 non-null   object 
 4   Release Year     6402 non-null   float64
 5   Maturity Rating  6402 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 350.1+ KB


In [389]:
df['Maturity Rating'] = df['Maturity Rating'].astype(float)

In [390]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6402 entries, 0 to 6402
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   N_id             6402 non-null   int64  
 1   Title            6402 non-null   object 
 2   Main Genre       6402 non-null   object 
 3   Sub Genres       6402 non-null   object 
 4   Release Year     6402 non-null   float64
 5   Maturity Rating  6402 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 350.1+ KB


In [391]:
df.head()

Unnamed: 0,N_id,Title,Main Genre,Sub Genres,Release Year,Maturity Rating
0,215309,Ace Ventura: Pet Detective,Comedy,"Comedy, Mystery, US",1994.0,18.0
1,215318,Ace Ventura: When Nature Calls,Comedy,"Comedy, Action & Adventure, US",1995.0,16.0
2,217258,The Addams Family,Comedy,"Comedy, US",1991.0,13.0
3,217303,Addams Family Values,Comedy,"Comedy, US",1993.0,13.0
4,235527,Agneepath,Drama,"Hindi-Language, Bollywood, Crime, Drama",1990.0,16.0


In [392]:
df['Release Year'] = df['Release Year'].astype(float)

In [393]:
X = df.drop(columns=['Maturity Rating'])
y = df['Maturity Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [394]:
transformer = ColumnTransformer(transformers=[
    ('tnf1', OneHotEncoder(drop='first'),[1, 2, 3] )
])

In [395]:
X_train = transformer.fit_transform(X_train)
X_test = transformer.fit_transform(X_test)

In [396]:
model = LinearRegression()
model.fit(X_train, y_train)

In [397]:
X_test.astype(int)

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 3736 stored elements and shape (1281, 2219)>

In [398]:
X_train.astype(int)

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 14955 stored elements and shape (5121, 7857)>

In [399]:
model.predict(X_test)

ValueError: X has 2219 features, but LinearRegression is expecting 7857 features as input.

In [400]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

df = pd.read_csv('Netflix Data new.csv')
df = df.drop(columns=['Recommendations', 'Original Audio'])
df = df.dropna(subset=['Release Year'])

# Ordinal Encoding
ordinal_encoder = OrdinalEncoder(categories=[['U', 'U/A 7+', 'U/A 13+', 'U/A 16+', 'A']])
df['Maturity Rating Ordinal'] = ordinal_encoder.fit_transform(df[['Maturity Rating']])

# One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_genres = onehot_encoder.fit_transform(df[['Main Genre']])
genre_columns = onehot_encoder.get_feature_names_out(['Main Genre'])
df[genre_columns] = encoded_genres

# Prepare data for Linear Regression
X = df.drop(['N_id', 'Title', 'Main Genre', 'Sub Genres', 'Maturity Rating'], axis=1) # Drop non-numerical and original categorical columns
y = df['Release Year']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions


print(X_train.shape)
print(X_test.shape)

(5121, 22)
(1281, 22)


In [401]:
y_pred = model.predict(X_test)[5]
print(y_pred)

2023.0


In [403]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv('Netflix Data new.csv')
df = df.drop(columns=['Recommendations', 'Original Audio'])
df = df.dropna(subset=['Release Year'])

# Ordinal Encoding
ordinal_encoder = OrdinalEncoder(categories=[['U', 'U/A 7+', 'U/A 13+', 'U/A 16+', 'A']])
df['Maturity Rating Ordinal'] = ordinal_encoder.fit_transform(df[['Maturity Rating']])

# One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_genres = onehot_encoder.fit_transform(df[['Main Genre']])
genre_columns = onehot_encoder.get_feature_names_out(['Main Genre'])
df[genre_columns] = encoded_genres

# Prepare data for Linear Regression
X = df.drop(['N_id', 'Title', 'Main Genre', 'Sub Genres', 'Maturity Rating'], axis=1)
y = df['Release Year']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test) # Corrected line: ensure model.predict() output is assigned to y_pred

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 7.54697377768424e-27
R-squared: 1.0


In [405]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv('Netflix Data new.csv')
df = df.drop(columns=['Recommendations', 'Original Audio'])
df = df.dropna(subset=['Release Year'])

# Ordinal Encoding
ordinal_encoder = OrdinalEncoder(categories=[['U', 'U/A 7+', 'U/A 13+', 'U/A 16+', 'A']])
df['Maturity Rating Ordinal'] = ordinal_encoder.fit_transform(df[['Maturity Rating']])

# One-Hot Encoding
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_genres = onehot_encoder.fit_transform(df[['Main Genre']])
genre_columns = onehot_encoder.get_feature_names_out(['Main Genre'])
df[genre_columns] = encoded_genres

# Prepare data for Linear Regression
X = df.drop(['N_id', 'Title', 'Main Genre', 'Sub Genres', 'Maturity Rating'], axis=1)
y = df['Release Year']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred1 = model.predict(X_test) # Corrected line: ensure model.predict() output is assigned to y_pred

# Evaluate the model
mse = mean_squared_error(y_test, y_pred1)
r2 = r2_score(y_test, y_pred1)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 52.11007025761124
R-squared: -0.24602491519628678


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [406]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 52.11007025761124
R-squared: -0.24602491519628678
