In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [41]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [43]:
df=pd.read_csv(r'C:\Users\admin\Documents\youtube_project\dataset\preprocessed.csv')

In [44]:
df.head()

Unnamed: 0.1,Unnamed: 0,views,likes,comments,watch_time_minutes,subscribers,category,device,country,ad_revenue_usd
0,0,9936,1221,320,26497.21,228086,Entertainment,TV,IN,203.178237
1,1,10017,642,346,15209.75,736015,Gaming,Tablet,CA,140.880508
2,2,10097,1979,187,57332.66,240534,Education,TV,CA,360.134008
3,3,10034,1191,242,31334.52,434482,Entertainment,Mobile,UK,224.638261
4,4,9889,1858,477,15665.67,42030,Education,Mobile,CA,165.514388


In [45]:
df=df.drop('Unnamed: 0',axis=1)

In [46]:
df.head()

Unnamed: 0,views,likes,comments,watch_time_minutes,subscribers,category,device,country,ad_revenue_usd
0,9936,1221,320,26497.21,228086,Entertainment,TV,IN,203.178237
1,10017,642,346,15209.75,736015,Gaming,Tablet,CA,140.880508
2,10097,1979,187,57332.66,240534,Education,TV,CA,360.134008
3,10034,1191,242,31334.52,434482,Entertainment,Mobile,UK,224.638261
4,9889,1858,477,15665.67,42030,Education,Mobile,CA,165.514388


In [47]:
df["ad_revenue_usd"] = df["ad_revenue_usd"].round(2)

In [48]:
X=df.drop('ad_revenue_usd',axis=1)
Y=df['ad_revenue_usd']

In [49]:
numarical_X=X.select_dtypes(exclude='object')
categorical_X=X.select_dtypes(include='object')

In [51]:
numarical_X

Unnamed: 0,views,likes,comments,watch_time_minutes,subscribers
0,9936,1221,320,26497.21,228086
1,10017,642,346,15209.75,736015
2,10097,1979,187,57332.66,240534
3,10034,1191,242,31334.52,434482
4,9889,1858,477,15665.67,42030
...,...,...,...,...,...
119995,9853,1673,147,42075.70,210818
119996,10128,1709,63,57563.70,878860
119997,10267,700,274,27549.71,576756
119998,10240,1616,106,56967.38,585138


In [50]:
categorical_X

Unnamed: 0,category,device,country
0,Entertainment,TV,IN
1,Gaming,Tablet,CA
2,Education,TV,CA
3,Entertainment,Mobile,UK
4,Education,Mobile,CA
...,...,...,...
119995,Education,Tablet,US
119996,Music,Desktop,UK
119997,Tech,Tablet,CA
119998,Music,Mobile,UK


In [52]:
cat_rank=[]
for i in categorical_X.columns:
    cat_rank.append(list(df.groupby(i)['ad_revenue_usd'].mean().sort_values(ascending=False).index))
cat_rank

[['Tech', 'Gaming', 'Education', 'Music', 'Entertainment', 'Lifestyle'],
 ['Mobile', 'Tablet', 'TV', 'Desktop'],
 ['US', 'CA', 'DE', 'UK', 'IN', 'AU']]

In [53]:
numerical_transformer=Pipeline([('imputer',SimpleImputer(strategy='median')),('scaler',StandardScaler())])
categorical_transformer=Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),
                                 ('ordinal',OrdinalEncoder(categories=cat_rank))])

In [62]:
preprocessor=ColumnTransformer([('num_cont',numerical_transformer,numarical_X.columns),
                                 ('cat',categorical_transformer,categorical_X.columns)])
preprocessor

0,1,2
,transformers,"[('num_cont', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Tech', 'Gaming', ...], ['Mobile', 'Tablet', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [63]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((96000, 8), (24000, 8), (96000,), (24000,))

In [64]:
Lr_pipeline=Pipeline(steps=[('preprocessor',preprocessor),
                            ('regressor',LinearRegression())])
Lr_pipeline

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_cont', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Tech', 'Gaming', ...], ['Mobile', 'Tablet', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [65]:
Lr_pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_cont', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Tech', 'Gaming', ...], ['Mobile', 'Tablet', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


##Linear Regreassion

In [66]:
y_train_pred=Lr_pipeline.predict(X_train)
y_test_pred=Lr_pipeline.predict(X_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9497047497620046
R2 score for test data 0.9525814749506836
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 3.2339840791837315
MAE for test data 3.081677897626452


##SCR regression

In [68]:
SVM_pipeline=Pipeline(steps=[('preprocessor',preprocessor),
                             ('regressor',SVR(kernel='linear'))])
SVM_pipeline.fit(X_train,y_train)





0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_cont', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Tech', 'Gaming', ...], ['Mobile', 'Tablet', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [69]:
y_train_pred=SVM_pipeline.predict(X_train)
y_test_pred=SVM_pipeline.predict(X_test)

In [70]:
print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9497000155002353
R2 score for test data 0.9525942059540804
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 3.1448421738157872
MAE for test data 2.9910499338183714


In [71]:
DT_pipeline=Pipeline(steps=[('preprocessor',preprocessor),
                            ('regressor',DecisionTreeRegressor(max_depth=7,min_samples_split=10,min_samples_leaf=5))])
DT_pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_cont', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Tech', 'Gaming', ...], ['Mobile', 'Tablet', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,7
,min_samples_split,10
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [72]:
y_train_pred=DT_pipeline.predict(X_train)
y_test_pred=DT_pipeline.predict(X_test)

In [73]:
print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9465069315748803
R2 score for test data 0.9485825511464501
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 5.832555531432713
MAE for test data 5.752003352709912


In [74]:
RF_pipeline=Pipeline(steps=[('preprocessor',preprocessor),
                            ('regressor',RandomForestRegressor(n_estimators=10, max_depth=7, random_state=42))])
RF_pipeline.fit(X_train,y_train)



0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num_cont', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Tech', 'Gaming', ...], ['Mobile', 'Tablet', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,7
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [75]:
y_train_pred=RF_pipeline.predict(X_train)
y_test_pred=RF_pipeline.predict(X_test)



In [76]:
print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9479336724284011
R2 score for test data 0.9501777311168451
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 5.254262275786208
MAE for test data 5.152965032793174


In [77]:
XGB_pipeline=Pipeline(steps=[('preprocessor',preprocessor),
                            ('regressor',XGBRegressor(n_estimators=600,max_depth=4,min_child_weight=200,max_leaves=10))])
XGB_pipeline.fit(X_train,y_train)

y_train_pred=XGB_pipeline.predict(X_train)
y_test_pred=XGB_pipeline.predict(X_test)

print('Evaluation:')
print('R2 score for train data',r2_score(y_train,y_train_pred))
print('R2 score for test data',r2_score(y_test,y_test_pred))
print('---'*50)
print('MAE for train data',mean_absolute_error(y_train,y_train_pred))
print('MAE for test data',mean_absolute_error(y_test,y_test_pred))

Evaluation:
R2 score for train data 0.9530196509050148
R2 score for test data 0.9510850985171152
------------------------------------------------------------------------------------------------------------------------------------------------------
MAE for train data 4.231513343763352
MAE for test data 4.29511617322286


In [67]:
with open(r'C:\Users\admin\Documents\youtube_project\youtube_notebook\youtube_revenue_model.pkl','wb') as f:
    pickle.dump(Lr_pipeline,f)