In [9]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

# Load and Explore Dataset
# Read the channel analytics CSV into a DataFrame.
file_path = "/content/youtube_channel_real_performance_analytics.csv"
data = pd.read_csv(file_path)

# Schema overview: column names, non-null counts and dtypes.
print("\n--- Dataset Info ---")
data.info()

# Preview of the leading records.
print("\nFirst 5 rows:")
print(data.head(5))

# Count of missing cells in each column.
print("\nMissing values per column:")
missing_per_column = data.isna().sum()
print(missing_per_column)


--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 70 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  364 non-null    int64  
 1   Video Duration                      364 non-null    float64
 2   Video Publish Time                  364 non-null    object 
 3   Days Since Publish                  364 non-null    int64  
 4   Day                                 364 non-null    int64  
 5   Month                               364 non-null    int64  
 6   Year                                364 non-null    int64  
 7   Day of Week                         364 non-null    object 
 8   Revenue per 1000 Views (USD)        364 non-null    float64
 9   Monetized Playbacks (Estimate)      364 non-null    float64
 10  Playback-Based CPM (USD)            364 non-null    float64
 11  CPM (USD)              

In [10]:
# Data Cleaning
# Keep only rows with no missing value in any column (simplest strategy; adjust if needed).
data = data.dropna()
print(f"\nAfter cleaning, dataset shape: {data.shape}")

# Feature Engineering
# Both ratios divide by 'Views', so rows without a positive view count are set to 0.
has_views = data['Views'] > 0

# Estimated revenue earned per individual view.
revenue_per_view = data['Estimated Revenue (USD)'] / data['Views']
data['Revenue per View'] = np.where(has_views, revenue_per_view, 0)

# Likes + shares + new comments, expressed as a percentage of views.
interactions = data['Likes'] + data['Shares'] + data['New Comments']
engagement_pct = (interactions / data['Views']) * 100
data['Engagement Rate'] = np.where(has_views, engagement_pct, 0)

print("\nFeature engineering complete: added 'Revenue per View' and 'Engagement Rate'.")


After cleaning, dataset shape: (364, 70)

Feature engineering complete: added 'Revenue per View' and 'Engagement Rate'.


In [11]:
# Exploratory Data Analysis (EDA)
# --- Correlation heatmap ---
# Columns whose pairwise correlations are shown.
corr_features = [
    'Estimated Revenue (USD)',
    'Views',
    'Subscribers',
    'Revenue per 1000 Views (USD)',
    'Likes',
    'Engagement Rate',
]
corr_matrix = data[corr_features].corr()

# The colour range is pinned to [-1, 1], the full span a correlation can take.
heatmap_trace = go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale="RdBu",
    zmin=-1,
    zmax=1,
    colorbar={"title": "Correlation"},
)
fig_corr = go.Figure(data=heatmap_trace)
fig_corr.update_layout(title="Correlation Heatmap of Key Metrics", height=600)
fig_corr.show()

# --- Top 10 videos by revenue ---
# Sort by estimated revenue, highest first, and keep the first ten rows.
top_videos = data.sort_values(by="Estimated Revenue (USD)", ascending=False).head(10)
top_video_columns = ['ID', 'Estimated Revenue (USD)', 'Views', 'Subscribers']
print("\n--- Top 10 Videos by Revenue ---")
print(top_videos[top_video_columns])



--- Top 10 Videos by Revenue ---
      ID  Estimated Revenue (USD)     Views  Subscribers
228  228                  103.117  670990.0       3538.0
257  257                   83.979  241060.0       1125.0
251  251                   80.265  343319.0       1437.0
289  289                   70.247   99196.0        350.0
278  278                   65.978  188324.0       1824.0
260  260                   62.047  302999.0        866.0
293  293                   59.058  101025.0        602.0
294  294                   55.040   67556.0        581.0
290  290                   50.344   89284.0        995.0
284  284                   44.228   93487.0        305.0


In [12]:
# Data Visualizations
# --- Revenue distribution ---
# Histogram of estimated revenue using 50 bins.
hist_labels = {"Estimated Revenue (USD)": "Revenue (USD)"}
fig_hist = px.histogram(
    data,
    x="Estimated Revenue (USD)",
    nbins=50,
    title="Distribution of Estimated Revenue",
    labels=hist_labels,
    color_discrete_sequence=["green"],
)
fig_hist.show()

# --- Revenue vs views ---
# Scatter of estimated revenue against view count, drawn with semi-transparent markers.
scatter_labels = {
    "Estimated Revenue (USD)": "Revenue (USD)",
    "Views": "Video Views",
}
fig_scatter = px.scatter(
    data,
    x="Views",
    y="Estimated Revenue (USD)",
    title="Revenue vs Views",
    labels=scatter_labels,
    opacity=0.6,
)
fig_scatter.show()

In [13]:
# Predictive Model: Estimate Revenue
# Predictor columns passed to the regressor.
features = [
    'Video Duration',
    'Views',
    'Subscribers',
    'Likes',
    'Shares',
    'New Comments',
    'Engagement Rate',
    'Average View Duration',
    'Impressions',
    'Video Thumbnail CTR (%)',
]
target = "Estimated Revenue (USD)"

X, y = data[features], data[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 200 trees and a fixed seed; n_jobs=-1 asks for all available cores.
forest_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}
model = RandomForestRegressor(**forest_params).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

In [14]:
# Model Evaluation
# Score the held-out predictions with three regression metrics in one pass.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the errors to two decimals and R² to four.
print("\n--- Model Performance ---")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.4f}")


--- Model Performance ---
Mean Squared Error: 76.25
Mean Absolute Error: 5.45
R² Score: 0.1242


In [15]:
# Feature Importance
# Pair each predictor with the fitted model's importance score, largest first.
importances = model.feature_importances_
feat_imp_df = (
    pd.DataFrame({'Feature': features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
top_importances = feat_imp_df.head(10)

# Horizontal bar chart of the ten largest importances, shaded by value.
fig_imp = px.bar(
    top_importances,
    x="Importance",
    y="Feature",
    orientation="h",
    title="Top 10 Feature Importances for Revenue Prediction",
    color="Importance",
    color_continuous_scale="Blues",
)
# Reverse the y-axis so the first (highest-importance) row is drawn at the top.
fig_imp.update_yaxes(autorange="reversed")
fig_imp.show()

print("\n--- Feature Importance Data ---")
print(top_importances)



--- Feature Importance Data ---
                   Feature  Importance
2              Subscribers    0.212881
3                    Likes    0.112813
6          Engagement Rate    0.110101
8              Impressions    0.106751
7    Average View Duration    0.103869
9  Video Thumbnail CTR (%)    0.097925
5             New Comments    0.083434
1                    Views    0.071889
0           Video Duration    0.057814
4                   Shares    0.042523


In [16]:
# Deployment (Save Model)
# Persist the fitted model with joblib; the relative filename resolves against the working directory.
joblib.dump(value=model, filename="youtube_revenue_predictor.pkl")
print("\nModel saved as 'youtube_revenue_predictor.pkl'")


Model saved as 'youtube_revenue_predictor.pkl'
