In this notebook which look at which features had the largest impact in our linear model. Here, we re-define our features slightly to be all categorical. We then run t-tests (on the training data only) to see if those features have significantly higher views or significantly higher engagement. 

As a final conclusion: Videos that contain keywords from the "korean" keyword group seem to have significantly more views.
Videos that mention a popular brand or contain at least one hashtag decrease engagement. 
Videos that contain keywords from the "comparing_products" or "products" keyword group decrease engagement, whereas videos with keywords from the "self_ref" keyword group increase engagement

In [73]:
import pandas as pd

df = pd.read_csv('../data/new/no_early_dates_all_features_train.csv')


In [74]:
df.columns

Index(['Unnamed: 0', 'commentsCount', 'isChannelVerified', 'likes',
       'numberOfSubscribers', 'text', 'title', 'viewCount',
       'views_per_subscriber', 'duration_in_seconds', 'date',
       'hashtag_indicator', 'has_any_affiliate', 'hasAdinTitle', 'hasAdinText',
       'Engagement_per_Subscriber', 'Engagement_per_View', 'popular_brand',
       'prime_hour', 'product', 'skills/teach', 'speed', 'comparing_products',
       'self_ref', 'budget', 'korean'],
      dtype='object')

In [75]:
import pandas as pd
import numpy as np


#Changing hashtags to a categorical variable 
df["hashtag_indicator"] = 1 * df["hashtag_indicator"].astype(bool)

features = ["popular_brand", "has_any_affiliate", "product", "budget", "self_ref", "korean", "speed", "skills/teach", "comparing_products", "prime_hour", "hasAdinTitle", "hasAdinText",'hashtag_indicator']

#Create the target column $y$ here
df["y"] = (df["likes"] + df["commentsCount"])  / (df["viewCount"] + 1)
df["y2"] = (df["viewCount"]) / ( df["numberOfSubscribers"] + 1) 

#get rid of noisy columns
df = df[ features + ["y"] + ["y2"] ]

In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Split data into exploration set and confirmation
#df_explore, df_confirm = train_test_split(df, test_size=0.5, random_state=42)

#print(f"Exploration set: {df_explore.shape[0]} rows")
#print(f"Confirmation set: {df_confirm.shape[0]} rows")

#Don't actually need to do a train-test split since we imported the training data to begin with
df_explore = df #this allows the rest of the code to continue to run 
df_confirm = df #All tests and exploration are going to take place on the training data we imported; final tests will be based on EDA as well as linear

In [77]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

scaler = StandardScaler()
X_explore_scaled = scaler.fit_transform(df_explore[features])

pipe_linear = Pipeline([
    ("interaction_terms", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("lasso", Lasso(alpha=0.0001, max_iter=10000))
])
pipe_linear.fit(X_explore_scaled, df_explore["y"])

linear_pred = pipe_linear.predict(X_explore_scaled)
linear_mse = mean_squared_error(df_explore["y"], linear_pred)


In [78]:
# Extract coefficients
transformed_feature_names = pipe_linear.named_steps['interaction_terms'].get_feature_names_out(features)
lasso_coeffs = pd.Series(
    pipe_linear.named_steps['lasso'].coef_,
    index=transformed_feature_names
)

# Get non-zero coefficients
sig_lasso_coeffs = lasso_coeffs[lasso_coeffs != 0]
important_features = sig_lasso_coeffs.sort_values(key=abs)

print("\nAll non-zero features sorted by coefficient magnitude:")
print(important_features)

print("\nTop 10 most important features:")
top_features = important_features.tail(10)
print(top_features)

# create hypotheses
hypotheses = []
for feature, coef in top_features.items():
    expected_direction = "positive" if coef > 0 else "negative"
    impact = "increase" if coef > 0 else "decrease"

    if " " in feature:
        # For interaction terms
        parts = feature.split(" ")
        hypothesis_text = f"Videos that combine '{parts[0]}' and '{parts[1]}' {impact} engagement"
    else:
        # For single features
        hypothesis_text = f"Videos with '{feature}' {impact} engagement"

    hypotheses.append({
        "feature": feature,
        "coefficient": coef,
        "expected_direction": expected_direction,
        "hypothesis": hypothesis_text
    })

print("\nFormulated hypotheses:")
for i, hyp in enumerate(hypotheses, 1):
    print(f"{i}. {hyp['hypothesis']} (coef={hyp['coefficient']:.6f})")




All non-zero features sorted by coefficient magnitude:
speed skills/teach                      0.000011
comparing_products hashtag_indicator    0.000022
product speed                           0.000029
budget skills/teach                    -0.000030
korean hasAdinTitle                    -0.000042
                                          ...   
korean speed                            0.001501
has_any_affiliate                       0.002245
popular_brand                          -0.002897
self_ref                                0.002901
hashtag_indicator                      -0.004031
Length: 74, dtype: float64

Top 10 most important features:
comparing_products               -0.001185
product                          -0.001287
skills/teach hashtag_indicator   -0.001350
popular_brand skills/teach        0.001364
product hashtag_indicator         0.001378
korean speed                      0.001501
has_any_affiliate                 0.002245
popular_brand                    -0.002897
s

In [79]:
df_confirm.columns

Index(['popular_brand', 'has_any_affiliate', 'product', 'budget', 'self_ref',
       'korean', 'speed', 'skills/teach', 'comparing_products', 'prime_hour',
       'hasAdinTitle', 'hasAdinText', 'hashtag_indicator', 'y', 'y2'],
      dtype='object')

In [80]:
hypotheses

[{'feature': 'comparing_products',
  'coefficient': -0.0011854043620507509,
  'expected_direction': 'negative',
  'hypothesis': "Videos with 'comparing_products' decrease engagement"},
 {'feature': 'product',
  'coefficient': -0.0012871632821651372,
  'expected_direction': 'negative',
  'hypothesis': "Videos with 'product' decrease engagement"},
 {'feature': 'skills/teach hashtag_indicator',
  'coefficient': -0.0013499872960675242,
  'expected_direction': 'negative',
  'hypothesis': "Videos that combine 'skills/teach' and 'hashtag_indicator' decrease engagement"},
 {'feature': 'popular_brand skills/teach',
  'coefficient': 0.0013637794553379501,
  'expected_direction': 'positive',
  'hypothesis': "Videos that combine 'popular_brand' and 'skills/teach' increase engagement"},
 {'feature': 'product hashtag_indicator',
  'coefficient': 0.0013776152077387904,
  'expected_direction': 'positive',
  'hypothesis': "Videos that combine 'product' and 'hashtag_indicator' increase engagement"},
 {'

In [81]:
from scipy import stats

# Test each hypothesis on the confirmation dataset
results = []

for hypothesis in hypotheses:
    feature = hypothesis["feature"]
    expected_direction = hypothesis["expected_direction"]

    if " " not in feature:
        if feature not in df_confirm.columns:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "Feature not found in dataset",
                'confirmed': False
            })
            continue

        # Group data based on feature presence
        with_feature = df_confirm[df_confirm[feature] == 1]["y"]
        without_feature = df_confirm[df_confirm[feature] == 0]["y"]

        # Skip if either group is too small
        if len(with_feature) < 10 or len(without_feature) < 10:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': f"Insufficient data (with={len(with_feature)}, without={len(without_feature)})",
                'confirmed': False
            })
            continue

        #  t-test
        t_stat, p_value = stats.ttest_ind(with_feature, without_feature, equal_var=False)

        # Check if result confirms hypothesis direction
        mean_diff = with_feature.mean() - without_feature.mean()
        direction_confirmed = (
            (expected_direction == "positive" and mean_diff > 0) or
            (expected_direction == "negative" and mean_diff < 0)
        )

        # Record results
        results.append({
            'feature': feature,
            'hypothesis': hypothesis["hypothesis"],
            'mean_with': with_feature.mean(),
            'mean_without': without_feature.mean(),
            'difference': mean_diff,
            'p_value': p_value,
            'significant': p_value < 0.05,
            't_statistic': t_stat,
            'expected_direction': expected_direction,
            'actual_direction': "positive" if mean_diff > 0 else "negative",
            'direction_confirmed': direction_confirmed,
            'hypothesis_confirmed': direction_confirmed and p_value < 0.05
        })

    # For interaction terms (simplified approach)
    else:
        feature_parts = feature.split(" ")
        if len(feature_parts) != 2:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "Complex interaction term - not tested",
                'confirmed': False
            })
            continue

        feature1, feature2 = feature_parts

        # Check if features exist
        if feature1 not in df_confirm.columns or feature2 not in df_confirm.columns:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "One or more features not found in dataset",
                'confirmed': False
            })
            continue

        # Create groups for interaction
        both_present = df_confirm[(df_confirm[feature1] == 1) & (df_confirm[feature2] == 1)]["y"]
        not_both = df_confirm[~((df_confirm[feature1] == 1) & (df_confirm[feature2] == 1))]["y"]

        # Skip if either group is too small
        if len(both_present) < 10 or len(not_both) < 10:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': f"Insufficient data (both={len(both_present)}, not_both={len(not_both)})",
                'confirmed': False
            })
            continue

        # Perform t-test
        t_stat, p_value = stats.ttest_ind(both_present, not_both, equal_var=False)

        # Check direction
        mean_diff = both_present.mean() - not_both.mean()
        direction_confirmed = (
            (expected_direction == "positive" and mean_diff > 0) or
            (expected_direction == "negative" and mean_diff < 0)
        )

        # Record results
        results.append({
            'feature': feature,
            'hypothesis': hypothesis["hypothesis"],
            'mean_both': both_present.mean(),
            'mean_not_both': not_both.mean(),
            'difference': mean_diff,
            'p_value': p_value,
            'significant': p_value < 0.05,
            't_statistic': t_stat,
            'expected_direction': expected_direction,
            'actual_direction': "positive" if mean_diff > 0 else "negative",
            'direction_confirmed': direction_confirmed,
            'hypothesis_confirmed': direction_confirmed and p_value < 0.05
        })


results_df = pd.DataFrame(results)

In [82]:
results_df.shape

(10, 14)

In [83]:

confirmed_hypotheses = results_df[results_df['hypothesis_confirmed'] == True]
confirmation_rate = len(confirmed_hypotheses) / len(results_df) * 100

print(f"\nResults Summary: {len(confirmed_hypotheses)} out of {len(results_df)} hypotheses confirmed ({confirmation_rate:.1f}%)")

# confirmed hypotheses
print("\nConfirmed Hypotheses:")
for i, row in confirmed_hypotheses.iterrows():
    if 'mean_with' in row:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
        print(f"  Mean with {row['feature']}: {row['mean_with']:.4f}")
        print(f"  Mean without {row['feature']}: {row['mean_without']:.4f}")
        print(f"  Difference: {row['difference']:.4f}")
    else:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
        print(f"  Mean with both features: {row['mean_both']:.4f}")
        print(f"  Mean without both features: {row['mean_not_both']:.4f}")
        print(f"  Difference: {row['difference']:.4f}")
# unconfirmed hypotheses
unconfirmed = results_df[results_df['hypothesis_confirmed'] != True]
print("\nUnconfirmed Hypotheses:")

for i, row in unconfirmed.iterrows():
    if 'p_value' in row and row['p_value'] is not None:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
    else:
        print(f"- {row['hypothesis']} (Reason: {row['result']})")




Results Summary: 7 out of 10 hypotheses confirmed (70.0%)

Confirmed Hypotheses:
- Videos with 'comparing_products' decrease engagement (p=0.0001)
  Mean with comparing_products: 0.0524
  Mean without comparing_products: 0.0569
  Difference: -0.0044
- Videos with 'product' decrease engagement (p=0.0076)
  Mean with product: 0.0546
  Mean without product: 0.0569
  Difference: -0.0024
- Videos that combine 'skills/teach' and 'hashtag_indicator' decrease engagement (p=0.0001)
  Mean with skills/teach hashtag_indicator: nan
  Mean without skills/teach hashtag_indicator: nan
  Difference: -0.0045
- Videos that combine 'korean' and 'speed' increase engagement (p=0.0000)
  Mean with korean speed: nan
  Mean without korean speed: nan
  Difference: 0.0114
- Videos with 'popular_brand' decrease engagement (p=0.0000)
  Mean with popular_brand: 0.0480
  Mean without popular_brand: 0.0580
  Difference: -0.0100
- Videos with 'self_ref' increase engagement (p=0.0004)
  Mean with self_ref: 0.0578
  M

In [99]:
#Now we re-run all the above code with regard to y2 = views/subscriber instead

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

scaler = StandardScaler()
X_explore_scaled = scaler.fit_transform(df_explore[features])

pipe_linear = Pipeline([
    ("interaction_terms", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("lasso", Lasso(alpha=0.0001, max_iter=10000))
])
pipe_linear.fit(X_explore_scaled, df_explore["y2"])

linear_pred = pipe_linear.predict(X_explore_scaled)
linear_mse = mean_squared_error(df_explore["y2"], linear_pred)


In [103]:
# Extract coefficients
transformed_feature_names = pipe_linear.named_steps['interaction_terms'].get_feature_names_out(features)
lasso_coeffs = pd.Series(
    pipe_linear.named_steps['lasso'].coef_,
    index=transformed_feature_names
)

# Get non-zero coefficients
sig_lasso_coeffs = lasso_coeffs[lasso_coeffs != 0]
important_features = sig_lasso_coeffs.sort_values(key=abs)

print("\nAll non-zero features sorted by coefficient magnitude:")
print(important_features)

print("\nTop 10 most important features:")
top_features = important_features.tail(10)
print(top_features)

# create hypotheses
hypotheses = []
for feature, coef in top_features.items():
    expected_direction = "positive" if coef > 0 else "negative"
    impact = "increase" if coef > 0 else "decrease"

    if " " in feature:
        # For interaction terms
        parts = feature.split(" ")
        hypothesis_text = f"Videos that combine '{parts[0]}' and '{parts[1]}' {impact} views"
    else:
        # For single features
        hypothesis_text = f"Videos with '{feature}' {impact} views"

    hypotheses.append({
        "feature": feature,
        "coefficient": coef,
        "expected_direction": expected_direction,
        "hypothesis": hypothesis_text
    })

print("\nFormulated hypotheses:")
for i, hyp in enumerate(hypotheses, 1):
    print(f"{i}. {hyp['hypothesis']} (coef={hyp['coefficient']:.6f})")




All non-zero features sorted by coefficient magnitude:
self_ref hasAdinTitle            -0.001582
has_any_affiliate skills/teach    0.006736
korean speed                      0.007015
skills/teach hasAdinTitle         0.009862
self_ref skills/teach             0.012974
                                    ...   
popular_brand product             0.990829
product speed                     1.126216
speed prime_hour                  1.426781
popular_brand prime_hour          1.534968
popular_brand speed               1.718733
Length: 91, dtype: float64

Top 10 most important features:
speed hasAdinText          -0.637584
korean                      0.718873
product prime_hour          0.813412
popular_brand               0.844806
prime_hour hasAdinText     -0.891425
popular_brand product       0.990829
product speed               1.126216
speed prime_hour            1.426781
popular_brand prime_hour    1.534968
popular_brand speed         1.718733
dtype: float64

Formulated hypotheses:
1.

In [105]:
hypotheses

[{'feature': 'speed hasAdinText',
  'coefficient': -0.6375841563629866,
  'expected_direction': 'negative',
  'hypothesis': "Videos that combine 'speed' and 'hasAdinText' decrease views"},
 {'feature': 'korean',
  'coefficient': 0.7188726902164926,
  'expected_direction': 'positive',
  'hypothesis': "Videos with 'korean' increase views"},
 {'feature': 'product prime_hour',
  'coefficient': 0.8134122047713994,
  'expected_direction': 'positive',
  'hypothesis': "Videos that combine 'product' and 'prime_hour' increase views"},
 {'feature': 'popular_brand',
  'coefficient': 0.8448064790479867,
  'expected_direction': 'positive',
  'hypothesis': "Videos with 'popular_brand' increase views"},
 {'feature': 'prime_hour hasAdinText',
  'coefficient': -0.8914245652653439,
  'expected_direction': 'negative',
  'hypothesis': "Videos that combine 'prime_hour' and 'hasAdinText' decrease views"},
 {'feature': 'popular_brand product',
  'coefficient': 0.9908293133774675,
  'expected_direction': 'posi

In [107]:
from scipy import stats

# Test each hypothesis on the confirmation dataset
results = []

for hypothesis in hypotheses:
    feature = hypothesis["feature"]
    expected_direction = hypothesis["expected_direction"]

    if " " not in feature:
        if feature not in df_confirm.columns:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "Feature not found in dataset",
                'confirmed': False
            })
            continue

        # Group data based on feature presence
        with_feature = df_confirm[df_confirm[feature] == 1]["y"]
        without_feature = df_confirm[df_confirm[feature] == 0]["y"]

        # Skip if either group is too small
        if len(with_feature) < 10 or len(without_feature) < 10:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': f"Insufficient data (with={len(with_feature)}, without={len(without_feature)})",
                'confirmed': False
            })
            continue

        #  t-test
        t_stat, p_value = stats.ttest_ind(with_feature, without_feature, equal_var=False)

        # Check if result confirms hypothesis direction
        mean_diff = with_feature.mean() - without_feature.mean()
        direction_confirmed = (
            (expected_direction == "positive" and mean_diff > 0) or
            (expected_direction == "negative" and mean_diff < 0)
        )

        # Record results
        results.append({
            'feature': feature,
            'hypothesis': hypothesis["hypothesis"],
            'mean_with': with_feature.mean(),
            'mean_without': without_feature.mean(),
            'difference': mean_diff,
            'p_value': p_value,
            'significant': p_value < 0.05,
            't_statistic': t_stat,
            'expected_direction': expected_direction,
            'actual_direction': "positive" if mean_diff > 0 else "negative",
            'direction_confirmed': direction_confirmed,
            'hypothesis_confirmed': direction_confirmed and p_value < 0.05
        })

    # For interaction terms (simplified approach)
    else:
        feature_parts = feature.split(" ")
        if len(feature_parts) != 2:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "Complex interaction term - not tested",
                'confirmed': False
            })
            continue

        feature1, feature2 = feature_parts

        # Check if features exist
        if feature1 not in df_confirm.columns or feature2 not in df_confirm.columns:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "One or more features not found in dataset",
                'confirmed': False
            })
            continue

        # Create groups for interaction
        both_present = df_confirm[(df_confirm[feature1] == 1) & (df_confirm[feature2] == 1)]["y"]
        not_both = df_confirm[~((df_confirm[feature1] == 1) & (df_confirm[feature2] == 1))]["y"]

        # Skip if either group is too small
        if len(both_present) < 10 or len(not_both) < 10:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': f"Insufficient data (both={len(both_present)}, not_both={len(not_both)})",
                'confirmed': False
            })
            continue

        # Perform t-test
        t_stat, p_value = stats.ttest_ind(both_present, not_both, equal_var=False)

        # Check direction
        mean_diff = both_present.mean() - not_both.mean()
        direction_confirmed = (
            (expected_direction == "positive" and mean_diff > 0) or
            (expected_direction == "negative" and mean_diff < 0)
        )

        # Record results
        results.append({
            'feature': feature,
            'hypothesis': hypothesis["hypothesis"],
            'mean_both': both_present.mean(),
            'mean_not_both': not_both.mean(),
            'difference': mean_diff,
            'p_value': p_value,
            'significant': p_value < 0.05,
            't_statistic': t_stat,
            'expected_direction': expected_direction,
            'actual_direction': "positive" if mean_diff > 0 else "negative",
            'direction_confirmed': direction_confirmed,
            'hypothesis_confirmed': direction_confirmed and p_value < 0.05
        })


results_df = pd.DataFrame(results)

In [109]:

confirmed_hypotheses = results_df[results_df['hypothesis_confirmed'] == True]
confirmation_rate = len(confirmed_hypotheses) / len(results_df) * 100

print(f"\nResults Summary: {len(confirmed_hypotheses)} out of {len(results_df)} hypotheses confirmed ({confirmation_rate:.1f}%)")

# confirmed hypotheses
print("\nConfirmed Hypotheses:")
for i, row in confirmed_hypotheses.iterrows():
    if 'mean_with' in row:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
        print(f"  Mean with {row['feature']}: {row['mean_with']:.4f}")
        print(f"  Mean without {row['feature']}: {row['mean_without']:.4f}")
        print(f"  Difference: {row['difference']:.4f}")
    else:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
        print(f"  Mean with both features: {row['mean_both']:.4f}")
        print(f"  Mean without both features: {row['mean_not_both']:.4f}")
        print(f"  Difference: {row['difference']:.4f}")
# unconfirmed hypotheses
unconfirmed = results_df[results_df['hypothesis_confirmed'] != True]
print("\nUnconfirmed Hypotheses:")

for i, row in unconfirmed.iterrows():
    if 'p_value' in row and row['p_value'] is not None:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
    else:
        print(f"- {row['hypothesis']} (Reason: {row['result']})")


Results Summary: 2 out of 10 hypotheses confirmed (20.0%)

Confirmed Hypotheses:
- Videos that combine 'speed' and 'hasAdinText' decrease views (p=0.0000)
  Mean with speed hasAdinText: nan
  Mean without speed hasAdinText: nan
  Difference: -0.0118
- Videos with 'korean' increase views (p=0.0099)
  Mean with korean: 0.0597
  Mean without korean: 0.0562
  Difference: 0.0035

Unconfirmed Hypotheses:
- Videos that combine 'product' and 'prime_hour' increase views (p=0.0000)
- Videos with 'popular_brand' increase views (p=0.0000)
- Videos that combine 'prime_hour' and 'hasAdinText' decrease views (p=0.3188)
- Videos that combine 'popular_brand' and 'product' increase views (p=0.0000)
- Videos that combine 'product' and 'speed' increase views (p=0.7537)
- Videos that combine 'speed' and 'prime_hour' increase views (p=0.8852)
- Videos that combine 'popular_brand' and 'prime_hour' increase views (p=0.0079)
- Videos that combine 'popular_brand' and 'speed' increase views (p=0.0006)
