In [1]:
import pandas as pd

df = pd.read_csv('../data/new/no_early_dates_all_features_train.csv')



In [2]:
df.columns

Index(['Unnamed: 0', 'commentsCount', 'isChannelVerified', 'likes',
       'numberOfSubscribers', 'text', 'title', 'viewCount',
       'views_per_subscriber', 'duration_in_seconds', 'date',
       'hashtag_indicator', 'has_any_affiliate', 'hasAdinTitle', 'hasAdinText',
       'Engagement_per_Subscriber', 'Engagement_per_View', 'popular_brand',
       'prime_hour', 'product', 'skills/teach', 'speed', 'comparing_products',
       'self_ref', 'budget', 'korean'],
      dtype='object')

In [3]:
import pandas as pd
import numpy as np

features = ["popular_brand", "has_any_affiliate", "product", "budget", "self_ref", "korean", "speed", "skills/teach", "comparing_products", "prime_hour", "hasAdinTitle", "hasAdinText",'hashtag_indicator']

#Create the target column $y$ here
df["y"] = (df["likes"] + df["commentsCount"])  / (df["viewCount"] + 1)

#get rid of noisy columns
df = df[ features + ["y"] ]

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Split data into exploration set and confirmation
df_explore, df_confirm = train_test_split(df, test_size=0.5, random_state=42)

print(f"Exploration set: {df_explore.shape[0]} rows")
print(f"Confirmation set: {df_confirm.shape[0]} rows")



Exploration set: 3968 rows
Confirmation set: 3969 rows


In [5]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

scaler = StandardScaler()
X_explore_scaled = scaler.fit_transform(df_explore[features])

pipe_linear = Pipeline([
    ("interaction_terms", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("lasso", Lasso(alpha=0.0001, max_iter=10000))
])
pipe_linear.fit(X_explore_scaled, df_explore["y"])

linear_pred = pipe_linear.predict(X_explore_scaled)
linear_mse = mean_squared_error(df_explore["y"], linear_pred)


#RF
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_explore_scaled, df_explore["y"])
rf_pred = rf.predict(X_explore_scaled)
rf_mse = mean_squared_error(df_explore["y"], rf_pred)

# Calculate performance ratio on real data
real_ratio = linear_mse / rf_mse

print("Performance on real data:")
print(f"Linear model MSE: {linear_mse:.6f}")
print(f"Random Forest MSE: {rf_mse:.6f}")
print(f"Improvement ratio: {real_ratio:.2f}x (how much better RF is than linear)")


# Generate simulated data under hyphothesis 0
residuals = df_explore["y"] - linear_pred
std_residuals = np.std(residuals)
np.random.seed(42)
y_simulated = linear_pred + np.random.normal(0, std_residuals, len(linear_pred))

pipe_linear_sim = Pipeline([
    ("interaction_terms", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("lasso", Lasso(alpha=0.0001, max_iter=10000))
])
pipe_linear_sim.fit(X_explore_scaled, y_simulated)
linear_sim_pred = pipe_linear_sim.predict(X_explore_scaled)
linear_sim_mse = mean_squared_error(y_simulated, linear_sim_pred)

rf_sim = RandomForestRegressor(n_estimators=100, random_state=42)
rf_sim.fit(X_explore_scaled, y_simulated)
rf_sim_pred = rf_sim.predict(X_explore_scaled)
rf_sim_mse = mean_squared_error(y_simulated, rf_sim_pred)

sim_ratio = linear_sim_mse / rf_sim_mse

print("\nPerformance on simulated linear data:")
print(f"Linear model MSE: {linear_sim_mse:.6f}")
print(f"Random Forest MSE: {rf_sim_mse:.6f}")
print(f"Improvement ratio: {sim_ratio:.2f}x (how much better RF is than linear)")



Performance on real data:
Linear model MSE: 0.000999
Random Forest MSE: 0.000784
Improvement ratio: 1.27x (how much better RF is than linear)

Performance on simulated linear data:
Linear model MSE: 0.000970
Random Forest MSE: 0.000801
Improvement ratio: 1.21x (how much better RF is than linear)


In [6]:
# Extract coefficients
transformed_feature_names = pipe_linear.named_steps['interaction_terms'].get_feature_names_out(features)
lasso_coeffs = pd.Series(
    pipe_linear.named_steps['lasso'].coef_,
    index=transformed_feature_names
)

# Get non-zero coefficients
sig_lasso_coeffs = lasso_coeffs[lasso_coeffs != 0]
important_features = sig_lasso_coeffs.sort_values(key=abs)

print("\nAll non-zero features sorted by coefficient magnitude:")
print(important_features)

print("\nTop 10 most important features:")
top_features = important_features.tail(10)
print(top_features)

# create hypotheses
hypotheses = []
for feature, coef in top_features.items():
    expected_direction = "positive" if coef > 0 else "negative"
    impact = "increase" if coef > 0 else "decrease"

    if " " in feature:
        # For interaction terms
        parts = feature.split(" ")
        hypothesis_text = f"Videos that combine '{parts[0]}' and '{parts[1]}' {impact} engagement"
    else:
        # For single features
        hypothesis_text = f"Videos with '{feature}' {impact} engagement"

    hypotheses.append({
        "feature": feature,
        "coefficient": coef,
        "expected_direction": expected_direction,
        "hypothesis": hypothesis_text
    })

print("\nFormulated hypotheses:")
for i, hyp in enumerate(hypotheses, 1):
    print(f"{i}. {hyp['hypothesis']} (coef={hyp['coefficient']:.6f})")




All non-zero features sorted by coefficient magnitude:
skills/teach hasAdinText       -0.000013
budget comparing_products      -0.000022
prime_hour hasAdinTitle         0.000033
prime_hour hashtag_indicator    0.000035
popular_brand budget            0.000044
                                  ...   
prime_hour                      0.002429
product hashtag_indicator       0.002747
hashtag_indicator              -0.003010
popular_brand skills/teach      0.003132
popular_brand                  -0.003172
Length: 75, dtype: float64

Top 10 most important features:
skills/teach                       0.001843
self_ref                           0.001880
popular_brand hashtag_indicator   -0.001995
skills/teach hashtag_indicator    -0.002008
self_ref speed                    -0.002370
prime_hour                         0.002429
product hashtag_indicator          0.002747
hashtag_indicator                 -0.003010
popular_brand skills/teach         0.003132
popular_brand                     -0.

In [7]:
df_confirm.columns

Index(['popular_brand', 'has_any_affiliate', 'product', 'budget', 'self_ref',
       'korean', 'speed', 'skills/teach', 'comparing_products', 'prime_hour',
       'hasAdinTitle', 'hasAdinText', 'hashtag_indicator', 'y'],
      dtype='object')

In [8]:
hypotheses

[{'feature': 'skills/teach',
  'coefficient': 0.0018434095237098594,
  'expected_direction': 'positive',
  'hypothesis': "Videos with 'skills/teach' increase engagement"},
 {'feature': 'self_ref',
  'coefficient': 0.0018799266533280502,
  'expected_direction': 'positive',
  'hypothesis': "Videos with 'self_ref' increase engagement"},
 {'feature': 'popular_brand hashtag_indicator',
  'coefficient': -0.001995424822419821,
  'expected_direction': 'negative',
  'hypothesis': "Videos that combine 'popular_brand' and 'hashtag_indicator' decrease engagement"},
 {'feature': 'skills/teach hashtag_indicator',
  'coefficient': -0.00200766036931156,
  'expected_direction': 'negative',
  'hypothesis': "Videos that combine 'skills/teach' and 'hashtag_indicator' decrease engagement"},
 {'feature': 'self_ref speed',
  'coefficient': -0.0023700812120385465,
  'expected_direction': 'negative',
  'hypothesis': "Videos that combine 'self_ref' and 'speed' decrease engagement"},
 {'feature': 'prime_hour',
 

In [9]:

from scipy import stats

# Test each hypothesis on the confirmation dataset
results = []

for hypothesis in hypotheses:
    feature = hypothesis["feature"]
    expected_direction = hypothesis["expected_direction"]

    if " " not in feature:
        if feature not in df_confirm.columns:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "Feature not found in dataset",
                'confirmed': False
            })
            continue

        # Group data based on feature presence
        with_feature = df_confirm[df_confirm[feature] == 1]["y"]
        without_feature = df_confirm[df_confirm[feature] == 0]["y"]

        # Skip if either group is too small
        if len(with_feature) < 10 or len(without_feature) < 10:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': f"Insufficient data (with={len(with_feature)}, without={len(without_feature)})",
                'confirmed': False
            })
            continue

        #  t-test
        t_stat, p_value = stats.ttest_ind(with_feature, without_feature, equal_var=False)

        # Check if result confirms hypothesis direction
        mean_diff = with_feature.mean() - without_feature.mean()
        direction_confirmed = (
            (expected_direction == "positive" and mean_diff > 0) or
            (expected_direction == "negative" and mean_diff < 0)
        )

        # Record results
        results.append({
            'feature': feature,
            'hypothesis': hypothesis["hypothesis"],
            'mean_with': with_feature.mean(),
            'mean_without': without_feature.mean(),
            'difference': mean_diff,
            'p_value': p_value,
            'significant': p_value < 0.05,
            't_statistic': t_stat,
            'expected_direction': expected_direction,
            'actual_direction': "positive" if mean_diff > 0 else "negative",
            'direction_confirmed': direction_confirmed,
            'hypothesis_confirmed': direction_confirmed and p_value < 0.05
        })

    # For interaction terms (simplified approach)
    else:
        feature_parts = feature.split(" ")
        if len(feature_parts) != 2:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "Complex interaction term - not tested",
                'confirmed': False
            })
            continue

        feature1, feature2 = feature_parts

        # Check if features exist
        if feature1 not in df_confirm.columns or feature2 not in df_confirm.columns:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': "One or more features not found in dataset",
                'confirmed': False
            })
            continue

        # Create groups for interaction
        both_present = df_confirm[(df_confirm[feature1] == 1) & (df_confirm[feature2] == 1)]["y"]
        not_both = df_confirm[~((df_confirm[feature1] == 1) & (df_confirm[feature2] == 1))]["y"]

        # Skip if either group is too small
        if len(both_present) < 10 or len(not_both) < 10:
            results.append({
                'feature': feature,
                'hypothesis': hypothesis["hypothesis"],
                'result': f"Insufficient data (both={len(both_present)}, not_both={len(not_both)})",
                'confirmed': False
            })
            continue

        # Perform t-test
        t_stat, p_value = stats.ttest_ind(both_present, not_both, equal_var=False)

        # Check direction
        mean_diff = both_present.mean() - not_both.mean()
        direction_confirmed = (
            (expected_direction == "positive" and mean_diff > 0) or
            (expected_direction == "negative" and mean_diff < 0)
        )

        # Record results
        results.append({
            'feature': feature,
            'hypothesis': hypothesis["hypothesis"],
            'mean_both': both_present.mean(),
            'mean_not_both': not_both.mean(),
            'difference': mean_diff,
            'p_value': p_value,
            'significant': p_value < 0.05,
            't_statistic': t_stat,
            'expected_direction': expected_direction,
            'actual_direction': "positive" if mean_diff > 0 else "negative",
            'direction_confirmed': direction_confirmed,
            'hypothesis_confirmed': direction_confirmed and p_value < 0.05
        })


results_df = pd.DataFrame(results)

In [10]:
results_df.shape

(10, 14)

In [11]:

confirmed_hypotheses = results_df[results_df['hypothesis_confirmed'] == True]
confirmation_rate = len(confirmed_hypotheses) / len(results_df) * 100

print(f"\nResults Summary: {len(confirmed_hypotheses)} out of {len(results_df)} hypotheses confirmed ({confirmation_rate:.1f}%)")

# confirmed hypotheses
print("\nConfirmed Hypotheses:")
for i, row in confirmed_hypotheses.iterrows():
    if 'mean_with' in row:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
        print(f"  Mean with {row['feature']}: {row['mean_with']:.4f}")
        print(f"  Mean without {row['feature']}: {row['mean_without']:.4f}")
        print(f"  Difference: {row['difference']:.4f}")
    else:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
        print(f"  Mean with both features: {row['mean_both']:.4f}")
        print(f"  Mean without both features: {row['mean_not_both']:.4f}")
        print(f"  Difference: {row['difference']:.4f}")
# unconfirmed hypotheses
unconfirmed = results_df[results_df['hypothesis_confirmed'] != True]
print("\nUnconfirmed Hypotheses:")

for i, row in unconfirmed.iterrows():
    if 'p_value' in row and row['p_value'] is not None:
        print(f"- {row['hypothesis']} (p={row['p_value']:.4f})")
    else:
        print(f"- {row['hypothesis']} (Reason: {row['result']})")




Results Summary: 4 out of 10 hypotheses confirmed (40.0%)

Confirmed Hypotheses:
- Videos with 'self_ref' increase engagement (p=0.0005)
  Mean with self_ref: 0.0581
  Mean without self_ref: 0.0544
  Difference: 0.0037
- Videos with 'prime_hour' increase engagement (p=0.0007)
  Mean with prime_hour: 0.0581
  Mean without prime_hour: 0.0545
  Difference: 0.0036
- Videos with 'hashtag_indicator' decrease engagement (p=0.0001)
  Mean with hashtag_indicator: 0.0527
  Mean without hashtag_indicator: 0.0584
  Difference: -0.0057
- Videos with 'popular_brand' decrease engagement (p=0.0000)
  Mean with popular_brand: 0.0489
  Mean without popular_brand: 0.0575
  Difference: -0.0087

Unconfirmed Hypotheses:
- Videos with 'skills/teach' increase engagement (p=0.0637)
- Videos that combine 'popular_brand' and 'hashtag_indicator' decrease engagement (p=0.2639)
- Videos that combine 'skills/teach' and 'hashtag_indicator' decrease engagement (p=0.5387)
- Videos that combine 'self_ref' and 'speed' d

In [27]:
df["hashtag_indicator"].sample(10)

#As you can see hashtag_indicator is not a categorical variable. 
#We will disregard results about it when it comes to the above analysis as we misunderstood
#The rest of the results about which categories contribute to the linear model we will still use

3918    1
5365    0
4720    1
4549    6
6274    5
3980    5
903     0
2309    0
6448    0
2818    1
Name: hashtag_indicator, dtype: int64