In [16]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu

In [3]:
df = pd.read_csv('../data/new/no_early_dates_30_days_test.csv')
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'channelDescription', 'channelJoinedDate',
       'channelTotalVideos', 'channelTotalViews', 'channelUsername',
       'commentsCount', 'date', 'duration', 'id', 'isChannelVerified', 'likes',
       'numberOfSubscribers', 'order', 'text', 'title', 'url', 'viewCount',
       'likes_per_subscriber', 'comments_per_subscriber',
       'views_per_subscriber', 'duration_in_seconds', 'datetime',
       'datetime_date'],
      dtype='object')

In [5]:
df['views_per_subscriber'] = df['viewCount'] / df['numberOfSubscribers']

# Handle potential issues (e.g., division by zero, missing values)
df = df.replace([np.inf, -np.inf], np.nan)  # Replace infinities
df = df.dropna(subset=['views_per_subscriber'])  # Drop rows with NaN

In [7]:
if 'hasHashtagTitle' not in df.columns:
    df['hasHashtagTitle'] = df['title'].str.contains(r'#\w+', case=False, na=False).astype(int)
if 'hasHashtagDescription' not in df.columns:
    df['hasHashtagDescription'] = df['text'].str.contains(r'#\w+', case=False, na=False).astype(int)

# Create a combined feature: 1 if hashtag in either title or description, 0 otherwise
df['has_hashtag_any'] = ((df['hasHashtagTitle'] == 1) | (df['hasHashtagDescription'] == 1)).astype(int)

# Step 3: Split data into two groups
group_with_hashtag = df[df['has_hashtag_any'] == 1]['views_per_subscriber']
group_without_hashtag = df[df['has_hashtag_any'] == 0]['views_per_subscriber']

In [8]:
t_stat, p_value_two_tailed = ttest_ind(group_with_hashtag, group_without_hashtag, equal_var=False)

if t_stat > 0:
    p_value_one_tailed = p_value_two_tailed / 2
else:
    p_value_one_tailed = 1 - (p_value_two_tailed / 2)

mean_with = group_with_hashtag.mean()
mean_without = group_without_hashtag.mean()
std_with = group_with_hashtag.std(ddof=1)
std_without = group_without_hashtag.std(ddof=1)
n_with = len(group_with_hashtag)
n_without = len(group_without_hashtag)

pooled_std = np.sqrt(((n_with - 1) * std_with**2 + (n_without - 1) * std_without**2) / (n_with + n_without - 2))
cohens_d = (mean_with - mean_without) / pooled_std

In [9]:
print(f"T-test for views_per_subscriber by Hashtag in Title or Description:")
print(f"T-statistic = {t_stat:.4f}, p-value (one-tailed) = {p_value_one_tailed:.4f}")
print(f"Effect Size (Cohen's d) = {cohens_d:.4f}")
print(f"Mean views_per_subscriber (with hashtag) = {mean_with:.4f}")
print(f"Mean views_per_subscriber (without hashtag) = {mean_without:.4f}")
print(f"Sample size (with hashtag) = {n_with}, (without hashtag) = {n_without}")

alpha = 0.05
if p_value_one_tailed < alpha and t_stat > 0:
    print("Reject the null hypothesis: Hashtags in title or description significantly increase the mean views_per_subscriber.")
else:
    print("Fail to reject the null hypothesis: No significant evidence that hashtags in title or description increase the mean views_per_subscriber.")

T-test for views_per_subscriber by Hashtag in Title or Description:
T-statistic = 4.0818, p-value (one-tailed) = 0.0000
Effect Size (Cohen's d) = 0.0816
Mean views_per_subscriber (with hashtag) = 0.9418
Mean views_per_subscriber (without hashtag) = 0.5027
Sample size (with hashtag) = 5108, (without hashtag) = 2829
Reject the null hypothesis: Hashtags in title or description significantly increase the mean views_per_subscriber.


In [None]:
stat, p_value = mannwhitneyu(group_with_hashtag, group_without_hashtag, alternative='greater')
print(f"Mann-Whitney U test: stat = {stat:.4f}, p-value (one-tailed) = {p_value:.4f}")

Mann-Whitney U test: stat = 8216485.5000, p-value (one-tailed) = 0.0000


In [12]:
if 'has_any_affiliate' not in df.columns:
    # Define patterns for affiliate links or discount codes (adjust based on your EDA definition)
    affiliate_patterns = r'(affiliate|discount|code|link|promo|shop|deal|partner)'
    df['has_affiliate_title'] = df['title'].str.contains(affiliate_patterns, case=False, na=False).astype(int)
    df['has_affiliate_description'] = df['title'].str.contains(affiliate_patterns, case=False, na=False).astype(int)
    df['has_affiliate_channel'] = df['channelDescription'].str.contains(affiliate_patterns, case=False, na=False).astype(int)
    df['has_any_affiliate'] = ((df['has_affiliate_title'] == 1) | 
                               (df['has_affiliate_description'] == 1) | 
                               (df['has_affiliate_channel'] == 1)).astype(int)

  df['has_affiliate_title'] = df['title'].str.contains(affiliate_patterns, case=False, na=False).astype(int)
  df['has_affiliate_description'] = df['title'].str.contains(affiliate_patterns, case=False, na=False).astype(int)
  df['has_affiliate_channel'] = df['channelDescription'].str.contains(affiliate_patterns, case=False, na=False).astype(int)


In [14]:
group_with_affiliate = df[df['has_any_affiliate'] == 1]['views_per_subscriber']
group_without_affiliate = df[df['has_any_affiliate'] == 0]['views_per_subscriber']

# Step 4: Perform Welch's T-test (one-tailed, testing if mean with affiliate < mean without affiliate)
t_stat, p_value_two_tailed = ttest_ind(group_with_affiliate, group_without_affiliate, equal_var=False)

# Since we're testing for a decrease (mean with affiliate < mean without), adjust the p-value
# If t_stat < 0, the mean with affiliate is lower, so divide the two-tailed p-value by 2
# If t_stat > 0, the mean with affiliate is higher, so the p-value for "less than" is 1 - (two-tailed p-value / 2)
if t_stat < 0:
    p_value_one_tailed = p_value_two_tailed / 2
else:
    p_value_one_tailed = 1 - (p_value_two_tailed / 2)

# Step 5: Compute Cohen's d for effect size
mean_with = group_with_affiliate.mean()
mean_without = group_without_affiliate.mean()
std_with = group_with_affiliate.std(ddof=1)
std_without = group_without_affiliate.std(ddof=1)
n_with = len(group_with_affiliate)
n_without = len(group_without_affiliate)

# Pooled standard deviation for Cohen's d
pooled_std = np.sqrt(((n_with - 1) * std_with**2 + (n_without - 1) * std_without**2) / (n_with + n_without - 2))
cohens_d = (mean_with - mean_without) / pooled_std

In [15]:
print(f"T-test for views_per_subscriber by Affiliate Link:")
print(f"T-statistic = {t_stat:.4f}, p-value (one-tailed) = {p_value_one_tailed:.4f}")
print(f"Effect Size (Cohen's d) = {cohens_d:.4f}")
print(f"Mean views_per_subscriber (with affiliate) = {mean_with:.4f}")
print(f"Mean views_per_subscriber (without affiliate) = {mean_without:.4f}")
print(f"Sample size (with affiliate) = {n_with}, (without affiliate) = {n_without}")

# Step 7: Interpret the result
alpha = 0.05
if p_value_one_tailed < alpha and t_stat < 0:
    print("Reject the null hypothesis: Affiliate links significantly decrease the mean views_per_subscriber.")
else:
    print("Fail to reject the null hypothesis: No significant evidence that affiliate links decrease the mean views_per_subscriber.")

T-test for views_per_subscriber by Affiliate Link:
T-statistic = 0.8385, p-value (one-tailed) = 0.7991
Effect Size (Cohen's d) = 0.0195
Mean views_per_subscriber (with affiliate) = 0.8654
Mean views_per_subscriber (without affiliate) = 0.7606
Sample size (with affiliate) = 1870, (without affiliate) = 6067
Fail to reject the null hypothesis: No significant evidence that affiliate links decrease the mean views_per_subscriber.


In [17]:
stat, p_value = mannwhitneyu(group_with_affiliate, group_without_affiliate, alternative='less')
print(f"Mann-Whitney U test: stat = {stat:.4f}, p-value (one-tailed) = {p_value:.4f}")

Mann-Whitney U test: stat = 5983580.5000, p-value (one-tailed) = 0.9998


In [18]:
ad_pattern = r'\b(ad|sponsored|advertisement|promo|promotion|paid partnership|collaboration|partnership|endorsement)\b'  # Match whole words like "ad", "sponsored", etc.
df['has_ad_in_description'] = df['text'].str.contains(ad_pattern, case=False, na=False).astype(int)
df['has_ad_in_title'] = df['title'].str.contains(ad_pattern, case=False, na=False).astype(int)

# Create the feature: 1 if "ad" is in description but not in title, 0 if "ad" is not in description and not in title
# Exclude videos where "ad" is in the title
df['has_ad_in_description_only'] = 0
df.loc[(df['has_ad_in_description'] == 1) & (df['has_ad_in_title'] == 0), 'has_ad_in_description_only'] = 1
df.loc[(df['has_ad_in_description'] == 0) & (df['has_ad_in_title'] == 0), 'has_ad_in_description_only'] = 0
# Videos with "ad" in the title are excluded (set to NaN and will be dropped)
df.loc[df['has_ad_in_title'] == 1, 'has_ad_in_description_only'] = np.nan

# Drop rows where "ad" is in the title (to isolate the effect of description)
df = df.dropna(subset=['has_ad_in_description_only'])

# Step 3: Split data into two groups
group_with_ad = df[df['has_ad_in_description_only'] == 1]['views_per_subscriber']
group_without_ad = df[df['has_ad_in_description_only'] == 0]['views_per_subscriber']

# Step 4: Perform Welch's T-test (one-tailed, testing if mean with ad < mean without ad)
t_stat, p_value_two_tailed = ttest_ind(group_with_ad, group_without_ad, equal_var=False)

# Since we're testing for a decrease (mean with ad < mean without), adjust the p-value
# If t_stat < 0, the mean with ad is lower, so divide the two-tailed p-value by 2
# If t_stat > 0, the mean with ad is higher, so the p-value for "less than" is 1 - (two-tailed p-value / 2)
if t_stat < 0:
    p_value_one_tailed = p_value_two_tailed / 2
else:
    p_value_one_tailed = 1 - (p_value_two_tailed / 2)

# Step 5: Compute Cohen's d for effect size
mean_with = group_with_ad.mean()
mean_without = group_without_ad.mean()
std_with = group_with_ad.std(ddof=1)
std_without = group_without_ad.std(ddof=1)
n_with = len(group_with_ad)
n_without = len(group_without_ad)

# Pooled standard deviation for Cohen's d
pooled_std = np.sqrt(((n_with - 1) * std_with**2 + (n_without - 1) * std_without**2) / (n_with + n_without - 2))
cohens_d = (mean_with - mean_without) / pooled_std

  df['has_ad_in_description'] = df['text'].str.contains(ad_pattern, case=False, na=False).astype(int)
  df['has_ad_in_title'] = df['title'].str.contains(ad_pattern, case=False, na=False).astype(int)


In [19]:
print(f"T-test for views_per_subscriber by 'ad' in Description (not in Title):")
print(f"T-statistic = {t_stat:.4f}, p-value (one-tailed) = {p_value_one_tailed:.4f}")
print(f"Effect Size (Cohen's d) = {cohens_d:.4f}")
print(f"Mean views_per_subscriber (with ad in description) = {mean_with:.4f}")
print(f"Mean views_per_subscriber (without ad in description) = {mean_without:.4f}")
print(f"Sample size (with ad in description) = {n_with}, (without ad in description) = {n_without}")

# Step 7: Interpret the result
alpha = 0.05
if p_value_one_tailed < alpha and t_stat < 0:
    print("Reject the null hypothesis: 'ad' in the description (not in title) significantly reduces the mean views_per_subscriber.")
else:
    print("Fail to reject the null hypothesis: No significant evidence that 'ad' in the description (not in title) reduces the mean views_per_subscriber.")

T-test for views_per_subscriber by 'ad' in Description (not in Title):
T-statistic = 1.3666, p-value (one-tailed) = 0.9134
Effect Size (Cohen's d) = 0.1912
Mean views_per_subscriber (with ad in description) = 1.7954
Mean views_per_subscriber (without ad in description) = 0.7623
Sample size (with ad in description) = 201, (without ad in description) = 7673
Fail to reject the null hypothesis: No significant evidence that 'ad' in the description (not in title) reduces the mean views_per_subscriber.


In [20]:
from scipy.stats import mannwhitneyu
stat, p_value = mannwhitneyu(group_with_ad, group_without_ad, alternative='less')
print(f"Mann-Whitney U test: stat = {stat:.4f}, p-value (one-tailed) = {p_value:.4f}")

Mann-Whitney U test: stat = 531291.0000, p-value (one-tailed) = 0.0000
