In [98]:
#all imports for eda
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

In [101]:
# Load the prepared dataset; the trailing expression displays its column index.
df = pd.read_csv(filepath_or_buffer='data_no_spaces_all_features_correct_dates.csv')
df.columns

In [102]:
# Columns needed for the ad-vs-engagement analysis. Taking an explicit copy means the
# flag columns added below are not written into a slice of `df` (which raised
# SettingWithCopyWarning) and `df` itself is left untouched.
# Rows with missing values are kept (no dropna is applied here).
subdf = df[['title', 'text', 'id', 'likes', 'viewCount', 'commentsCount', 'duration_in_seconds']].copy()

# Ad-related keywords. 'ad' is matched only as a standalone word ("ad", "ads", "#ad"):
# as a bare substring it also flagged unrelated words such as "made", "bad" or "read".
# The longer keywords are still matched as substrings (e.g. 'promo' covers "promotion").
AD_PATTERN = r'\bads?\b|sponsored|collaboration|promo|partner|affiliate|paid|gift'

# 1 when the column mentions an ad keyword, else 0. Missing values count as 0 (na=False);
# case=False already makes the match case-insensitive, so no .str.lower() is needed.
subdf['hasAdinTitle'] = subdf['title'].str.contains(AD_PATTERN, case=False, na=False).astype(int)
subdf['hasAdinText'] = subdf['text'].str.contains(AD_PATTERN, case=False, na=False).astype(int)

# Mean / median / count of each engagement metric for every combination of the two flags.
subdf_stats = subdf.groupby(['hasAdinTitle', 'hasAdinText'])[['viewCount', 'likes', 'commentsCount']].agg(['mean', 'median', 'count'])
subdf_stats



In [103]:
# 2x3 grid of box plots: one row per ad flag, one column per engagement metric.
_, axes = plt.subplots(2, 3, figsize=(15, 10))

flag_rows = [('hasAdinTitle', 'Ad in Title'), ('hasAdinText', 'Ad in Text')]
metric_cols = [('viewCount', 'View Count'), ('likes', 'Likes'), ('commentsCount', 'Comments Count')]

for row_idx, (flag, flag_label) in enumerate(flag_rows):
    for col_idx, (metric_name, metric_label) in enumerate(metric_cols):
        panel = axes[row_idx, col_idx]
        sns.boxplot(x=flag, y=metric_name, data=subdf, ax=panel)
        panel.set_title(f'{metric_label} by {flag_label}')

plt.tight_layout()
plt.show()


In [104]:
# Pearson correlation between the engagement metrics and the two ad flags,
# drawn as an annotated heatmap.
corr_columns = ['viewCount', 'likes', 'commentsCount', 'hasAdinTitle', 'hasAdinText']
df_corr = subdf[corr_columns]
corr_matrix = df_corr.corr(method='pearson')

plt.figure(figsize=(10, 6))
sns.heatmap(data=corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()

In [105]:
# Welch's t-test (equal_var=False) of each engagement metric between videos with and
# without an ad keyword, run separately for the title flag and the text flag.
# nan_policy='omit' ignores missing metric values; with the default ('propagate') a
# single NaN in either group makes both the statistic and the p-value NaN.
# `stats` (scipy.stats) is imported in the imports cell at the top of the notebook.
metrics = ['viewCount', 'likes', 'commentsCount']
ad_flags = [('hasAdinTitle', 'Ad in Title'), ('hasAdinText', 'Ad in Text')]
for metric in metrics:
    for flag, flag_label in ad_flags:
        with_ad = subdf.loc[subdf[flag] == 1, metric]
        without_ad = subdf.loc[subdf[flag] == 0, metric]
        t_stat, p_value = stats.ttest_ind(with_ad, without_ad, equal_var=False, nan_policy='omit')
        # Every line names the flag it refers to, so title and text results cannot be confused.
        print(f'T-test for {metric} by {flag_label}: t-statistic = {t_stat}, p-value = {p_value}')




### Engagement on videos with ads vs without ads
- When the title of a video contains an ad-related keyword or hashtag, views and likes do not differ significantly from videos whose titles contain none.
- The key difference is in the number of comments: videos whose titles contain ad-related keywords or hashtags receive significantly fewer comments.
- When the text of a video contains an ad-related keyword or hashtag, all engagement metrics drop significantly.

This suggests that marking a video as an ad in its title doesn't affect its viewership or likability, but people participate less actively, i.e., they tend not to leave comments on such videos.

It also suggests that the text of a video containing ad related hashtags negatively affects engagement, suggesting people are averse to promotional content.

From this we can also say that the title is of less importance when it comes to engagement, whether the video is an ad or not.

In [106]:
# Count every distinct hashtag (the word characters following '#') in the title and
# text columns, and save each set of counts to its own CSV file.
for source_column, output_csv in [('title', 'unique_hashtags_in_title.csv'),
                                  ('text', 'unique_hashtags_in_text.csv')]:
    hashtags = df[source_column].str.extractall(r'#(\w+)')[0].value_counts()
    hashtags.to_csv(output_csv)

In [108]:
# Keep only videos dated on or after 15 October 2024.
on_or_after_cutoff = df['datetime_date'] >= '2024-10-15'
filtered_df = df[on_or_after_cutoff]

# subdf2: duration and engagement metrics of the filtered videos longer than 60 seconds.
longer_than_minute = filtered_df['duration_in_seconds'] > 60
subdf2 = filtered_df.loc[longer_than_minute, ['duration_in_seconds', 'viewCount', 'likes', 'commentsCount']]
subdf2.describe()

In [109]:
# Correlate duration with the engagement metrics using both Pearson and Spearman.
df_corr2 = subdf2[['duration_in_seconds', 'viewCount', 'likes', 'commentsCount']]

pearson_corr = df_corr2.corr(method='pearson')
spearman_corr = df_corr2.corr(method='spearman')

# One annotated heatmap per correlation method.
for heatmap_title, corr_values in [('Pearson Correlation Matrix', pearson_corr),
                                   ('Spearman Correlation Matrix', spearman_corr)]:
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_values, annot=True, cmap='coolwarm', fmt='.2f', square=True)
    plt.title(heatmap_title)
    plt.show()


### Correlation between video length and views likes & comments
There is no direct relationship between duration and any of the given metrics.

In [111]:
# For each engagement metric: scatter against duration with a red regression line
# overlaid, shown on a log-scaled y-axis.
for metric in ('viewCount', 'likes', 'commentsCount'):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=subdf2, x='duration_in_seconds', y=metric, alpha=0.3, ax=ax)
    sns.regplot(data=subdf2, x='duration_in_seconds', y=metric, scatter=False, color='red', ax=ax)
    ax.set_yscale('log')
    ax.set_title(f'{metric} vs. Duration in Seconds (Log Scale)')
    ax.set_xlabel('Duration (seconds)')
    ax.set_ylabel(metric)
    plt.show()

### Engagement trend for longer duration videos
As the length of the video increases, the engagement on the video drops - not too significantly, but it does drop.

In [112]:
# Videos dated on or after 15 October 2024, each labelled with a duration bucket:
# Short (0, 30], Medium (30, 60], Long (60, inf) seconds.
# .assign() builds a new frame, so the bucket column is not written into a slice of
# `df` (the direct column assignment raised SettingWithCopyWarning).
recent_videos = df[df['datetime_date'] >= '2024-10-15']
filtered_df = recent_videos.assign(
    duration_bin=pd.cut(
        recent_videos['duration_in_seconds'],
        bins=[0, 30, 60, float('inf')],
        labels=['Short', 'Medium', 'Long'],
    )
)

# Pairwise Welch t-tests (equal_var=False) of every engagement metric between buckets.
# `metrics` is the list defined in the earlier ad t-test cell.
# nan_policy='omit' ignores missing metric values; with the default ('propagate') a
# single NaN in either group makes both the statistic and the p-value NaN.
for metric in metrics:
    for group1, group2 in [('Short', 'Medium'), ('Medium', 'Long'), ('Short', 'Long')]:
        t_stat, p_value = stats.ttest_ind(
            filtered_df.loc[filtered_df['duration_bin'] == group1, metric],
            filtered_df.loc[filtered_df['duration_bin'] == group2, metric],
            equal_var=False,
            nan_policy='omit',
        )
        print(f'T-test for {metric} ({group1} vs {group2}): t-statistic = {t_stat:.4f}, p-value = {p_value:.4f}')





### Comparison between different video lengths
- Short vs Medium: Short videos have fewer views, likes and comments than Medium-length videos
- Medium vs Long: Medium-length videos have more views, likes and comments than Long videos
- Short vs Long: Short videos have fewer likes and comments, but the difference is not significant


In [113]:
# One bar chart per engagement metric, grouped by duration bin.
for metric in ('viewCount', 'likes', 'commentsCount'):
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(data=filtered_df, x='duration_bin', y=metric, ax=ax)
    ax.set_title(f'{metric} by Duration Bin')
    ax.set_xlabel('Duration Bin')
    ax.set_ylabel(metric)
    plt.show()

### Engagement trend by duration
From the above bar plots it's evident that medium-duration (30-60s) videos perform the best in terms of views, likes and comments.

In [116]:
# `sm` (statsmodels.api) is imported in the imports cell at the top of the notebook.

# Attach parsed dates to subdf; values that cannot be parsed become NaT (errors='coerce').
# .assign() returns a new frame, so nothing is written into a possible slice of `df`
# and re-running the cell is idempotent.
# hasAdinTitle / hasAdinText are created as int columns already, so they are not re-cast.
subdf = subdf.assign(datetime_date=pd.to_datetime(df['datetime_date'], errors='coerce'))

# Videos dated on or after 15 October 2024 and shorter than 60 seconds, restricted to
# rows below fixed view / like / comment ceilings (presumably to trim extreme
# outliers - confirm the thresholds). Rows with a missing value in any of these
# columns fail the comparisons and are dropped as well.
filtered_subdf = subdf[(subdf['datetime_date'] >= '2024-10-15') &
                       (subdf['duration_in_seconds'] < 60) &
                       (subdf['viewCount'] < 0.2e8) &
                       (subdf['likes'] < 1e6) &
                       (subdf['commentsCount'] < 10000)]

# Design matrix: intercept, duration, ad-in-text flag and their interaction term.
# It does not depend on the metric, so it is built once outside the loop; .assign()
# adds the interaction column to a new frame instead of writing into a slice.
X = sm.add_constant(
    filtered_subdf[['duration_in_seconds', 'hasAdinText']].assign(
        duration_ad=filtered_subdf['duration_in_seconds'] * filtered_subdf['hasAdinText']
    )
)

for metric in ['viewCount', 'likes', 'commentsCount']:
    # Scatter coloured by the ad flag, with one overall regression line in red.
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='duration_in_seconds', y=metric, hue='hasAdinText', data=filtered_subdf, alpha=0.5)
    sns.regplot(x='duration_in_seconds', y=metric, data=filtered_subdf, scatter=False, color='red')
    plt.title(f'{metric} vs. Duration by Ad in Text')
    plt.xlabel('Duration (seconds)')
    plt.ylabel(metric)
    plt.show()

    # OLS of the metric on duration, ad flag and duration x ad interaction.
    y = filtered_subdf[metric]
    model = sm.OLS(y, X).fit()
    print(f"\nInteraction Regression for {metric}:")
    print(model.summary())















In [118]:
# For each metric, compare the duration density of videos above the metric's median
# ("High") with that of videos at or below it ("Low").
for metric in ('viewCount', 'likes', 'commentsCount'):
    median_val = filtered_df[metric].median()
    high_rows = filtered_df[filtered_df[metric] > median_val]
    low_rows = filtered_df[filtered_df[metric] <= median_val]

    plt.figure(figsize=(10, 6))
    sns.kdeplot(data=high_rows, x='duration_in_seconds', label='High', fill=True)
    sns.kdeplot(data=low_rows, x='duration_in_seconds', label='Low', fill=True)
    plt.title(f'Duration Density: High vs. Low {metric}')
    plt.xlabel('Duration (seconds)')
    plt.legend()
    plt.show()