In this notebook, we re-run our EDA notebooks on the testing set to create a full dataframe with all the features we created. This is so we can run t-tests on the testing set. PLEASE NOTE we have taken out all of the analysis from this notebook. We are NOT looking at ANY of the analysis contained in this notebook. We are merely re-using some of the code to create the necessary features. 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re
from scipy import stats
from scipy.stats import zscore, ttest_ind, f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [3]:
# Load the test-set CSV. No index_col is passed, so the 'Unnamed: 0' / 'Unnamed: 0.1'
# columns seen in the output are kept as ordinary columns (presumably row indices
# written by earlier to_csv calls -- TODO confirm).
df = pd.read_csv('../data/new/no_early_dates_30_days_test.csv')
# Display the column names as a quick schema check.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'channelDescription', 'channelJoinedDate',
       'channelTotalVideos', 'channelTotalViews', 'channelUsername',
       'commentsCount', 'date', 'duration', 'id', 'isChannelVerified', 'likes',
       'numberOfSubscribers', 'order', 'text', 'title', 'url', 'viewCount',
       'likes_per_subscriber', 'comments_per_subscriber',
       'views_per_subscriber', 'duration_in_seconds', 'datetime',
       'datetime_date'],
      dtype='object')

In [4]:
# data cleaning
# Missing free-text fields become empty strings so those rows survive the dropna below.
for _text_col in ("text", "channelDescription"):
    df[_text_col] = df[_text_col].fillna("")
# df = df.drop("channelLocation", axis = 1)
# Discard every row that still has a missing value in any column.
df = df.dropna().copy()

def convert_duration_to_int(item):
    """Convert a clock-style duration string to a whole number of seconds.

    Accepts "H:MM:SS", "MM:SS" or "SS". Unlike parsing with
    ``strptime("%H:%M:%S")``, durations of 24 hours or more and strings
    without an hours field are handled as well; valid "H:M:S" input gives
    the same result as before.

    Args:
        item: Duration text such as "01:02:03".

    Returns:
        Total number of seconds as an int.

    Raises:
        TypeError: If ``item`` is not a string.
        ValueError: If ``item`` has more than three ":"-separated fields or
            a field is not a non-negative integer.
    """
    if not isinstance(item, str):
        raise TypeError(f"duration must be a string, got {type(item).__name__}")

    fields = item.strip().split(":")
    if len(fields) > 3:
        raise ValueError(f"unrecognised duration: {item!r}")

    seconds = 0
    for field in fields:
        if not field.isdecimal():
            raise ValueError(f"unrecognised duration: {item!r}")
        # Each field to the right is worth 1/60 of the one before it.
        seconds = seconds * 60 + int(field)
    return seconds
    
# Recompute duration_in_seconds from the "duration" strings, overwriting the column loaded from the CSV.
df["duration_in_seconds"] = df["duration"].apply(convert_duration_to_int) 

In [5]:
# target variables
# Each engagement count is normalised by the channel's subscriber count.
_subscribers = df["numberOfSubscribers"]
for _target_col, _count_col in (
    ("likes_per_subscriber", "likes"),
    ("comments_per_subscriber", "commentsCount"),
    ("views_per_subscriber", "viewCount"),
):
    df[_target_col] = df[_count_col] / _subscribers

In [6]:
# eda : notebook 1 analyses
df = df.copy()

# eda_p1 : hashtags in title -- despite the name this is a count of '#' characters
_title_hashes = df["title"].str.count("#")
df["hashtag_indicator"] = _title_hashes

# eda_p4 : hashtags in description ('#' count in the video text)
_description_hashes = df["text"].str.count("#")
df["ht_desc_ind"] = _description_hashes

# '#' count over the title and description concatenated
_title_and_text = df["title"] + df["text"]
df["any_ht"] = _title_and_text.str.count("#")

In [8]:
#eda : notebook 2 analyses
#eda_p5 : affiliate links and discount codes
# Regexes that flag affiliate/referral wording or known affiliate-link domains.
# has_affiliate_or_discount searches them with re.IGNORECASE.
affiliate_patterns = [
    r'aff(iliate)?[ -]?link',
    r'ref(erral)?[ -]?link',
    r'partner[ -]?link',
    r'sponsored[ -]?link',
    # NOTE(review): the leading '^' anchors this pattern to the very start of the text and
    # the search is done without re.MULTILINE, so an Amazon link later in a description is
    # not matched. Left unchanged to keep the feature definition stable -- confirm intent.
    r'^http(s)?:\/\/(www\.)?(amzn\.to|amazon\.[a-z\.]+\/[^\s]+tag=)',
    r'go\.magik\.ly',
    r'liketoknow\.it',
    r'prf\.hn',
    r'shareasale',
    r'rewardstyle',
    r'linktr\.ee'
]

# Regexes that flag discount/coupon wording; has_affiliate_or_discount searches them
# with re.IGNORECASE.
discount_patterns = [
    r'disc(ount)?[ -]?code',
    r'coupon[ -]?code',
    r'promo[ -]?code',
    r'save \d+%',
    r'\d+%[ -]?off',
    r'use code[: ][a-z0-9_]+'
]


# Regexes that flag business-contact wording; has_business_inquiry searches them
# with re.IGNORECASE.
business_patterns = [
    r'business inquir(y|ies)',
    r'collaborations?',
    r'sponsorships?',
    r'partnerships?',
    r'for business',
    # NOTE(review): every group after 'contact' is optional, so the bare word "contact"
    # is enough to match -- confirm this breadth is intended.
    r'contact(\s+me)?(\s+for)?(\s+business)?'
]


def has_affiliate_or_discount(text):
    """Report whether ``text`` matches any affiliate or discount pattern.

    The patterns come from the module-level ``affiliate_patterns`` and
    ``discount_patterns`` lists and are searched case-insensitively.

    Args:
        text: The string to scan; a missing value (per ``pd.isna``) is allowed.

    Returns:
        True if at least one pattern is found, otherwise False (including
        for missing values).
    """
    if pd.isna(text):
        return False

    for pattern_group in (affiliate_patterns, discount_patterns):
        for pattern in pattern_group:
            if re.search(pattern, text, re.IGNORECASE):
                return True
    return False

def has_business_inquiry(text):
    """Report whether ``text`` matches any entry of ``business_patterns``.

    Args:
        text: The string to scan; a missing value (per ``pd.isna``) is allowed.

    Returns:
        True if at least one pattern is found (case-insensitive search),
        otherwise False (including for missing values).
    """
    if pd.isna(text):
        return False

    for pattern in business_patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False

# Flag affiliate/discount wording separately in each text field.
for _flag_col, _source_col in (
    ('has_title_affiliate', 'title'),
    ('has_description_affiliate', 'text'),
    ('has_channel_description_affiliate', 'channelDescription'),
):
    df[_flag_col] = df[_source_col].apply(has_affiliate_or_discount)

# A video counts as "any" when at least one of the three fields matched.
df['has_any_affiliate'] = (
    df['has_title_affiliate']
    | df['has_description_affiliate']
    | df['has_channel_description_affiliate']
)

df['has_business_inquiry'] = df['channelDescription'].apply(has_business_inquiry)

# Summary counts over the whole frame.
total_videos = df.shape[0]
(
    title_affiliates,
    description_affiliates,
    channel_description_affiliates,
    any_affiliates,
    business_inquiries,
) = (
    df[_col].sum()
    for _col in (
        'has_title_affiliate',
        'has_description_affiliate',
        'has_channel_description_affiliate',
        'has_any_affiliate',
        'has_business_inquiry',
    )
)
affiliate_percentage = any_affiliates / total_videos * 100

In [11]:
#eda : notebook 3 analyses
#eda_p8 : 
# Collect every word used in a title, lower-cased, skipping common stop words.
list_to_ignore = ["for","the","a","me","my","in","for","but","of","this","that","The","with","is","you","&","your","it","do","be","by","so","What","what","With","all","i","I","if","Why","why"]
list_to_ignore = {word.lower() for word in list_to_ignore}
# Lower-case each word *before* the stop-word lookup: the set holds lower-case entries,
# so checking the original-case word let "The", "What", "I", ... slip through.
# A single comprehension also avoids rebuilding the list for every row.
word_list = [
    word
    for title in df["title"]
    for word in (raw_word.lower() for raw_word in title.split())
    if word not in list_to_ignore
]


def compare_means(df, thing_to_check_for, thing_to_take_average_of, where_to_look="title"):
    """Print the mean of a column for rows with and without a pattern.

    Args:
        df: DataFrame holding both columns.
        thing_to_check_for: Pattern handed to ``Series.str.count``.
        thing_to_take_average_of: Name of the column to average.
        where_to_look: Name of the text column to search (default "title").

    Prints a header line, then the mean over rows with at least one match,
    then the mean over the remaining rows. Returns None.
    """
    print(f"Comparing means of {thing_to_take_average_of} with and without {thing_to_check_for}.")
    has_match = df[where_to_look].str.count(thing_to_check_for).gt(0)
    values = df[thing_to_take_average_of]
    mean_with = values.loc[has_match].mean()
    print(mean_with)
    mean_without = values.loc[~has_match].mean()
    print(mean_without)

def get_mean_with_word(df, thing_to_check_for, thing_to_take_average_of, where_to_look="title"):
    """Return the mean of a column over rows whose text matches a pattern.

    Args:
        df: DataFrame holding both columns.
        thing_to_check_for: Pattern handed to ``Series.str.count``.
        thing_to_take_average_of: Name of the column to average.
        where_to_look: Name of the text column to search (default "title").

    Returns:
        Mean of the column over rows with at least one match.
    """
    match_counts = df[where_to_look].str.count(thing_to_check_for)
    selected = df.loc[match_counts.gt(0), thing_to_take_average_of]
    return selected.mean()

def get_mean_without_word(df, thing_to_check_for, thing_to_take_average_of, where_to_look="title"):
    """Return the mean of a column over rows whose text has no match.

    Args:
        df: DataFrame holding both columns.
        thing_to_check_for: Pattern handed to ``Series.str.count``.
        thing_to_take_average_of: Name of the column to average.
        where_to_look: Name of the text column to search (default "title").

    Returns:
        Mean of the column over rows with a match count of exactly zero.
    """
    match_counts = df[where_to_look].str.count(thing_to_check_for)
    selected = df.loc[match_counts.eq(0), thing_to_take_average_of]
    return selected.mean()

# One row per keyword, comparing the target mean for titles with and without it.
words =["dupe","cheap","drugstore"]# add more words
column_to_take_average_of = "likes_per_subscriber"
where_to_look_for_word = "title"
schema = pd.DataFrame({"word": words})

# Mean of the target over entries whose title contains the substring
schema['with_string'] = schema["word"].apply(
    lambda keyword: get_mean_with_word(
        df, keyword, column_to_take_average_of, where_to_look_for_word
    )
)
# Mean of the target over entries whose title does not contain the substring
schema['without_string'] = schema["word"].apply(
    lambda keyword: get_mean_without_word(
        df, keyword, column_to_take_average_of, where_to_look_for_word
    )
)
# How far the "with" mean sits above the overall likes_per_subscriber mean
# (note: relative to the overall mean, not to the "without" mean)
_overall_mean = df["likes_per_subscriber"].mean()
schema['difference'] = schema['with_string'] - _overall_mean

# Largest difference first
schema = schema.sort_values('difference', ascending=False)

In [12]:
#eda : notebook 4 analyses
#eda_p9 : ad impact
# subdf = df[['title', 'text', 'id', 'likes', 'viewCount', 'commentsCount', 'duration_in_seconds']]

# remove rows with NaN values
# subdf = subdf.dropna()
# print rows with 'ad' as a substring in the 'title' column or in the 'text' column
# DataFrame for rows where 'title' contains 'ad'
# 1 when the lower-cased field contains any of these alternatives, else 0.
# NOTE(review): the bare 'ad' alternative is a plain substring match, so any text
# containing the letters "ad" is flagged -- confirm this breadth is intended.
_ad_pattern = 'ad|sponsored|collaboration|promo|partner|affiliate|paid|gift'
for _flag_col, _source_col in (('hasAdinTitle', 'title'), ('hasAdinText', 'text')):
    _lowered = df[_source_col].str.lower()
    df[_flag_col] = _lowered.str.contains(_ad_pattern, case=False, na=False).astype(int)

# Mean / median / count of the raw engagement columns for each flag combination.
df_stats = (
    df.groupby(['hasAdinTitle', 'hasAdinText'])[['viewCount', 'likes', 'commentsCount']]
    .agg(['mean', 'median', 'count'])
)
# print(df_stats)

#eda_p10 : duration impact on views, likes and comments
# Keep rows whose datetime_date is on or after 2024-10-15.
# NOTE(review): the comparison is against a string literal; this presumably relies on
# datetime_date holding ISO-formatted date strings -- confirm.
# .copy() gives filtered_df its own data, so adding a column below no longer triggers
# pandas' SettingWithCopyWarning.
filtered_df = df.loc[df['datetime_date'] >= '2024-10-15'].copy()
# Bucket durations into (0, 30], (30, 60] and (60, inf) seconds. pd.cut intervals are
# right-closed by default, so a duration of exactly 0 falls outside every bin (NaN).
filtered_df['duration_bin'] = pd.cut(
    filtered_df['duration_in_seconds'],
    bins=[0, 30, 60, float('inf')],
    labels=['Short', 'Medium', 'Long'],
)

# Perform t-tests

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['duration_bin'] = pd.cut(filtered_df['duration_in_seconds'], bins=[0, 30, 60, float('inf')], labels=['Short', 'Medium', 'Long'])


In [13]:
# Calculate the new target variables
# Likes plus comments, per view.
_interactions = df['likes'].add(df['commentsCount'])
df['engagement_per_view'] = _interactions.div(df['viewCount'])
# Views per subscriber.
df['views_per_subscriber'] = df['viewCount'].div(df['numberOfSubscribers'])

In [15]:
# plots for eda_p9
# Pearson correlation between the raw engagement counts and the two ad flags.
_ad_corr_columns = ['viewCount', 'likes', 'commentsCount', 'hasAdinTitle', 'hasAdinText']
df_corr = df[_ad_corr_columns]
corr_matrix = df_corr.corr(method='pearson')

# Duration vs. engagement on the date-filtered frame, with both correlation methods.
_duration_corr_columns = ['duration_in_seconds', 'viewCount', 'likes', 'commentsCount']
df_corr2 = filtered_df[_duration_corr_columns]
pearson_corr = df_corr2.corr(method='pearson')
spearman_corr = df_corr2.corr(method='spearman')


In [16]:
#eda : notebook 5 analyses
#eda_p11 : 

# Standardizing engagement-related variables
engagement_vars = ["likes", "commentsCount", "viewCount", "numberOfSubscribers"]

# Z-score each raw engagement column into a "<name>_std" column.
for var in engagement_vars:
    df[var + "_std"] = zscore(df[var])

# Two ratio metrics built from likes + comments:
#   Engagement per Subscriber -- subscriber loyalty; does not reflect new viewers' interactions
#   Engagement per View       -- how engaging the content is to viewers; can be influenced by algorithms
# The +1 in each denominator presumably guards against division by zero.
_interactions = df["likes"] + df["commentsCount"]
df["Engagement_per_Subscriber"] = _interactions / (df["numberOfSubscribers"] + 1)
df["Engagement_per_View"] = _interactions / (df["viewCount"] + 1)


# Turn +/-inf anywhere in the frame into NaN, then drop rows missing either metric.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=["Engagement_per_Subscriber", "Engagement_per_View"], inplace=True)

# Independent copies of the verified and non-verified rows.
_verified_flag = df["isChannelVerified"]
verified = df[_verified_flag == True].copy()
non_verified = df[_verified_flag == False].copy()

# Compute T-tests and Effect Sizes (Cohen's d)
# Column-oriented accumulator: the loop below appends one entry per metric to each list.
t_test_results = { "Metric": [], "T-test p-value": [], "Effect Size (Cohen's d)": []}

def cohen_d(x, y):
    """Return Cohen's d effect size between two samples.

    The standardiser is the pooled standard deviation with each sample
    variance weighted by its degrees of freedom (n - 1). For equal sample
    sizes this equals the plain average of the two variances; for unequal
    sizes the weighting matters, and it matches the pooled variance used by
    an equal-variance t-test.

    Args:
        x: First sample (any sequence supporting ``len``, ``np.mean`` and ``np.var``).
        y: Second sample.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a float.
    """
    n_x, n_y = len(x), len(y)
    pooled_var = (
        (n_x - 1) * np.var(x, ddof=1) + (n_y - 1) * np.var(y, ddof=1)
    ) / (n_x + n_y - 2)
    return (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var)

# For every metric: two-sample t-test (NaNs omitted) plus effect size, verified vs. non-verified.
_metrics_to_test = (
    "likes_std",
    "commentsCount_std",
    "viewCount_std",
    "numberOfSubscribers_std",
    "Engagement_per_Subscriber",
    "Engagement_per_View",
)
for metric in _metrics_to_test:
    _verified_values = verified[metric]
    _non_verified_values = non_verified[metric]

    _test = ttest_ind(_verified_values, _non_verified_values, nan_policy='omit')
    ttest_p = _test.pvalue
    d_value = cohen_d(_verified_values.dropna(), _non_verified_values.dropna())

    for _key, _value in (
        ("Metric", metric),
        ("T-test p-value", ttest_p),
        ("Effect Size (Cohen's d)", d_value),
    ):
        t_test_results[_key].append(_value)

# One row per metric.
t_test_df = pd.DataFrame(t_test_results)

# Analysis 4 : Verified ?

- > Videos from verified channels have significantly more likes, views, comments, engagement and subscribers than those from non-verified channels
- >  Videos from verified channels have higher engagement but lower views/subscriber.

In [18]:
#Analysis 6

In [19]:
#eda : notebook 6 analyses
#eda_p12 : impact of time of day, month
# Work on a copy so the calendar columns do not land on df itself.
df1 = df.copy()
df1['datetime'] = pd.to_datetime(df['date'])

# Calendar components of the timestamp (day_of_week: 0 is Monday, 6 is Sunday).
for _col, _attr in (
    ('hour', 'hour'),
    ('day_of_week', 'dayofweek'),
    ('month', 'month'),
    ('year', 'year'),
):
    df1[_col] = getattr(df1['datetime'].dt, _attr)

day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df1['day_name'] = df1['day_of_week'].apply(
    lambda day: None if pd.isnull(day) else day_names[int(day)]
)

# Snapshot of the numeric columns, taken before engagement_rate is added.
numeric_columns = ['viewCount', 'likes', 'commentsCount', 'hour', 'day_of_week', 'month', 'year']
numeric_df = df1[numeric_columns].copy()

# Likes plus comments as a percentage of views.
df1['engagement_rate'] = (df1['likes'] + df1['commentsCount']) / df1['viewCount'] * 100

# Mean engagement rate per calendar month, with a short month label.
monthly_metrics = df1.groupby('month')[['engagement_rate']].mean().reset_index()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_metrics['month_name'] = monthly_metrics['month'].apply(
    lambda month: None if pd.isnull(month) else month_names[int(month) - 1]
)

# Mean engagement rate per hour of day, with an "H:00" label.
hourly_metrics = df1.groupby('hour')[['engagement_rate']].mean().reset_index()
hourly_metrics['hour_label'] = hourly_metrics['hour'].apply(lambda hour: "%d:00" % int(hour))

Analysis 7

In the following section we will see if mentioning a popular brand has an impact on post performance. 
We have hand-selected 30 popular beauty brands; 15 of these are skincare brands and 15 of these are makeup brands. This list is subjective and may not be exhaustive but should cover a good number of popular brands. 

Brands:
Makeup:
1) Natasha Denona
2) Tower 28
3) Pat McGrath
4) Urban Decay
5) ColourPop
6) Fenty Beauty
7) E.L.F. cosmetics
8) Nyx professional makeup
9) Essence
10) Benefit Cosmetics
11) Anastasia Beverly Hills
12) Tarte
13) Milk Makeup
14) Maybelline
15) Oden's Eye

Skincare:
1) The Ordinary
2) Beauty of Josean
3) Bubble
4) Paula's Choice
5) Cerave
6) Good Molecules
7) Cosrx
8) Olive Young
9) Dennis Grossman
10) Skinfix
11) Drunk Elephant
12) La Roche-Posay
13) Supergoop
14) Glow Recipe
15) Rhode

In [21]:
#eda : notebook 7 analyses
#eda_p13 : brand impact

# Lower-case tokens whose presence in title+description marks a "popular brand" mention.
# They are tested with plain substring containment further down, so short tokens such as
# "elf", "milk", "abh" or "benefit" also match inside longer words.
# NOTE(review): "josean" looks like a misspelling of the brand name -- confirm the
# intended spelling; as written only this exact spelling is matched.
mentions = ["natasha denona", "natashadenona", "denona", "tower 28", "tower28", "pat mcgrath", "pmg labs", "mcgrath", "patmcgrath"]
mentions += ["urban decay", "urbandecay", "colourpop", "colorpop", "colour pop", "fenty", "e.l.f.", "elf", "nyx", "essence", "benefit"]
mentions += ["anastasia", "abh", "tarte", "milk", "maybelline", "oden's eye", "oden'seye", "odenseye", "the ordinary", "theordinary"]
mentions += ["beauty of josean", "josean", "bubble", "paula's choice", "paula'schoice", "paulaschoice", "cerave", "good molecules"]
mentions += ["cosrx", "olive young", "oliveyoung", "grossman", "skinfix", "drunk elephant", "drunkelephant", "roche-posay", "roche posay", "rocheposay"]
mentions += ["supergoop", "glow recipe", "glowrecipe", "rhode"]
# Replace every non-string description with '' so the concatenation below cannot fail.
# isinstance is the idiomatic type check, and a comprehension replaces the append loop.
text_list = [item if isinstance(item, str) else '' for item in df["text"]]

df["text"] = np.array(text_list)
# Single column holding title followed by description, scanned for brand mentions below.
df["title plus desc"] = df["title"] + df["text"]

# True for each row whose lower-cased title+description contains any brand token.
popb_list = [
    any(ment in lowered for ment in mentions)
    for lowered in (raw_text.lower() for raw_text in df["title plus desc"])
]

df["popular_brand"] = np.array(popb_list)

# Split the frame by whether a popular brand is mentioned.
_brand_flag = df["popular_brand"]
df_yes = df.loc[_brand_flag == True]
df_no = df.loc[_brand_flag == False]
# # Replace whitespaces in column names with underscores
# df.columns = df.columns.str.replace(' ', '_')

Here are my initial thoughts based on the analysis: mentioning popular brands seems to significantly improve views. It also seems to improve likes and comments, but not necessarily significantly. This is quite strange! One possible reason is that these posts tend to mention popular brands in hashtags, which improves views, even though people overall are not more likely to comment on or like your post just because you mentioned a popular brand.

The above t-test seems to indicate that yes, posts which mention a popular brand tend to have higher views on average.
I would recommend that a similar t-test be done on the testing set for our final analysis. 

In [24]:
#eda : notebook 8 analyses
#eda_p14 : prime time

# Work on a copy so the calendar columns do not land on df itself.
df1 = df.copy()
df1['datetime'] = pd.to_datetime(df['date'])

# Calendar components of the timestamp (day_of_week: 0 is Monday, 6 is Sunday).
for _col, _attr in (
    ('hour', 'hour'),
    ('day_of_week', 'dayofweek'),
    ('month', 'month'),
    ('year', 'year'),
):
    df1[_col] = getattr(df1['datetime'].dt, _attr)

day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df1['day_name'] = df1['day_of_week'].apply(
    lambda day: None if pd.isnull(day) else day_names[int(day)]
)

# Snapshot of the numeric columns, taken before engagement_rate is added.
numeric_columns = ['viewCount', 'likes', 'commentsCount', 'hour', 'day_of_week', 'month', 'year']
numeric_df = df1[numeric_columns].copy()

# Likes plus comments as a percentage of views.
df1['engagement_rate'] = (df1['likes'] + df1['commentsCount']) / df1['viewCount'] * 100

# Mean engagement rate per calendar month, with a short month label.
monthly_metrics = df1.groupby('month')[['engagement_rate']].mean().reset_index()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_metrics['month_name'] = monthly_metrics['month'].apply(
    lambda month: None if pd.isnull(month) else month_names[int(month) - 1]
)

# Engagement rates grouped by hour of day; hours with no videos are left out.
hour_groups = [
    rates
    for rates in (
        df1.loc[df1['hour'] == hour, 'engagement_rate'].values for hour in range(24)
    )
    if len(rates) > 0
]

# 2. Pairwise comparison of hours (Tukey HSD); the hour is passed as a string group label
df_for_tukey = df1[['hour', 'engagement_rate']].astype({'hour': str})

tukey_results = pairwise_tukeyhsd(
    df_for_tukey['engagement_rate'],
    df_for_tukey['hour'],
    alpha=0.05,
)

# NOTE(review): prime_hours is not assigned anywhere in the visible notebook, so the next
# line raises NameError (see the traceback below this cell) and df["prime_hour"] is never
# created. The list of prime hours presumably has to be copied from the original
# prime-time EDA notebook -- TODO confirm and define it before this point.
prime_engagement = df1[df1['hour'].isin(prime_hours)]['engagement_rate']
non_prime_engagement = df1[~df1['hour'].isin(prime_hours)]['engagement_rate']



# Attach the posting hour to df itself, then flag rows posted in one of the prime hours.
df['datetime'] = pd.to_datetime(df['date'])
df['hour'] = df['datetime'].dt.hour
df["prime_hour"] = df['hour'].isin(prime_hours)

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


NameError: name 'prime_hours' is not defined

In [None]:
#plot for eda_p14
# Pairwise correlation (pandas default method) between the columns of numeric_df.
correlation_matrix = numeric_df.corr()

# Analysis 8 - Keywords

- > Summary of results: Posts with keywords in the "korean", "speed", and "product" keyword groups seem to perform well with regard to likes/subscriber and views/subscriber, with "korean" being the top performer.


In the following section, we will check to see which keywords are good to use in the title and/or description to boost performance metrics.

In [None]:
# I was wondering whether there were keywords I wasn't thinking of because of my own
# bias, so instead I collect every word used in any title (to be ordered by frequency).
# Obvious articles and conjunctions are filtered out. Some used to slip through because
# the stop-word check compared the original-case word against a lower-cased set; each
# word is now lower-cased before the lookup.

list_to_ignore = ["for","the","a","me","my","in","for","but","of","this","that","The","with","is","you","&","your","it","do","be","by","so","What","what","With","all","i","I","if","Why","why"]
list_to_ignore = {word.lower() for word in list_to_ignore}
# A single comprehension also avoids rebuilding the list for every row.
word_list = [
    word
    for title in df["title"]
    for word in (raw_word.lower() for raw_word in title.split())
    if word not in list_to_ignore
]

In [None]:
#streamlining the "presence of a substring" function Rachael was implementing earlier - Jo

def compare_means(df, thing_to_check_for, thing_to_take_average_of, where_to_look="title"):
    """Print the mean of a column for rows with and without a pattern.

    Args:
        df: DataFrame holding both columns.
        thing_to_check_for: Pattern handed to ``Series.str.count``.
        thing_to_take_average_of: Name of the column to average.
        where_to_look: Name of the text column to search (default "title").

    Prints a header line, then the mean over rows with at least one match,
    then the mean over the remaining rows. Returns None.
    """
    print(f"Comparing means of {thing_to_take_average_of} with and without {thing_to_check_for}.")
    has_match = df[where_to_look].str.count(thing_to_check_for).gt(0)
    values = df[thing_to_take_average_of]
    mean_with = values.loc[has_match].mean()
    print(mean_with)
    mean_without = values.loc[~has_match].mean()
    print(mean_without)

def get_mean_with_word(df, thing_to_check_for, thing_to_take_average_of, where_to_look="title"):
    """Return the mean of a column over rows whose text matches a pattern.

    Args:
        df: DataFrame holding both columns.
        thing_to_check_for: Pattern handed to ``Series.str.count``.
        thing_to_take_average_of: Name of the column to average.
        where_to_look: Name of the text column to search (default "title").

    Returns:
        Mean of the column over rows with at least one match.
    """
    match_counts = df[where_to_look].str.count(thing_to_check_for)
    selected = df.loc[match_counts.gt(0), thing_to_take_average_of]
    return selected.mean()

def get_mean_without_word(df, thing_to_check_for, thing_to_take_average_of, where_to_look="title"):
    """Return the mean of a column over rows whose text has no match.

    Args:
        df: DataFrame holding both columns.
        thing_to_check_for: Pattern handed to ``Series.str.count``.
        thing_to_take_average_of: Name of the column to average.
        where_to_look: Name of the text column to search (default "title").

    Returns:
        Mean of the column over rows with a match count of exactly zero.
    """
    match_counts = df[where_to_look].str.count(thing_to_check_for)
    selected = df.loc[match_counts.eq(0), thing_to_take_average_of]
    return selected.mean()

# Identify popular video topics/formats and see if videos that cover these topics perform better than average. 
# For example, one popular video format is "speed reviews."
#"speed reviews", "haul", "dupe" , "GRWM"  etc


# Create a dataframe
words =["dupe","cheap","drugstore"]# add more words
column_to_take_average_of = "likes_per_subscriber"
where_to_look_for_word = "title"

#added color vs colour to see if there was an american bias
#try vs tryon, since try includes both tryon, as well as "trying" 
#celebrity names to see if people want to copy signature looks?
#asmr, grwm, vlog have secondary content separate from the objective makeup 
#retail stores to see if people are looking to purchase
#ten - trying to see if people like lists? i.e. "my top ten"

# One row per keyword
schema = pd.DataFrame({"word": words})

# Mean of the target over entries whose title contains the substring
schema['with_string'] = schema["word"].apply(
    lambda keyword: get_mean_with_word(
        df, keyword, column_to_take_average_of, where_to_look_for_word
    )
)

# Mean of the target over entries whose title does not contain the substring
schema['without_string'] = schema["word"].apply(
    lambda keyword: get_mean_without_word(
        df, keyword, column_to_take_average_of, where_to_look_for_word
    )
)

# How far the "with" mean sits above the overall likes_per_subscriber mean
# (note: relative to the overall mean, not to the "without" mean)
_overall_mean = df["likes_per_subscriber"].mean()
schema['difference'] = schema['with_string'] - _overall_mean

# Largest difference first
schema = schema.sort_values('difference', ascending=False)

# show dataframe 

In [None]:
#keyword groups
# Keyword groups: each list holds substrings that the contains_* helpers below look for
# (case-insensitively) in a title. Note that "short" appears in both speed_list and
# self_ref_list.
comparison_list = ["unbox","haul","review","try","vs"]
skills_list=["tip","trick","hack","tutorial"]
skincare_list = ["skincare","mask","toner"]#"sunscreen","acne","clean","snail","serum"
product_list=["tint","lipgloss","blush","balm","foundation","lipstick","concealer","eyeshadow","mascara","oil"] #
speed_list=["short","speed","quick","fast","routine"]
#adjective_list=["color","colour","shade","swatch"]
#brand_noun_list=["fenty","dior"]
budget_list=["dupe","cheap","drugstore"] #"budget","affordable"
self_ref_list=["viral","short","popular","fav","best","cute","easy","trend","makeup","beauty"]
acronym_list=["grwm","ootd","asmr"]
kniche_list=["kbeauty","korean"]
#texture_list=["jelly","gel","matte","glitter"]
#season=["summer","winter","fall","valentine","spring"]

def contains_substring(text, substring_list):
    """Return True if any entry of ``substring_list`` occurs in ``text``.

    The comparison ignores case on both sides. An empty ``substring_list``
    gives False.
    """
    return any(candidate.lower() in text.lower() for candidate in substring_list)

# One 0/1 indicator per keyword group: 1 when the text contains any of the group's substrings.
def contains_products(text):
    """Return 1 if ``text`` contains an entry of product_list, else 0."""
    return int(contains_substring(text, product_list))


def contains_budget(text):
    """Return 1 if ``text`` contains an entry of budget_list, else 0."""
    return int(contains_substring(text, budget_list))


def contains_self_ref(text):
    """Return 1 if ``text`` contains an entry of self_ref_list, else 0."""
    return int(contains_substring(text, self_ref_list))


def contains_acronym(text):
    """Return 1 if ``text`` contains an entry of acronym_list, else 0."""
    return int(contains_substring(text, acronym_list))


def contains_kniche(text):
    """Return 1 if ``text`` contains an entry of kniche_list, else 0."""
    return int(contains_substring(text, kniche_list))


def contains_speed(text):
    """Return 1 if ``text`` contains an entry of speed_list, else 0."""
    return int(contains_substring(text, speed_list))


def contains_skills(text):
    """Return 1 if ``text`` contains an entry of skills_list, else 0."""
    return int(contains_substring(text, skills_list))


def contains_comparison(text):
    """Return 1 if ``text`` contains an entry of comparison_list, else 0."""
    return int(contains_substring(text, comparison_list))


def contains_skincare(text):
    """Return 1 if ``text`` contains an entry of skincare_list, else 0."""
    return int(contains_substring(text, skincare_list))


#skincare and acronym are somewhat poorly performing, but I left them in for now, and we can decide?
#the three groups I would recommend are "comparing_products","skills/teach","speed" and "self_ref". 
#korean is there because it just does very well, would be hard to ignore its effect but doesnt really fit into a group?
#budget is there because...it feels intuitive to mention, just does moderately well

df = df.copy()

# One 0/1 column per keyword group, computed from the title.
_keyword_detectors = {
    "product": contains_products,
    "budget": contains_budget,
    "self_ref": contains_self_ref,
    "acronym": contains_acronym,
    "korean": contains_kniche,
    "speed": contains_speed,
    "skills/teach": contains_skills,
    "skincare": contains_skincare,
    "comparing_products": contains_comparison,
}
for _column, _detector in _keyword_detectors.items():
    df[_column] = df["title"].apply(_detector)

#The following value counts check to make sure there is a decent amount of posts from each category

keywords = ["product", "skills/teach", "speed", "comparing_products", "self_ref", "budget", "korean"]


In [None]:
# Display the column names after all feature columns have been added.
df.columns

In [None]:
# Columns exported for the test-set t-tests.
# "isChannelVerified" used to be listed twice, which made df[features] (and the CSV)
# carry the same column twice; it is now listed once.
# NOTE(review): "prime_hour" is only created by the prime-time cell, which currently
# stops with a NameError on prime_hours; until that is fixed this selection raises KeyError.
features = ["isChannelVerified", "any_ht", "commentsCount", "likes", "numberOfSubscribers", "text", "title", "viewCount", "views_per_subscriber", 
            "duration_in_seconds", "date", "hashtag_indicator", "has_any_affiliate", "hasAdinTitle", "hasAdinText", "Engagement_per_Subscriber", 
            "Engagement_per_View", "popular_brand", "prime_hour", "product", "skills/teach", "speed", "comparing_products", "self_ref", "budget", "korean"]

# The row index is written as well (to_csv default), so the file gains an unnamed first column.
df[features].to_csv("../data/new/no_early_dates_all_features_test.csv")