## Feature Engineering

In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from datetime import timedelta
import time
import re
import os
from pandas_summary import DataFrameSummary

In [18]:
# Load the combined raw dataset (path is relative to the working directory).
# The path is assembled with os.path.join so the string carries no backslash
# escapes (the former literal depended on '\C' and '\O' being passed through
# as invalid escape sequences) and uses the running platform's separator.
data = pd.read_excel(os.path.join('final_data_folder', 'Combined', 'Output.xlsx'))

In [19]:
def timeToSeconds(timeStr):
    """Convert a clock-style duration string into a number of seconds.

    Only the text before the first comma is parsed. Two layouts are
    accepted: ``H:M:S`` and ``M:S``. The hour field may be any non-negative
    integer, so durations of 24 hours or more are handled (the previous
    ``time.strptime('%H:%M:%S')`` parse rejected them, as well as ``M:S``
    strings).

    Parameters
    ----------
    timeStr : str
        Duration text such as ``'01:02:03'``, ``'27:15:00'`` or ``'12:34'``.

    Returns
    -------
    float
        Total number of seconds.

    Raises
    ------
    ValueError
        If the text does not match either layout, or if minutes exceed 59
        or seconds exceed 61 (the upper bounds ``time.strptime`` allows).
    """
    clock = timeStr.split(',')[0]
    match = re.fullmatch(r'(?:(\d+):)?(\d{1,2}):(\d{1,2})', clock)
    if match is None:
        raise ValueError("time data %r does not match 'H:M:S' or 'M:S'" % clock)

    # A missing hour group (M:S layout) comes back as None and counts as 0.
    hours, minutes, seconds = (int(part or 0) for part in match.groups())
    if minutes > 59 or seconds > 61:
        raise ValueError("minutes/seconds out of range in %r" % clock)

    return timedelta(hours=hours, minutes=minutes, seconds=seconds).total_seconds()

In [20]:
import emoji
# Mapping whose keys are the emoji strings known to the `emoji` package.
# Releases before 2.0 expose it as UNICODE_EMOJI["en"]; that attribute was
# removed in emoji 2.0, where EMOJI_DATA (also keyed by emoji string) is the
# replacement. The old attribute is tried first so results are unchanged on
# installations that still have it.
try:
    EMOJIS = emoji.UNICODE_EMOJI["en"]
except AttributeError:
    EMOJIS = emoji.EMOJI_DATA

def extract_emojis(s):
    """Return the emoji characters found in ``s``, concatenated in order.

    The string is scanned one character at a time and each character is
    looked up in ``EMOJIS``, so only emoji that consist of a single code
    point are picked up. Returns an empty string when none is found.
    """
    return ''.join(ch for ch in s if ch in EMOJIS)

## Lemmatise Text 

In [21]:
import nltk

# Shared NLTK helpers, created once: a WordNet lemmatizer and a tokenizer
# that splits text on whitespace.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Tokens are produced by the module-level ``w_tokenizer``, each one is
    passed through the module-level ``lemmatizer``, and the results are
    joined back together with single spaces.
    """
    tokens = w_tokenizer.tokenize(text)
    return " ".join(map(lemmatizer.lemmatize, tokens))

# Lemmatized version of every title, stored alongside the original column.
data['title_lemmatized'] = data['title'].map(lemmatize_text)

In [22]:
def search(s, search_conditions):
    """Report whether any regex in ``search_conditions`` matches within ``s``.

    Patterns are tried in order with ``re.search`` and evaluation stops at
    the first hit. Returns 1 if some pattern matches, otherwise 0 (also
    when ``search_conditions`` is empty).
    """
    return int(any(re.search(pattern, s) for pattern in search_conditions))

In [23]:
s = 'bitcoin to $41,100'  # sample title; not used by the code in this cell

# Regex patterns that mark a title as containing a prediction; more can be added.
# Raw strings keep the backslashes literal (the former plain strings contained
# the invalid escape sequences '\$' and '\d'). The leading \b requires "to" to
# start at a word boundary, so words that merely end in "to" followed by a
# number (e.g. "crypto 2021") no longer count as a prediction.
search_conditions = [
    r'\bto \$\d+',   # e.g. 'to $41000'
    r'\bto \d+',     # e.g. 'to 41000'
    'predict',
]

# 1 when any pattern matches the title, 0 otherwise.
data['HasPrediction'] = data['title'].apply(lambda x: search(x, search_conditions))

## Adding features learnt from EDA and domain knowledge

In [24]:
# Symbol- and word-count features derived from the raw title text.
data['HasHashtag'] = data['title'].map(lambda title: 1 if '#' in title else 0)   # 1 if the title contains a hashtag sign, 0 otherwise
data['HashtagCount'] = data['title'].map(lambda title: title.count('#'))          # number of hashtag signs (#)
data['ExclamationCount'] = data['title'].map(lambda title: title.count('!'))      # number of exclamation marks (!)
data['QuestionCount'] = data['title'].map(lambda title: title.count('?'))         # number of question marks (?)
data['NumWords'] = data['title'].str.split().map(len)                             # number of whitespace-separated words

# 1 if the title has at least one all-uppercase word, 0 otherwise; a word must
# be at least two characters long to count. Plain boolean logic replaces the
# former np.where call on scalars, which built a throwaway 0-d array for every
# word just to express "len(word) > 1 and word.isupper()".
data['HasCaps'] = data['title'].str.split().apply(
    lambda words: any(len(word) > 1 and word.isupper() for word in words)
).astype(int)

current_date = datetime.now()                     # reference time: DaysPosted depends on when this cell is run
data['date'] = pd.to_datetime(data['date'])
# Whole days between each video's posting date and current_date, computed with
# one vectorised subtraction and the .dt.days accessor instead of two row-wise
# apply passes.
data['DaysPosted'] = (pd.Timestamp(current_date) - data['date']).dt.days

# Duration string of each row converted to total seconds via timeToSeconds.
data['Total Seconds'] = data['Duration'].map(timeToSeconds)
# The same durations grouped into 10 bins by pd.cut.
data['Total Seconds bins'] = pd.cut(x=data['Total Seconds'], bins=10)

# Emoji characters found in each title, concatenated into one string per row.
data['emojis'] = data['title'].map(extract_emojis)

# Target column: Views divided by Subscribers.
data['target'] = data['Views'].div(data['Subscribers'])

## Drop duplicate titles, keeping only the most recent row for each title

In [25]:
# Order the rows by date, then keep only the last (latest-dated) row for each title.
data_clean = (
    data
    .sort_values(by='date')
    .drop_duplicates(subset='title', keep='last')
)

In [26]:
# Per-column statistics of the de-duplicated frame (counts, uniques, missing,
# types), transposed so that each column of data_clean is shown as one row.
dfs2 = DataFrameSummary(data_clean)
dfs2.columns_stats.transpose()

Unnamed: 0,counts,uniques,missing,missing_perc,types
Unnamed: 0,26196,1691,0,0%,numeric
Channel Name,26196,114,0,0%,categorical
title,26196,26196,0,0%,unique
date,26196,26172,0,0%,date
thumbnailUrl,26196,26196,0,0%,unique
Views,26196,19210,0,0%,numeric
Description,25957,18552,239,0.91%,categorical
Duration,26196,4556,0,0%,categorical
Like,26196,6394,0,0%,numeric
Subscribers,26196,112,0,0%,numeric


In [27]:
# Frequency of each distinct emoji string across the de-duplicated titles.
data_clean.loc[:, 'emojis'].value_counts()

         22127
🚨          502
🔴          234
⚠          225
⚠⚠         152
         ...  
🌎            1
🚨⛔           1
✨💰✨          1
❌😰😰❌         1
🙉✅✨❤💸        1
Name: emojis, Length: 801, dtype: int64

In [28]:
# Location of the data folder under the current working directory.
# os.path.join now receives the two components separately and inserts the
# platform's separator; the former single, pre-concatenated argument made the
# join a no-op and baked a Windows backslash into the path.
cwd = os.getcwd()
path = os.path.join(cwd, "final_data_folder")

In [29]:
# Write the de-duplicated frame to <path>/Combined/ProcessedData.xlsx without
# the index column. os.path.join supplies the platform's separator in place of
# the hard-coded backslashes.
data_clean.to_excel(os.path.join(path, "Combined", "ProcessedData.xlsx"), index=False)