# Ideas
Search for words that appear most commonly together with others (e.g., Trump, Biden, president, etc.)

In [2]:
from nltk.corpus import stopwords
from tqdm import tqdm

import numpy as np
import pandas as pd
import warnings
import spacy

warnings.filterwarnings("ignore")
en_stopwords = stopwords.words('english')
nlp = spacy.load('en_core_web_lg')

In [22]:
fillers = [
    'like',
    'um',
    'uh',
    'uhm',
    'you know',
    'well',
    'so',
    'basically',
    'actually',
    'literally',
    'kind of',
    'sort of',
    'kinda',
    'sorta',
    'really',
    'totally',
    'seriously',
    'just',
    'maybe',
    'probably'
]

In [3]:
data = pd.read_csv('../data/processed/cleaned_transcripted_dataset.csv')
print(data.shape)
data.head()

(1152, 40)


Unnamed: 0,id,search_tag,create_time,video_desc,stickers_on_video,video_hashtags,video_duration_in_sec,is_duet_enabled,duet_from_id,music_id,...,video_url,transcription,video_contains_music,video_contains_male,video_contains_female,video_contains_noise,video_contains_no_energy,segments,total_music_duration,percentage_of_video_made_of_music
0,6746590978280279301,joebiden,2019-10-11 12:58:42-04:00,so the former Vice President and I are best bu...,,"joebiden, foryoupage",32,True,0,6746567236959406853,...,https://www.tiktok.com/@/video/674659097828027...,Do you know what Dab Me Up is? Do you know wha...,False,True,False,False,False,"[{'label': 'male', 'start': 0.0, 'end': 32.92}]",0.0,0.0
1,6748582567995378949,maga,2019-10-16 21:47:08-04:00,“Your organization is terrible” #maga #fyp,Trump talking to CNN reporter,"maga, fyp",20,True,0,6748577002363652870,...,https://www.tiktok.com/@/video/674858256799537...,"Go ahead, go ahead. No, not you, not you. Your...",False,True,False,False,False,"[{'label': 'male', 'start': 0.0, 'end': 20.580...",0.0,0.0
2,6752944407395175686,trump2020,2019-10-28 15:53:18-04:00,"The hat didn’t fit, but can the vid go viral s...",,"fyp, foryou, trump2020, republican",10,True,0,6744484944707406597,...,https://www.tiktok.com/@/video/675294440739517...,There's not any.,True,True,False,True,True,"[{'label': 'noise', 'start': 0.0, 'end': 2.260...",4.54,45.4
3,6755172553632926981,trump2020,2019-11-03 14:59:36-05:00,Keep America great #donaldtrump #trump2020 #tr...,Trump 20 20,"donaldtrump, trump2020, trump, trumptrain, ins...",38,True,0,6755152356901423877,...,https://www.tiktok.com/@/video/675517255363292...,If I give you one message to hold in your hear...,True,True,False,False,True,"[{'label': 'male', 'start': 0.0, 'end': 4.24},...",16.22,42.684211
4,6758547542276852998,maga,2019-11-12 17:16:18-05:00,#trump2020 #foryourpage #maga #maga #kag,,"trump2020, foryourpage, maga, maga, kag",45,False,0,6758505275898039046,...,https://www.tiktok.com/@/video/675854754227685...,"What? Did I kill them, sir? Are you kidding me?",True,True,False,True,False,"[{'label': 'music', 'start': 0.0, 'end': 11.46...",29.76,66.133333


# Preprocessing

In [5]:
data['full_text'] = data['video_desc'].fillna('') + ' ' + data['transcription'] + ' ' + data['stickers_on_video'].fillna('')
data['full_text'] = data['full_text'].str.lower()

print(data.full_text.iloc[1])

“your organization is terrible” #maga #fyp go ahead, go ahead. no, not you, not you. your organization's terrible. your organization's terrible. let's go. go ahead. quiet, quiet. go ahead, she's asking a question. don't be rude. don't be rude. no, i'm not going to give you a ques- i'm not going to give you a question. you are fake news. trump talking to cnn reporter


# Lemmatization

In [6]:
lemmas = []
for doc in tqdm(nlp.pipe(data['full_text'], disable=["tok2vec", "tagger", "parser", "attribute_ruler"], n_process=-1), total=len(data['full_text'])):
    doc_lemmas = []
    for token in doc:
        str_token = str(token)
        if not token.is_punct and str_token not in en_stopwords:
            doc_lemmas.append(token.lemma_)
    lemmas.append(' '.join(doc_lemmas))

data['full_text_lemmas'] = lemmas

100%|██████████| 1152/1152 [01:30<00:00, 12.68it/s] 


In [8]:
data.full_text_lemmas.head(5)

0    former vice president best buds joebiden foryo...
1    organization terrible maga fyp go ahead go ahe...
2    hat n’t fit vid go viral trump sees it? 😂 #fyp...
3    keep america great donaldtrump trump2020 trump...
4    trump2020 foryourpage maga maga kag kill sir k...
Name: full_text_lemmas, dtype: object

# Removing fillers

In [23]:
for filler in fillers:
    filler = ' {} '.format(filler)
    data['full_text_lemmas'] = data['full_text_lemmas'].str.replace(filler, ' ')

# Saving dataset

In [7]:
data.to_csv('../data/processed/preprocessed_lemma_dataset.csv', index=False)