In [14]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from datetime import datetime
from dateutil.parser import parse
from dateutil import tz
import warnings

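# One-time NLTK resource downloads (uncomment on a fresh environment).
# NOTE: word_tokenize, used by lemmatize_text below, also needs the Punkt tokenizer
# models ("punkt"; "punkt_tab" on recent NLTK releases).
#nltk.download("stopwords")
#nltk.download("wordnet")
#nltk.download("punkt")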



In [16]:
# Read both tab-separated transcript tables and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier export -- TODO confirm).
data_maddow = (
    pd.read_csv('data/Maddow_transcripts.tsv', sep='\t')
    .drop('Unnamed: 0', axis='columns')
)
data_carlson = (
    pd.read_csv('data/Tucker_transcripts.tsv', sep='\t')
    .drop('Unnamed: 0', axis='columns')
)


In [17]:
# Display the Carlson frame as this cell's output to inspect the loaded columns.
data_carlson

Unnamed: 0,url,timestamp,title,transcript
0,https://www.foxnews.com/opinion/tucker-carlson...,2018-11-27 08:39:00,Tucker Carlson: Socialism with open borders is...,Democrats want to eliminate restrictions on im...
1,https://www.foxnews.com/opinion/tucker-carlson...,2018-11-28 07:16:00,"Tucker Carlson: For the crime of forgetting, J...",Jerome Corsi addresses charges against him in ...
2,https://www.foxnews.com/opinion/tucker-carlson...,2018-11-29 11:12:00,Tucker Carlson: The word 'Russia' eliminates a...,"On Wednesday, Tucker Carlson exposes the flaws..."
3,https://www.foxnews.com/opinion/tucker-carlson...,2018-11-30 09:21:00,Tucker Carlson: The Mueller probe and what we ...,Michael Cohen pleads guilty to lying to Congre...
4,https://www.foxnews.com/opinion/tucker-carlson...,2018-12-04 08:08:00,Tucker Carlson: The Mueller probe continues to...,"On Monday, Tucker Carlson talks to Jerome Cors..."
...,...,...,...,...
808,https://www.foxnews.com/opinion/tucker-carlson...,2023-02-08 22:25:00,TUCKER CARLSON: The lies 'just kept coming' du...,Fox News host Tucker Carlson reacts to Presid...
809,https://www.foxnews.com/opinion/tucker-carlson...,2023-02-09 22:19:00,TUCKER CARLSON: Self-defense is becoming illeg...,WARNING: Graphic footage—Fox News host Tucker ...
810,https://www.foxnews.com/opinion/tucker-carlson...,2023-02-13 22:23:00,"TUCKER CARLSON: Food, water, energy and infras...",Fox News host Tucker Carlson reacts to leaders...
811,https://www.foxnews.com/opinion/tucker-carlson...,2023-02-14 22:51:00,TUCKER CARLSON: Mayor Pete is completely incom...,Fox News host Tucker Carlson calls out the Bid...


### Stop word removal 

In [3]:
# remove stop words 

def remove_stopwords(text, stop_word_list=None):
    """Lower-case ``text`` and remove stop words from it.

    Words are the maximal runs of word characters found in ``text``, so
    punctuation and whitespace are discarded. A word is dropped when its
    lower-cased form is in NLTK's English stop-word list or in
    ``stop_word_list``.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_word_list : iterable of str, optional
        Extra stop words to remove in addition to NLTK's English list. They are
        lower-cased before use, so matching is case-insensitive. ``None`` (the
        default) or an empty iterable means "NLTK list only".

    Returns
    -------
    str
        The surviving words, lower-cased and joined by single spaces.
    """
    # The NLTK English list is always the baseline; extras are merged into it.
    stop_words = set(stopwords.words("english"))
    if stop_word_list is not None:
        stop_words.update(extra.lower() for extra in stop_word_list)

    # Lower-case each word exactly once, then filter on the lowered form.
    lowered_words = (word.lower() for word in re.findall(r'\b\w+\b', text))
    return " ".join(word for word in lowered_words if word not in stop_words)

In [4]:
# Corpus-specific stop words removed in addition to NLTK's English list:
# host/network names, show boilerplate and high-frequency filler words.
# Entries are listed once each, in lower case.
# NOTE(review): stop-word removal runs before lemmatization in this notebook, so
# inflected forms that are not listed here (e.g. 'things') are not filtered by this list.
stop_words_to_add = [
    'from', 'tucker', 'carlson', 'fox', 'news', 'channel', 'host', 'evening', 'welcome',
    'tonight', 'like', 'kind', 'people', 'know', 'knew', 'knowing', 'go', 'goes', 'going',
    'went', 'tell', 'told', 'telling', 'say', 'said', 'saying', 'talk', 'talked', 'talking', 'year',
    'think', 'thinking', 'thought', 'happen', 'happening', 'happened', 'thing', 'time', 'right',
    'country', 'countries', 'america', 'american', 'want', 'state', 'work', 'working', 'worked',
    'point', 'call', 'calling', 'called', 'actual', 'actually', 'years', 'live', 'living',
    'rachel', 'maddow', 'today', 'first', 'one', 'us', 'would', 'get', 'new', 'way',
    'well', 'even', 'velshi', 'also', 'see', '21', 'course', 'lot', 'really', 'could', 'fact',
    'last', 'never', 'thank', 'u', 'got', 'much', 'many', 'video', 'may', 'back', 'quote',
    'day', 'united', 'states', 'every',
]

In [5]:
# Add a 'stop words removed' column to each frame: every transcript filtered
# through remove_stopwords with the custom stop-word list.
data_maddow['stop words removed'] = (
    data_maddow['transcript']
    .apply(remove_stopwords, stop_word_list=stop_words_to_add)
    .tolist()
)
data_carlson['stop words removed'] = (
    data_carlson['transcript']
    .apply(remove_stopwords, stop_word_list=stop_words_to_add)
    .tolist()
)

### Lemmatization

In [6]:
def lemmatize_text(text):
    """Tokenize ``text`` with NLTK and return its lemmatized tokens as one string.

    Each token produced by ``word_tokenize`` is passed to
    ``WordNetLemmatizer.lemmatize`` with that method's default arguments, and
    the results are joined by single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return " ".join(lemmatize(token) for token in word_tokenize(text))

In [7]:
# Lemmatize the stop-word-filtered text of each frame into a new column.
data_maddow['stop words and lemmatization'] = (
    data_maddow['stop words removed'].map(lemmatize_text).tolist()
)
data_carlson['stop words and lemmatization'] = (
    data_carlson['stop words removed'].map(lemmatize_text).tolist()
)

### Converting to Datetime 

In [8]:
# removing any extra dates/characters due to the 'updated' timestamp on some transcripts
def keep_first_datetime(datetime_str):
    """Return the part of ``datetime_str`` before its first "/", stripped.

    A string without "/" is returned whole (minus surrounding whitespace).
    """
    leading, _, _ = datetime_str.partition("/")
    return leading.strip()

In [9]:
# convert timestamp to datetime object

def string_to_datetime(datetime_str):
    """Parse ``datetime_str`` with dateutil's ``parse`` and return the result.

    Parsing is done with ``fuzzy=True``, and the abbreviation "EST" is mapped
    to the zone object returned by ``tz.gettz('EST')``.
    """
    zone_by_abbreviation = {'EST': tz.gettz('EST')}
    return parse(datetime_str, fuzzy=True, tzinfos=zone_by_abbreviation)
    

In [10]:
# Overwrite each timestamp string with only the text before its first "/".
# NOTE: this replaces the raw column in place; re-run the load cell before
# re-running this one.
data_maddow['timestamp'] = data_maddow['timestamp'].map(keep_first_datetime).tolist()
data_carlson['timestamp'] = data_carlson['timestamp'].map(keep_first_datetime).tolist()

In [11]:
# Ignore UserWarning-category warnings raised from modules whose name starts
# with "dateutil"; the filter stays installed for the rest of the kernel session.
warnings.filterwarnings("ignore", category=UserWarning, module="dateutil")

# Replace the trimmed timestamp strings with parsed datetime objects.
data_maddow['timestamp'] = list(map(string_to_datetime, data_maddow['timestamp']))
data_carlson['timestamp'] = list(map(string_to_datetime, data_carlson['timestamp']))

### Eliminate timezone from maddow timestamps

In [12]:
# tz_convert(None) converts the tz-aware values to UTC and then removes the
# timezone information, so this column ends up holding naive UTC timestamps.
# NOTE(review): data_carlson['timestamp'] is not converted here. If its naive
# values are local wall-clock times, the two frames are not on the same clock;
# tz_localize(None) would keep Maddow's local wall time instead. Confirm intent.
data_maddow['timestamp'] = data_maddow['timestamp'].dt.tz_convert(None)

### Export tsv files

In [13]:

# Previously used output location, kept for reference:
#   data/from_git/Maddow_cleaned.tsv
#   data/from_git/Carlson_cleaned.tsv

# Write both cleaned frames as tab-separated files. No `index` argument is
# passed, so to_csv's default applies and the row index is written as well.
data_maddow.to_csv(path_or_buf='data/Maddow_cleaned.tsv', sep='\t')
data_carlson.to_csv(path_or_buf='data/Carlson_cleaned.tsv', sep='\t')