In [14]:
import pandas as pd
import numpy as np
import re
import nltk
import textstat
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from datetime import datetime


nltk.download("stopwords")
nltk.download("wordnet")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rache\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rache\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
data = pd.read_csv('Rachel_transcripts_2.9.23.tsv',sep='\t').drop(columns='Unnamed: 0')

In [16]:
sample = data.transcript[0]

sample

'SummaryWith public trust at a low, the Supreme Court opens new term today. Members of the far right militia group Oath Keepers face seditious conspiracy charges over January 6. Transcript CHRIS HAYES, MSNBC HOST, "ALL IN": Yeah, we should probably have some reporters looking at the volleyball facilities in Baton Rouge.Ashton Pittman, who`s been doing very good reporting on this, thank you very much.ASHTON PITTMAN, JOURNALIST: Thank you.HAYES: That`s "ALL IN" on this Monday night.THE RACHEL MADDOW SHOW starts right now.Good evening, Rachel.RACHEL MADDOW, MSNBC HOST: Good evening, Chris. Thank you for saying that nice thing about my new podcast.HAYES: Oh, it`s so good. It`s good and it`s -- I`m excited to talk to you about it. It made me, oh, good, she is so good at this, she`s so good at this. What is in her -- what is her bag of magic tricks again?MADDOW: What you heard is not -- is like still the rough draft. Like it`s not --HAYES: I know. You and all your -- you and all your weird, 

### Stop word removal 

In [17]:
# Need to lowercase stop words, lowercase text to compare apples to apples 

# remove stop words 

def remove_stopwords(text, stop_word_list = []):
    # first append any new stop words if there are any
    if len(stop_word_list) == 0: 
        stop_words = set(stopwords.words("english"))
    else: 
        stop_words = set(stopwords.words("english"))
        stop_words.update(stop_word_list)
        
    words = re.findall(r'\b\w+\b', text)
    words_without_stopwords = [word for word in words if word.lower() not in stop_words]
    return " ".join(words_without_stopwords)

In [18]:
sample = remove_stopwords(sample)

In [19]:
sample

'SummaryWith public trust low Supreme Court opens new term today Members far right militia group Oath Keepers face seditious conspiracy charges January 6 Transcript CHRIS HAYES MSNBC HOST Yeah probably reporters looking volleyball facilities Baton Rouge Ashton Pittman good reporting thank much ASHTON PITTMAN JOURNALIST Thank HAYES Monday night RACHEL MADDOW SHOW starts right Good evening Rachel RACHEL MADDOW MSNBC HOST Good evening Chris Thank saying nice thing new podcast HAYES Oh good good excited talk made oh good good good bag magic tricks MADDOW heard like still rough draft Like HAYES know weird neurotic self loathing staff share incredible inability take compliment guess final final MADDOW true going 400 drafts heard going public HAYES Well heard still awesome MADDOW kind talk friend Thank much HAYES right MADDOW thanks home joining us hour Happy upon time long ago grand scheme things upon time biggest celebrities America pilots know Time Magazine person year used man year Time M

In [20]:
data['stop words removed'] = [remove_stopwords(text, ['Rachel', 'Maddow']) for text in data.transcript]

### Lemmatization

In [21]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(lemmatized_words)

In [22]:
sample = lemmatize_text(sample)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\rache/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.752.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.752.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.752.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\rache\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [23]:
#sample

In [24]:
data['stop words and lemmatization'] = [lemmatize_text(text) for text in data['stop words removed']]

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\rache/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.752.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.752.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.752.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\rache\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


### Removing 'Updated' details in timestamp and converting to Datetime

In [25]:
data.head()

Unnamed: 0,Unnamed: 0.1,URL,timestamp,transcript,stop words removed
0,0,https://www.msnbc.com/transcripts/rachel-maddo...,"Oct. 4, 2022, 1:00 AM UTC / Updated Oct. 4, 20...","SummaryWith public trust at a low, the Supreme...",SummaryWith public trust low Supreme Court ope...
1,1,https://www.msnbc.com/transcripts/rachel-maddo...,"Oct. 4, 2022, 1:00 AM UTC / Updated Oct. 4, 20...","SummaryWith public trust at a low, the Supreme...",SummaryWith public trust low Supreme Court ope...
2,2,https://www.msnbc.com/transcripts/rachel-maddo...,"Sept. 27, 2022, 1:00 AM UTC","Summary ""American Psychosis"" examines the rise...",Summary American Psychosis examines rise extre...
3,3,https://www.msnbc.com/transcripts/rachel-maddo...,"Sept. 27, 2022, 1:00 AM UTC","Summary ""American Psychosis"" examines the rise...",Summary American Psychosis examines rise extre...
4,4,https://www.msnbc.com/transcripts/rachel-maddo...,"Sept. 13, 2022, 1:00 AM UTC","Summary Interview with Geoffrey Berman, the U....",Summary Interview Geoffrey Berman U attorney T...


In [26]:
# removing any extra dates/characters due to the 'updated' timestamp on some transcripts
def keep_first_datetime(datetime_str):
    first_part = datetime_str.split("/")[0]
    return first_part.strip()

In [27]:
data.timestamp = [keep_first_datetime(timestamp) for timestamp in data.timestamp]

In [28]:
# this function needs work

from datetime import datetime
from dateutil.parser import parse

def string_to_datetime(datetime_str):
    datetime_obj = parse(datetime_str, fuzzy = True)
    return datetime_obj

In [29]:
data.timestamp = [string_to_datetime(timestamp) for timestamp in data.timestamp]

In [30]:
data.head()

Unnamed: 0,Unnamed: 0.1,URL,timestamp,transcript,stop words removed
0,0,https://www.msnbc.com/transcripts/rachel-maddo...,2022-10-04 01:00:00+00:00,"SummaryWith public trust at a low, the Supreme...",SummaryWith public trust low Supreme Court ope...
1,1,https://www.msnbc.com/transcripts/rachel-maddo...,2022-10-04 01:00:00+00:00,"SummaryWith public trust at a low, the Supreme...",SummaryWith public trust low Supreme Court ope...
2,2,https://www.msnbc.com/transcripts/rachel-maddo...,2022-09-27 01:00:00+00:00,"Summary ""American Psychosis"" examines the rise...",Summary American Psychosis examines rise extre...
3,3,https://www.msnbc.com/transcripts/rachel-maddo...,2022-09-27 01:00:00+00:00,"Summary ""American Psychosis"" examines the rise...",Summary American Psychosis examines rise extre...
4,4,https://www.msnbc.com/transcripts/rachel-maddo...,2022-09-13 01:00:00+00:00,"Summary Interview with Geoffrey Berman, the U....",Summary Interview Geoffrey Berman U attorney T...


In [31]:
# Check that the type of the timestamp column is correct
type(data.timestamp[0])

pandas._libs.tslibs.timestamps.Timestamp

In [32]:
data.to_csv('Maddow_cleaned.tsv', sep="\t")

### How many stop words are removed

### Flesch-Kincaid reading levels

In [None]:
# Use the Flesch Reading Ease formula to assess the ease of readability of each transcript (returns the Flesch Reading Ease Score)

data['flesch_reading_ease'] = [textstat.flesch_reading_ease(text) for text in data.transcript]

# Get the Flesch-Kincaid Grade level of each transcript (score of 9.3 means a ninth grader would be able to read the document)
data['flesch_kincaid_grade_level'] = [textstat.flesch_kincaid_grade(text) for text in data.transcript]

In [None]:
# Find the average Flesch Reading Ease Score across all of the Rachel Maddow transcripts


# Find the average Flesch-Kincaid Grade Level across all of the Rachel Maddow transcripts

Unnamed: 0,Unnamed: 0.1,URL,timestamp,transcript,stop words removed,flesch_reading_ease,flesch_kincaid_grade_level
0,0,https://www.msnbc.com/transcripts/rachel-maddo...,2022-10-04 01:00:00+00:00,"SummaryWith public trust at a low, the Supreme...",SummaryWith public trust low Supreme Court ope...,68.70,8.5
1,1,https://www.msnbc.com/transcripts/rachel-maddo...,2022-10-04 01:00:00+00:00,"SummaryWith public trust at a low, the Supreme...",SummaryWith public trust low Supreme Court ope...,68.70,8.5
2,2,https://www.msnbc.com/transcripts/rachel-maddo...,2022-09-27 01:00:00+00:00,"Summary ""American Psychosis"" examines the rise...",Summary American Psychosis examines rise extre...,69.82,8.1
3,3,https://www.msnbc.com/transcripts/rachel-maddo...,2022-09-27 01:00:00+00:00,"Summary ""American Psychosis"" examines the rise...",Summary American Psychosis examines rise extre...,69.82,8.1
4,4,https://www.msnbc.com/transcripts/rachel-maddo...,2022-09-13 01:00:00+00:00,"Summary Interview with Geoffrey Berman, the U....",Summary Interview Geoffrey Berman U attorney T...,63.09,8.6
...,...,...,...,...,...,...,...
115,115,https://www.msnbc.com/transcripts/rachel-maddo...,2022-05-13 01:00:00+00:00,SummaryTop House Republican Kevin McCarthy is ...,SummaryTop House Republican Kevin McCarthy amo...,59.43,10.0
116,116,https://www.msnbc.com/transcripts/rachel-maddo...,2022-05-12 01:00:00+00:00,SummarySenate Republicans and Sen. Joe Manchin...,SummarySenate Republicans Sen Joe Manchin WV b...,68.81,8.5
117,117,https://www.msnbc.com/transcripts/rachel-maddo...,2022-05-12 01:00:00+00:00,SummarySenate Republicans and Sen. Joe Manchin...,SummarySenate Republicans Sen Joe Manchin WV b...,68.81,8.5
118,118,https://www.msnbc.com/transcripts/rachel-maddo...,2022-05-11 01:00:00+00:00,SummaryInterview with Rep. Pete Aguilar (D-CA)...,SummaryInterview Rep Pete Aguilar CA Interview...,61.26,9.3
