In [16]:
import pandas as pd
import json
import string
import textwrap
import nltk
import ssl
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation

In [4]:
with open('/Users/sunmingrun/Documents/GitHub/KamalaGPT/extracted_summaries.json', 'r') as f:
    data = json.load(f)

In [12]:
print(json.dumps(data, indent=4))

[
    "Kamala Harris' campaign team is reportedly vetting retired Admiral William McRaven, who oversaw the May 2011 Navy SEAL raid that killed Osama bin Laden, as a potential 2024 running mate for Vice President Kamala Harris. Other potential candidates include Arizona Senator Mark Kelly, Pennsylvania Governor Josh Shapiro, and Kentucky Governor Andy Beshear. McRaven was commander of the U.S. Joint Special Operations Command between 2008 and 2011, and served as head of the United States Special Ops Command from August 2011 to 2014. He also served as Chancellor of the University of Texas System for over three years. As of Thursday morning, bookmaker William Hill had Senator Kelly as the firm favorite to be the Democrat's 2024 vice-presidential candidate with odds of 42.1 percent and 28.6 percent.",
    "Vice President Kamala Harris' list of potential running mates has narrowed to include one U.S. senator and several prominent Democratic governors. The Wall Street Journal reported that p

In [5]:
def to_lowercase(text):
    return text.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Combine all preprocessing steps
def preprocess_text(text):
    text = to_lowercase(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = lemmatize_words(text)
    return text


In [22]:
df = pd.DataFrame(data, columns=["summary"])
print(df)


                                               summary
0    Kamala Harris' campaign team is reportedly vet...
1    Vice President Kamala Harris' list of potentia...
2    Kamala Harris is currently vetting three poten...
3    Vice President Kamala Harris has defended Pres...
4    Pennsylvania Gov. Josh Shapiro is being consid...
..                                                 ...
665  Former White House press secretary, Sean Spice...
666                                                   
667  The Kamala Harris campaign has provided a pote...
668  Vice President Kamala Harris is expected to re...
669  Vice President Kamala Harris is reportedly pla...

[670 rows x 1 columns]


In [23]:
df['cleaned_summaries'] = df['summary'].apply(preprocess_text)

# View the cleaned summaries
print(df['cleaned_summaries'].head())
df.to_csv('/Users/sunmingrun/Documents/GitHub/ElectionGPT-repo/cleaned_summary.csv', index=False)


0    kamala harris campaign team reportedly vetting...
1    vice president kamala harris list potential ru...
2    kamala harris currently vetting three potentia...
3    vice president kamala harris defended presiden...
4    pennsylvania gov josh shapiro considered poten...
Name: cleaned_summaries, dtype: object


In [10]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['cleaned_summaries'])



In [11]:
# Calculate Relevance Score
df['relevance_score'] = tfidf_matrix.sum(axis=1)



In [16]:
# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()
df['sentiment'] = df['cleaned_summaries'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
print(df)


                                               summary  \
0    Kamala Harris' campaign team is reportedly vet...   
1    Vice President Kamala Harris' list of potentia...   
2    Kamala Harris is currently vetting three poten...   
3    Vice President Kamala Harris has defended Pres...   
4    Pennsylvania Gov. Josh Shapiro is being consid...   
..                                                 ...   
665  Former White House press secretary, Sean Spice...   
666                                                      
667  The Kamala Harris campaign has provided a pote...   
668  Vice President Kamala Harris is expected to re...   
669  Vice President Kamala Harris is reportedly pla...   

                                     cleaned_summaries  relevance_score  \
0    kamala harris campaign team reportedly vetting...         7.662068   
1    vice president kamala harris list potential ru...         6.457972   
2    kamala harris currently vetting three potentia...         7.233046   
3  

In [17]:
# Topic Modeling
topic_distributions = lda.transform(tfidf_matrix)

# Create a DataFrame from the topic distributions
topic_df = pd.DataFrame(topic_distributions, columns=[f'Topic_{i+1}' for i in range(topic_distributions.shape[1])])

# Concatenate the original DataFrame with the topic distribution DataFrame
df = pd.concat([df, topic_df], axis=1)

# Display the updated DataFrame
print(df.head())




                                             summary  \
0  Kamala Harris' campaign team is reportedly vet...   
1  Vice President Kamala Harris' list of potentia...   
2  Kamala Harris is currently vetting three poten...   
3  Vice President Kamala Harris has defended Pres...   
4  Pennsylvania Gov. Josh Shapiro is being consid...   

                                   cleaned_summaries  relevance_score  \
0  kamala harris campaign team reportedly vetting...         7.662068   
1  vice president kamala harris list potential ru...         6.457972   
2  kamala harris currently vetting three potentia...         7.233046   
3  vice president kamala harris defended presiden...         8.448313   
4  pennsylvania gov josh shapiro considered poten...         5.541658   

   sentiment   Topic_1   Topic_2   Topic_3   Topic_4   Topic_5  
0     0.7906  0.023295  0.906983  0.023182  0.023184  0.023356  
1     0.5106  0.027173  0.027013  0.027084  0.026940  0.891790  
2     0.8625  0.902283  0.024

In [19]:
# Display the first few rows of the resulting DataFrame
print(df.head())

# Save the result for further analysis
df.to_csv('/Users/sunmingrun/Documents/GitHub/ElectionGPT-repo/nlp_analysis_results.csv', index=False)

                                             summary  \
0  Kamala Harris' campaign team is reportedly vet...   
1  Vice President Kamala Harris' list of potentia...   
2  Kamala Harris is currently vetting three poten...   
3  Vice President Kamala Harris has defended Pres...   
4  Pennsylvania Gov. Josh Shapiro is being consid...   

                                   cleaned_summaries  relevance_score  \
0  kamala harris campaign team reportedly vetting...         7.662068   
1  vice president kamala harris list potential ru...         6.457972   
2  kamala harris currently vetting three potentia...         7.233046   
3  vice president kamala harris defended presiden...         8.448313   
4  pennsylvania gov josh shapiro considered poten...         5.541658   

   sentiment   Topic_1   Topic_2   Topic_3   Topic_4   Topic_5  
0     0.7906  0.023295  0.906983  0.023182  0.023184  0.023356  
1     0.5106  0.027173  0.027013  0.027084  0.026940  0.891790  
2     0.8625  0.902283  0.024

In [6]:




# Tokenize the text
words = word_tokenize(text)

# Get English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords
filtered_words = [word for word in words if word.lower() not in stop_words]

print(filtered_words)
# Output: ['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']

['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [17]:
text = "Kamala Harris' campaign team is reportedly vetting retired Admiral William McRaven, who oversaw the May 2011 Navy SEAL raid that killed Osama bin Laden, as a potential 2024 running mate for Vice President Kamala Harris. Other potential candidates include Arizona Senator Mark Kelly, Pennsylvania Governor Josh Shapiro, and Kentucky Governor Andy Beshear. McRaven was commander of the U.S. Joint Special Operations Command between 2008 and 2011, and served as head of the United States Special Ops Command from August 2011 to 2014. He also served as Chancellor of the University of Texas System for over three years. As of Thursday morning, bookmaker William Hill had Senator Kelly as the firm favorite to be the Democrat's 2024 vice-presidential candidate with odds of 42.1 percent and 28.6 percent. Vice President Kamala Harris' list of potential running mates has narrowed to include one U.S. senator and several prominent Democratic governors. The Wall Street Journal reported that potential vice presidential candidates had been asked to provide vetting materials to the Harris campaign. This comes after President Joe Biden ended the presidential race and endorsed Harris. The Harris campaign has reportedly sought vetting materials from Senator Mark Kelly of Arizona, Governor Roy Cooper of North Carolina, Governor Josh Shapiro of Pennsylvania, Governor Gretchen Whitmer of Michigan, Governor J.B. Pritzker of Illinois, and Governor Tim Walz of Minnesota. However, Governor Andy Beshear of Kentucky has reportedly not been asked for vetting materials."
test_processed=preprocess_text(text)
wrapped_text = textwrap.fill(test_processed, width=80)

# Print the wrapped text
print(wrapped_text)


kamala harris campaign team reportedly vetting retired admiral william mcraven
oversaw may 2011 navy seal raid killed osama bin laden potential 2024 running
mate vice president kamala harris potential candidate include arizona senator
mark kelly pennsylvania governor josh shapiro kentucky governor andy beshear
mcraven commander u joint special operation command 2008 2011 served head united
state special ops command august 2011 2014 also served chancellor university
texas system three year thursday morning bookmaker william hill senator kelly
firm favorite democrat 2024 vicepresidential candidate odds 421 percent 286
percent vice president kamala harris list potential running mate narrowed
include one u senator several prominent democratic governor wall street journal
reported potential vice presidential candidate asked provide vetting material
harris campaign come president joe biden ended presidential race endorsed harris
harris campaign reportedly sought vetting material senator mark