In [None]:
import pandas as pd

# Load the pickled document-term matrix and transpose it in one step, so that
# the original rows become columns (later cells iterate over `data.columns`).
data = pd.read_pickle('dtm.pkl').T

# Preview the first rows of the transposed frame to check its structure.
data.head()


In [None]:
# Map every column of `data` (one per politician) to its 30 highest-count
# words, stored as a list of (word, count) tuples in descending count order.
top_dict = {}

for politician, term_counts in data.items():
    # Rank the column's counts from largest to smallest and keep the first 30.
    ranked = term_counts.sort_values(ascending=False).iloc[:30]
    top_dict[politician] = list(zip(ranked.index, ranked.values))

# Show the resulting mapping
top_dict


In [None]:
from collections import Counter

# Flatten every politician's top-word list into one list of words,
# discarding the per-politician counts.
words = [
    word
    for politician in data.columns
    for word, _ in top_dict[politician]
]

# Tally how often each word shows up across those lists (most frequent first)
# and keep the ones that appear more than 6 times as extra stop words.
word_counts = Counter(words).most_common()
add_stop_words = [word for word, occurrences in word_counts if occurrences > 6]
add_stop_words


In [None]:
# List each distinct entry of `words` with the number of times it appears,
# ordered from most to least frequent.
Counter(words).most_common()

In [None]:
# Words appearing more than 6 times in `words` become additional stop words;
# the single-letter token 't' is always appended to the end of the list.
add_stop_words = [
    term
    for term, occurrences in Counter(words).most_common()
    if occurrences > 6
] + ['t']
add_stop_words

In [None]:
# Rebuild the document-term matrix with the additional stop words removed,
# then persist both the fitted vectorizer and the new matrix.
import pickle

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Read in the cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

# Combine scikit-learn's built-in English stop words with the extra ones
stop_words = list(text.ENGLISH_STOP_WORDS.union(add_stop_words))

# Count word occurrences in each transcript, ignoring every stop word
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean.transcript)

# Dense DataFrame: one row per entry of `data_clean`, one column per remaining word
data_stop = pd.DataFrame(
    data_cv.toarray(),
    index=data_clean.index,
    columns=cv.get_feature_names_out(),
)

# Pickle the fitted CountVectorizer for later use. The context manager makes
# sure the file handle is flushed and closed even if pickling raises.
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

# Pickle the transformed data for later use
data_stop.to_pickle("dtm_stop.pkl")


In [None]:
# Word clouds: configure a single WordCloud renderer.
# Install from a Terminal / Anaconda Prompt: conda install -c conda-forge wordcloud
from wordcloud import WordCloud

wc = WordCloud(
    stopwords=stop_words,        # skip the extended stop-word list
    background_color="white",
    colormap="Dark2",
    max_font_size=150,
    random_state=42,             # fixed seed for a repeatable layout
)

In [None]:
import matplotlib.pyplot as plt

# Set the default figure size for plots
plt.rcParams['figure.figsize'] = [16, 6]

# Display names used as subplot titles. Entry i labels the i-th column of
# `data`, so the two are paired purely by position.
# NOTE(review): confirm this order matches `data.columns`.
full_names = [
    'Abraham Lincoln', 'BR Ambedkar', 'Boris Johnson', 'Willy Brandt',
    'Desmond Tutu', 'Franklin Roosevelt', 'Mahatma Gandhi', 'Nelson Mandela',
    'Margaret Thatcher', 'Donald Trump', 'George Washington', 'Winston Churchill',
]

# Draw one word cloud per politician on a 3 x 4 grid of subplots
for index, politician in enumerate(data.columns):
    # Regenerate the shared cloud from the current politician's transcript
    wc.generate(data_clean.transcript[politician])

    # Subplot positions are 1-based, hence index + 1
    plt.subplot(3, 4, index + 1)
    plt.imshow(wc, interpolation="bilinear")

    # Hide the axes and title the panel with the politician's full name
    plt.axis("off")
    plt.title(full_names[index])

# Show the plot with all subplots
plt.show()


In [None]:
# A non-zero cell in the document-term matrix means the word occurs at least
# once, so counting non-zero cells per column gives each politician's number
# of distinct words.
import numpy as np

unique_list = [
    np.count_nonzero(data[name].to_numpy())
    for name in data.columns
]

# Pair each full name with its count (positional pairing via zip)
data_words = pd.DataFrame(
    list(zip(full_names, unique_list)),
    columns=['politician', 'unique_words'],
)

# Order the rows by the number of unique words, ascending
data_unique_sort = data_words.sort_values('unique_words')
data_unique_sort


In [None]:
import plotly.express as px

# Interactive bar chart of the unique-word counts stored in `data_unique_sort`
fig = px.bar(
    data_unique_sort,
    x='politician',
    y='unique_words',
    title='Unique Words Count for Each politician',
    labels={'politician': 'politician', 'unique_words': 'Unique Words Count'},
)

# Custom hover text showing the bar's politician and count
fig.update_traces(hovertemplate='politician: %{x}<br>Unique Words Count: %{y}')

# Axis titles, slanted tick labels for readability, and no legend
fig.update_layout(
    xaxis_title='politician',
    yaxis_title='Unique Words Count',
    xaxis_tickangle=-45,
    showlegend=False,
)

# Show the plot
fig.show()
