Visualization of N-gram Frequencies

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import string

# Load the dataset from 'main.csv' (a relative path, resolved against the kernel's working directory).
# NOTE(review): the code below reads a 'cleaned' column from this frame, so the CSV is
# expected to already contain one — confirm against the file that produces main.csv.
data = pd.read_csv('main.csv')

# Function to clean and prepare text
def clean_text(text):
    """Lower-case a document and strip ASCII punctuation.

    Parameters
    ----------
    text : object
        Usually a ``str``. Missing values (``None``, float ``NaN``, ``pd.NA``)
        are accepted because pandas yields them for empty CSV cells.

    Returns
    -------
    str
        The lower-cased text with every character of ``string.punctuation``
        removed. Missing values give ``''``; other non-string scalars are
        converted with ``str()`` before cleaning.
    """
    if not isinstance(text, str):
        # An empty cell arrives as NaN/None; calling .lower() on it would raise
        # AttributeError and abort the whole column-wise apply.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    # Convert to lower case and remove punctuation
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Normalise every document in the 'cleaned' column (lower-case, punctuation removed),
# writing the result back to the same column.
data['cleaned'] = data['cleaned'].map(clean_text)

# Function to create n-gram frequency plot
def plot_ngrams(n, max_features=26):
    """Show a Plotly bar chart of the most frequent word n-grams in ``data['cleaned']``.

    Parameters
    ----------
    n : int
        Size of the n-grams to count (``ngram_range=(n, n)``).
    max_features : int, default 26
        Number of n-grams kept by ``CountVectorizer``. The top-ranked n-gram is
        dropped before plotting, so the default leaves 25 bars.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Notes
    -----
    Reads the module-level ``data`` frame. English stop words are removed by
    the vectorizer before n-grams are counted.
    """
    # Create CountVectorizer object for n-grams
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english', max_features=max_features)

    # Fit and transform the data
    ngrams = vectorizer.fit_transform(data['cleaned'])

    # Sum the per-document counts into one total per n-gram
    sum_ngrams = ngrams.sum(axis=0)
    ngrams_freq = [(word, sum_ngrams[0, idx]) for word, idx in vectorizer.vocabulary_.items()]

    # Sort n-grams by frequency, most frequent first
    ngrams_freq = sorted(ngrams_freq, key=lambda x: x[1], reverse=True)

    # NOTE(review): the single most frequent n-gram is skipped here, as in the
    # original code; the reason is not visible in this notebook — confirm it is
    # intentional (e.g. a source-name token dominating the chart).
    shown = ngrams_freq[1:]

    # Create a DataFrame
    df_ngrams = pd.DataFrame(shown, columns=['N-gram', 'Frequency'])

    # Prefix for the title; fall back to the number itself so that sizes other
    # than 1-4 no longer fail with an unbound `grams` variable.
    grams = {1: 'Uni', 2: 'Bi', 3: 'Tri', 4: 'Quad'}.get(n, str(n))

    # Plot using Plotly. The title reports the number of bars actually drawn
    # instead of a hard-coded 25, so it stays correct for any max_features.
    fig = px.bar(df_ngrams, x='N-gram', y='Frequency', title=f'Top {len(df_ngrams)} {grams}-grams',
                 template='plotly_dark', color='Frequency', color_continuous_scale=px.colors.sequential.Viridis)
    fig.show()

# Render one frequency chart per n-gram size, in this order:
# unigrams (1), bigrams (2), trigrams (3) and quadgrams (4).
for ngram_size in (1, 2, 3, 4):
    plot_ngrams(ngram_size)

Topic Modeling with Latent Dirichlet Allocation

In [6]:
## Topic Modeling

from gensim.parsing.preprocessing import preprocess_string
from gensim import corpora, models
import gensim
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Load the dataset again from disk. This rebinds `data`, so the lower-cased,
# punctuation-stripped 'cleaned' column written by the n-gram cell is discarded and
# topic modeling starts from the CSV's own 'cleaned' values.
# NOTE(review): `pd` is not imported in this cell; it relies on the import in the earlier cell.
data = pd.read_csv('main.csv')

# Preprocess the text
def preprocess(text):
    """Tokenise one document with gensim's ``preprocess_string``.

    No ``filters`` argument is passed, so gensim's default filter chain is
    used. Returns whatever ``preprocess_string`` yields for ``text``.
    """
    tokens = preprocess_string(text)
    return tokens

# Apply preprocessing: run `preprocess` on every entry of the 'cleaned' column,
# giving one preprocessed document per row.
processed_docs = data['cleaned'].map(preprocess)

# Create a dictionary and corpus needed for Topic Modeling:
# `dictionary` is built from all processed documents, and `corpus` holds the
# `doc2bow` (bag-of-words) representation of each document in the same order.
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# LDA model: 5 topics, 15 passes over the corpus; random_state=42 is fixed so
# repeated runs use the same seed.
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=42)

# Display the topics: each printed item is a (topic id, weighted-term string)
# tuple listing the 5 highest-weighted terms of that topic.
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Visualize the topics
# enable_notebook() must run before the display call so pyLDAvis renders inline.
pyLDAvis.enable_notebook()
# Visualize the topics with adjusted size.
# sort_topics=False is passed so the visualization is not asked to reorder topics
# — presumably to keep its numbering aligned with the printed ids; confirm in pyLDAvis docs.
lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False, 
                                plot_opts={'width': 800, 'height': 600})  # Adjust width and height as needed
pyLDAvis.display(lda_display)

(0, '0.026*"season" + 0.024*"forc" + 0.023*"wife" + 0.022*"premier" + 0.022*"design"')
(1, '0.165*"thehil" + 0.066*"trump" + 0.019*"report" + 0.015*"dem" + 0.015*"deport"')
(2, '0.032*"lawyer" + 0.032*"joe" + 0.031*"giudic" + 0.029*"christian" + 0.012*"peopl"')
(3, '0.031*"win" + 0.022*"past" + 0.013*"state" + 0.013*"lead" + 0.011*"beat"')
(4, '0.022*"return" + 0.022*"star" + 0.022*"new" + 0.020*"deni" + 0.020*"william"')



np.find_common_type is deprecated.  Please use `np.result_type` or `np.promote_types`.
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)

