In [1]:
# import libraries
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from transformers.pipelines import pipeline
from bertopic.representation import TextGeneration
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import ZeroShotClassification
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
import pandas as pd
import os
import zipfile
from tqdm.notebook import tqdm
import re
import umap


# Disable truncation when DataFrames are displayed: every column and every row is shown.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [2]:
# Load the two interim CSVs from data/interim_data; each is expected to carry a
# 'comment' column, which the cells below clean and analyse.
df_flexibilities = pd.read_csv("data/interim_data/df_flexibilities.csv")
df_HHFKAs = pd.read_csv("data/interim_data/df_HHFKA.csv")

## Analyzing 2017 and 2020 together. 

### Pre-processing

In [3]:

def preprocess_comments(df, replacement=""):
    """
    Clean the 'comment' column by replacing line-break artifacts.

    Every carriage return, newline, tab, '<br/>' and '<br>' found in a comment is
    replaced with `replacement`. The input DataFrame is not modified; a new
    DataFrame is returned, so re-running the calling cell is safe.

    Parameters:
    df (pd.DataFrame): The input DataFrame that contains a 'comment' column.
    replacement (str): Text substituted for each matched token. The default ''
        deletes the token outright (the historical behaviour). Be aware that this
        joins the words on either side of a line break ('a<br>b' becomes 'ab');
        pass ' ' to keep them as separate words.

    Returns:
    pd.DataFrame: A new DataFrame equal to `df` except for the cleaned 'comment' column.

    Raises:
    KeyError: If `df` has no 'comment' column.
    """
    # Each key is interpreted as a regular expression (regex=True below).
    unwanted_patterns = [r'\r', r'\n', r'\t', r'<br/>', r'<br>']
    substitutions = {pattern: replacement for pattern in unwanted_patterns}

    cleaned_comments = df['comment'].replace(substitutions, regex=True)

    # assign() builds a new DataFrame instead of writing into the caller's object.
    return df.assign(comment=cleaned_comments)

def remove_duplicate_comments(df):
    """
    Keep only the first row for each distinct value of the 'comment' column.

    Parameters:
    df (pd.DataFrame): The input DataFrame that contains a 'comment' column.

    Returns:
    pd.DataFrame: The rows that survive de-duplication, re-indexed from 0.
    """
    # True for every row whose comment already appeared in an earlier row.
    is_repeat = df.duplicated(subset=['comment'], keep='first')

    first_occurrences = df[~is_repeat]
    return first_occurrences.reset_index(drop=True)


# For each dataset: clean the comment text first, then drop rows whose comment
# duplicates an earlier one.
df_flexibilities = df_flexibilities.pipe(preprocess_comments).pipe(remove_duplicate_comments)
df_HHFKAs = df_HHFKAs.pipe(preprocess_comments).pipe(remove_duplicate_comments)

In [4]:
def analyze_word_counts(df):
    """
    Summarise how many whitespace-separated words each comment contains.

    Side effect: a 'word_count' column is added to (or overwritten in) `df` itself.

    Parameters:
    df (pd.DataFrame): The input DataFrame that contains a 'comment' column.

    Returns:
    dict: 'range' (a (min, max) tuple), 'mean', 'median' and 'std' of the word counts.
    """
    def _count_words(text):
        # str() lets non-string values be counted instead of raising on .split().
        return len(str(text).split())

    # Stored on the caller's frame, exactly as before, so later cells can use it.
    df['word_count'] = df['comment'].apply(_count_words)
    counts = df['word_count']

    return {
        'range': (counts.min(), counts.max()),
        'mean': counts.mean(),
        'median': counts.median(),
        'std': counts.std(),
    }

In [5]:
# Word-count summary for the flexibilities comments.
# Note: the call also adds a 'word_count' column to df_flexibilities.
analyze_word_counts(df_flexibilities)

{'range': (1, 2589),
 'mean': 189.36653386454182,
 'median': 108.5,
 'std': 278.00219903842026}

In [6]:
# Word-count summary for the HHFKA comments.
# Note: the call also adds a 'word_count' column to df_HHFKAs.
analyze_word_counts(df_HHFKAs)

{'range': (3, 3579),
 'mean': 191.3264942016057,
 'median': 101.0,
 'std': 297.23630345580835}

### Implementing BERTopic

In [8]:
def implement_bertopic(df, version="v1", random_state=1, umap_kwargs=None):
    """
    Fit a BERTopic model on the 'comment' column of `df` and return it.

    Parameters:
    df (pd.DataFrame): The input DataFrame that contains a 'comment' column.
    version (str): "v1" uses MaximalMarginalRelevance(diversity=0.9) as the topic
        representation model; any other value uses KeyBERTInspired().
    random_state (int): Seed passed to UMAP for reproducibility. Defaults to 1,
        the value that used to be hard-coded.
    umap_kwargs (dict or None): Extra keyword arguments forwarded to umap.UMAP
        (for example n_neighbors, n_components, min_dist, metric). A
        'random_state' entry here takes precedence over `random_state`.

    Returns:
    BERTopic: The fitted topic model.
    """
    # BERTopic documents its input as a list of strings, so hand it a plain list
    # rather than a pandas Series (the document values themselves are unchanged).
    comments = df["comment"].tolist()

    if version == "v1":
        representation_model = MaximalMarginalRelevance(diversity=0.9)
    else:
        representation_model = KeyBERTInspired()

    # NOTE(review): with no umap_kwargs this builds UMAP from umap-learn's own
    # defaults plus the seed, which is not the UMAP configuration BERTopic uses
    # when it creates the model itself (see the BERTopic docs) -- confirm this is
    # intended. The default is left as it was so that previously produced topic
    # ids stay reproducible; use umap_kwargs to opt in to other settings.
    umap_params = {"random_state": random_state, **(umap_kwargs or {})}
    umap_model = umap.UMAP(**umap_params)

    topic_model = BERTopic(
        representation_model=representation_model,
        vectorizer_model=CountVectorizer(stop_words="english"),
        ctfidf_model=ClassTfidfTransformer(),
        umap_model=umap_model,
        calculate_probabilities=True,
        verbose=True,
    )

    # Only the fitted model is returned; fit_transform's return value is not used.
    topic_model.fit_transform(comments)
    return topic_model




##### Implementing it on stance --> flexibilities 

In [9]:
# Fit BERTopic on the flexibilities comments (default version "v1") and display
# the resulting topic table.
topic_model=implement_bertopic(df_flexibilities)
topic_model.get_topic_info()


2024-10-24 12:11:13,746 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2024-10-24 12:11:15,812 - BERTopic - Embedding - Completed ✓
2024-10-24 12:11:15,814 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-24 12:11:21,032 - BERTopic - Dimensionality - Completed ✓
2024-10-24 12:11:21,033 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-24 12:11:21,047 - BERTopic - Cluster - Completed ✓
2024-10-24 12:11:21,049 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-24 12:11:21,213 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,169,-1_students_rule_need_like,"[students, rule, need, like, nutrition, lunch,...",[The California School Nutrition Association (...
1,0,196,0_flavored_nutrition_flexibilities_target,"[flavored, nutrition, flexibilities, target, l...",[I appreciate the opportunity to submit commen...
2,1,44,1_lunches_new_like_healthy,"[lunches, new, like, healthy, rules, people, a...",[I think that the new school lunch rule is gre...
3,2,32,2_meals_flexibilities_provide_offer,"[meals, flexibilities, provide, offer, distric...",[School nutrition professionals depend on the ...
4,3,23,3_sodium_target_good_children,"[sodium, target, good, children, items, need, ...",[I have been a school nutrition professional f...
5,4,20,4_meals_students_years_dont,"[meals, students, years, dont, going, grain, d...",[As a School District Nutrition Services emplo...
6,5,18,5_kids_foods_idea_different,"[kids, foods, idea, different, machines, intol...",[This is a good idea because most kids or youn...


In [10]:
# Merge topics 0 and 3 (ids as listed in the topic table displayed by the previous cell).
# NOTE: merge_topics changes the fitted model and the topic ids shift afterwards,
# so re-running this cell without refitting would merge a different pair of topics.
topic_model.merge_topics(df_flexibilities["comment"], [0, 3])

# Per-document topic info placed side by side with the source columns.
topics_flexibilities = pd.concat(
    [topic_model.get_document_info(df_flexibilities["comment"]), df_flexibilities],
    axis=1,
)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,169,-1_students_rule_need_like,"[students, rule, need, like, nutrition, lunch,...",[Executive Summary The implementation of this...
1,0,219,0_flavored_nutrition_target_flexibilities,"[flavored, nutrition, target, flexibilities, s...",[I appreciate the opportunity to submit commen...
2,1,44,1_lunches_new_like_healthy,"[lunches, new, like, healthy, rules, people, a...",[I think that the new school lunch rule is gre...
3,2,32,2_meals_flexibilities_provide_offer,"[meals, flexibilities, provide, offer, distric...",[School nutrition professionals depend on the ...
4,3,20,3_meals_students_like_years,"[meals, students, like, years, summer, dont, g...",[The school lunches that we are currently prov...
5,4,18,4_kids_foods_idea_different,"[kids, foods, idea, different, machines, intol...",[This is a good idea because most kids or youn...


In [11]:
# Distinct topic ids present in the per-document table after the merge.
set(topics_flexibilities["Topic"])

{-1, 0, 1, 2, 3, 4}

##### Implementing it on stance --> HHFKAs 

In [12]:
# Fit a new BERTopic model on the HHFKA comments and display its topic table.
# This rebinds `topic_model`, so the flexibilities model fitted earlier is no
# longer reachable under that name.
topic_model=implement_bertopic(df_HHFKAs)
topic_model.get_topic_info()


2024-10-24 12:11:21,510 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

2024-10-24 12:11:23,360 - BERTopic - Embedding - Completed ✓
2024-10-24 12:11:23,360 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-24 12:11:25,963 - BERTopic - Dimensionality - Completed ✓
2024-10-24 12:11:25,964 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-24 12:11:25,996 - BERTopic - Cluster - Completed ✓
2024-10-24 12:11:25,998 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-24 12:11:26,198 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,88,-1_children_standards_nutrition_usda,"[children, standards, nutrition, usda, percent...",[On behalf of the American Public Health Assoc...
1,0,746,0_children_nutrition_need_lunch,"[children, nutrition, need, lunch, quality, he...",[I am writing to express my disappointment wit...
2,1,78,1_america_salud_review_insecure,"[america, salud, review, insecure, nutritional...","[For the health of all children, I urge the U..."
3,2,66,2_schools_promoting_flexibility_contradiction,"[schools, promoting, flexibility, contradictio...","[For the health of all children, I urge the US..."
4,3,47,3_nutrition_programs_lowfat_usda,"[nutrition, programs, lowfat, usda, percent, c...","[January 16, 2018School Programs BranchPolicy ..."
5,4,27,4_disparities_evidencebased_fns20200038_docket,"[disparities, evidencebased, fns20200038, dock...",[Ms. Namian:I am disappointed with USDA&#39;s ...
6,5,24,5_nutrition_evidencebased_usda_disparities,"[nutrition, evidencebased, usda, disparities, ...",[Ms. Namian:I am disappointed with USDA&#39;s ...
7,6,20,6_standards_million_meals_lowincome,"[standards, million, meals, lowincome, ensure,...",[The healthier school meal rules have been an ...
8,7,15,7_lowfat_flavored_children_usda,"[lowfat, flavored, children, usda, nutrition, ...",[We respectfully submit comments in response t...
9,8,10,8_obesity_schools_hhfka_limits,"[obesity, schools, hhfka, limits, tripled, 200...",[USDA Secretary Sonny Perdue moved to make sch...


In [13]:
# Merge topics 4 and 5 (ids as listed in the topic table displayed by the previous cell).
# NOTE: merge_topics changes the fitted model and the topic ids shift afterwards,
# so re-running this cell without refitting would merge a different pair of topics.
topic_model.merge_topics(df_HHFKAs["comment"], [4, 5])

# Per-document topic info placed side by side with the source columns.
topics_HHFKA = pd.concat(
    [topic_model.get_document_info(df_HHFKAs["comment"]), df_HHFKAs],
    axis=1,
)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,88,-1_children_standards_nutrition_usda,"[children, standards, nutrition, usda, percent...",[On behalf of the American Public Health Assoc...
1,0,746,0_children_nutrition_need_lunch,"[children, nutrition, need, lunch, quality, he...",[I am writing to express my disappointment wit...
2,1,78,1_latino_salud_review_insecure,"[latino, salud, review, insecure, nutritional,...","[For the health of all children, I urge the U..."
3,2,66,2_schools_promoting_flexibility_contradiction,"[schools, promoting, flexibility, contradictio...","[For the health of all children, I urge the US..."
4,3,51,3_meals_disparities_usda_ms,"[meals, disparities, usda, ms, sodium, docket,...",[Ms. Namian:I am extremely disappointed with U...
5,4,47,4_nutrition_flavored_usda_percent,"[nutrition, flavored, usda, percent, reduction...","[January 16, 2018School Programs BranchPolicy ..."
6,5,20,5_standards_million_meals_lowincome,"[standards, million, meals, lowincome, ensure,...",[The healthier school meal rules have been an ...
7,6,15,6_reduction_flavored_target_nutrition,"[reduction, flavored, target, nutrition, usda,...",[We respectfully submit this letter in respons...
8,7,10,7_obesity_hhfka_policies_tripled,"[obesity, hhfka, policies, tripled, kids, usda...",[USDA Secretary Sonny Perdue moved to make sch...


#### Saving data

In [14]:
# Write both per-document topic tables to data/interim_data, without the index column.
topics_flexibilities.to_csv(
    path_or_buf="data/interim_data/topics_flexibilities_v1.csv",
    index=False,
)
topics_HHFKA.to_csv(
    path_or_buf="data/interim_data/topics_HHFKA_v1.csv",
    index=False,
)