In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
processed_data_path = '../data/processed/'
file_format_tmt = processed_data_path + "{split}_r3_{target}_top_mentioned_timelines_processed.csv"
file_format_users = processed_data_path + 'r3_{target}_{split}_users_processed.csv' 

In [3]:
target_list = [
    'ig',
    'bo', 
    'cl', 
    'co', 
    'gl', 
    'lu'
]

In [4]:
# ---------------------------------------------------
# 1. Define or load your Brazilian Portuguese stopwords
# ---------------------------------------------------
# Option A: Use nltk's 'portuguese' stopwords (they are not perfect for Brazilian Portuguese,
#           but it's a common starting point). You could also create your own custom list.

# Downloaded 'portuguese' stopwords
portuguese_stopwords = stopwords.words('portuguese')

# ---------------------------------------------------
# 2. Vectorize using TF-IDF
# ---------------------------------------------------

tfidf = TfidfVectorizer(
    stop_words=portuguese_stopwords,  # remove Brazilian Portuguese stopwords
    lowercase=True
)

In [5]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from joblib import Parallel, delayed

# ---------------------------------------------------
# 1. Download and define Brazilian Portuguese stopwords
# ---------------------------------------------------
nltk.download('stopwords')
portuguese_stopwords = stopwords.words('portuguese')

# ---------------------------------------------------
# 2. Define your paths and targets
# ---------------------------------------------------
processed_data_path = '../data/processed/'
file_format_tmt = processed_data_path + "{split}_r3_{target}_top_mentioned_timelines_processed.csv"
file_format_users = processed_data_path + 'r3_{target}_{split}_users_processed.csv'

target_list = ['ig', 'bo', 'cl', 'co', 'gl', 'lu']

# ---------------------------------------------------
# 3. Utility function to load CSV data
# ---------------------------------------------------
def load_data(file_path, index_col='User_ID'):
    """
    Loads CSV data into a pandas DataFrame, drops the 'Unnamed: 0' column,
    and returns the DataFrame.
    """
    df = pd.read_csv(
        file_path,
        sep=';',
        encoding='utf-8-sig',
        index_col=index_col
    ).drop('Unnamed: 0', axis=1)
    return df

# ---------------------------------------------------
# 4. Function to perform feature selection with different score_funcs
# ---------------------------------------------------
def get_top_features(df, text_col, label_col, k=20, score_func=chi2):
    """
    Fits a TF-IDF vectorizer, applies a SelectKBest feature selection with the
    given score_func, and returns the top k selected features.
    """
    tfidf = TfidfVectorizer(
        stop_words=portuguese_stopwords,
        lowercase=True
    )
    X = tfidf.fit_transform(df[text_col])
    y = df[label_col]

    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit_transform(X, y)

    feature_names = tfidf.get_feature_names_out()
    selected_mask = selector.get_support()
    selected_features = [
        feature for feature, is_selected
        in zip(feature_names, selected_mask)
        if is_selected
    ]
    return selected_features

# ---------------------------------------------------
# 5. Compare multiple score_funcs side by side
# ---------------------------------------------------
def compare_score_funcs(df, text_col, label_col, k=20, score_funcs=None):
    """
    Given a DataFrame and a list/dict of score_funcs, 
    returns a DataFrame comparing top k features side by side.
    """
    if score_funcs is None:
        # Default to some commonly used score_funcs
        score_funcs = {
            "chi2": chi2,
            "f_classif": f_classif,
            "mutual_info": mutual_info_classif
        }

    # For each score_func, get the top k features
    selected_features_by_score_func = {}
    for name, func in score_funcs.items():
        feats = get_top_features(
            df=df, 
            text_col=text_col, 
            label_col=label_col, 
            k=k, 
            score_func=func
        )
        selected_features_by_score_func[name] = feats

    # Build a comparison DataFrame: rank vs. features for each score_func
    comparison_df = pd.DataFrame({"Rank": range(1, k+1)})
    for score_func_name, features_list in selected_features_by_score_func.items():
        comparison_df[score_func_name] = features_list

    return comparison_df

# ---------------------------------------------------
# 6. Process one target, returning comparison DataFrames
# ---------------------------------------------------
def process_target(target, score_funcs=None):
    """
    Loads data for a single target, gets comparison dataframes for 
    TMT, Timeline, and Stance, and returns them in a dictionary.
    """
    # --- TMT data ---
    df_tmt = load_data(file_format_tmt.format(target=target, split="train"))
    tmt_compare_df = compare_score_funcs(
        df=df_tmt, text_col='Texts', label_col='Polarity', 
        k=20, score_funcs=score_funcs
    )

    # --- USERS data (Timeline) ---
    df_users = load_data(file_format_users.format(target=target, split="train"))
    timeline_compare_df = compare_score_funcs(
        df=df_users, text_col='Timeline', label_col='Polarity', 
        k=20, score_funcs=score_funcs
    )

    # --- USERS data (Stance) ---
    stance_compare_df = compare_score_funcs(
        df=df_users, text_col='Stance', label_col='Polarity', 
        k=20, score_funcs=score_funcs
    )

    results_dict = {
        "TMT": tmt_compare_df,
        "Timeline": timeline_compare_df,
        "Stance": stance_compare_df
    }
    return target, results_dict

# ---------------------------------------------------
# 7. Main function using parallelization
# ---------------------------------------------------
def main():
    # Define the score_funcs you'd like to compare
    score_funcs = {
        "chi2": chi2,
        "f_classif": f_classif,
        # "mutual_info": mutual_info_classif
    }

    outputs = Parallel(n_jobs=-1, backend="loky")(
        delayed(process_target)(target, score_funcs=score_funcs)
        for target in target_list
    )

    # Print or process results
    for target, compare_dfs in outputs:
        print(f"\n===============================")
        print(f" TARGET: {target}")
        print(f"===============================")

        for data_name, df_compare in compare_dfs.items():
            print(f"\n--- {data_name} ---\n")
            print(df_compare.to_string(index=False))


if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



 TARGET: ig

--- TMT ---

 Rank       chi2  f_classif
    1  bolsonaro       anos
    2     brasil       açaí
    3         cm     brasil
    4     contra brasileira
    5      covid        cmg
    6        cpi     contra
    7         cr    durante
    8        crl        kkk
    9    governo    maiores
   10        kkk       meio
   11       lula   pandemia
   12    milhões      parte
   13   ministro      praia
   14       oque  primeiros
   15   pandemia    público
   16   pazuello      sobre
   17 presidente        the
   18      sobre        vim
   19         sp        wpp
   20        stf    últimos

--- Timeline ---

 Rank       chi2 f_classif
    1       amor      além
    2         aq      amor
    3  bolsonaro      anos
    4     brasil     antes
    5         cm      deve
    6         cr  enquanto
    7   esquerda     feito
    8    governo   fizeram
    9        grt    imagem
   10        kkk       kkk
   11       lula      meio
   12         mn     menos
   13       paí