In [18]:
import sqlite3
import pandas as pd
import re


def get_tweets_with_category_sqlite(db_path):
    """Load categorised tweets from a SQLite database into a DataFrame.

    Joins ``ConversationsCategory`` to ``Conversations`` and ``Tweets`` and keeps
    only the rows where ``Conversations.tweet_order = 1`` (presumably the first
    tweet of each conversation -- confirm against the schema).

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        pandas.DataFrame with the columns ``full_text``, ``category`` and
        ``confidence``, one row per matching conversation.

    Raises:
        Whatever ``pd.read_sql_query`` raises when the query cannot be executed
        (for example when one of the tables is missing). The connection is
        closed before the error propagates.
    """
    query = """
    SELECT
        t.full_text,
        cc.category,
        cc.confidence
    FROM
        ConversationsCategory cc
    INNER JOIN
        Conversations c ON c.conversation_id = cc.conversation_id
    INNER JOIN
        Tweets t ON c.tweet_id = t.tweet_id
    WHERE 
        c.tweet_order = 1
    """

    connection = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, connection)
    finally:
        # Always release the database handle, even when the query fails;
        # otherwise a failed cell leaves the file open in the running kernel.
        connection.close()

# Load the categorised tweets from the local backup database and preview the first rows.
db_path = "./data_processed/local_backup_correct.db"
df_tweets = get_tweets_with_category_sqlite(db_path=db_path)
df_tweets.head(n=5)


Unnamed: 0,full_text,category,confidence
0,@nealrach @VirginAtlantic Siiiigh.... Still no...,technical difficulties,0.111812
1,@VirginAtlantic Sod off your primary sharehold...,booking problems,0.145168
2,@easyJet Please reply to my DM!,booking problems,0.152093
3,We’re waiving change fees for customers who ha...,baggage issues,0.184137
4,@katiewithani Please be assured if your flight...,flight delays and cancellations,0.19386


In [19]:
# Show the whole frame; the rendered output is truncated to the leading and trailing rows.
df_tweets

Unnamed: 0,full_text,category,confidence
0,@nealrach @VirginAtlantic Siiiigh.... Still no...,technical difficulties,0.111812
1,@VirginAtlantic Sod off your primary sharehold...,booking problems,0.145168
2,@easyJet Please reply to my DM!,booking problems,0.152093
3,We’re waiving change fees for customers who ha...,baggage issues,0.184137
4,@katiewithani Please be assured if your flight...,flight delays and cancellations,0.193860
...,...,...,...
493691,@airfrance j'ai mis une bombe dans un a avion.,lost luggage,0.156407
493692,@Ryanair What if I make it into a Turban then?,booking problems,0.150723
493693,@AmericanAir Please help me!! I've fallen on ...,check-in troubles,0.148291
493694,@AmericanAir i was kidding thanks for the foll...,check-in troubles,0.122705


In [20]:
def clean_mentions(text):
    """Strip retweet headers, @mentions and URLs from a tweet.

    Three substitutions are applied in order, then surrounding whitespace is
    trimmed:

    1. A leading ``RT`` marker, together with the ``@handle:`` that follows it
       in a standard retweet header (``RT @user: ...``).
    2. Every remaining ``@handle`` (letters, digits and underscores).
    3. Every ``http://``, ``https://`` or ``www.`` URL up to the next whitespace.

    Whitespace left behind in the middle of the text is not collapsed.

    Args:
        text: The raw tweet text (``str``).

    Returns:
        The cleaned text as a ``str``.

    Raises:
        TypeError: If ``text`` is not a string (e.g. ``None`` or ``NaN``).
    """
    # The retweet header must go first and as a unit: if the generic mention
    # pass ran before it, "RT @user: hi" would become "RT : hi" and the final
    # result would keep a stray ": " in front of the message.
    rt_pattern = r"^RT\s+(?:@[A-Za-z0-9_]+:?\s*)?"
    mention_pattern = r"@[A-Za-z0-9_]+"
    url_pattern = r"https?://\S+|www\.\S+"

    text = re.sub(rt_pattern, "", text)
    text = re.sub(mention_pattern, "", text)
    text = re.sub(url_pattern, "", text)

    return text.strip()

# Add a cleaned copy of every tweet's text next to the raw full_text column.
df_tweets["cleaned_text"] = df_tweets["full_text"].map(clean_mentions)

In [21]:
# Show the frame again to compare the new cleaned_text column with full_text.
df_tweets

Unnamed: 0,full_text,category,confidence,cleaned_text
0,@nealrach @VirginAtlantic Siiiigh.... Still no...,technical difficulties,0.111812,Siiiigh.... Still no idea when they're back as...
1,@VirginAtlantic Sod off your primary sharehold...,booking problems,0.145168,Sod off your primary shareholder SUED the NHS.
2,@easyJet Please reply to my DM!,booking problems,0.152093,Please reply to my DM!
3,We’re waiving change fees for customers who ha...,baggage issues,0.184137,We’re waiving change fees for customers who ha...
4,@katiewithani Please be assured if your flight...,flight delays and cancellations,0.193860,"Please be assured if your flight is cancelled,..."
...,...,...,...,...
493691,@airfrance j'ai mis une bombe dans un a avion.,lost luggage,0.156407,j'ai mis une bombe dans un a avion.
493692,@Ryanair What if I make it into a Turban then?,booking problems,0.150723,What if I make it into a Turban then?
493693,@AmericanAir Please help me!! I've fallen on ...,check-in troubles,0.148291,Please help me!! I've fallen on one of your p...
493694,@AmericanAir i was kidding thanks for the foll...,check-in troubles,0.122705,i was kidding thanks for the follow tho


In [25]:
def sample_categories(df, sample_size=100, random_state=None):
    """Draw at most ``sample_size`` rows from each category of ``df``.

    Categories are processed in order of first appearance in ``df['category']``.
    A category with at least ``sample_size`` rows is randomly down-sampled to
    exactly ``sample_size`` rows; a smaller category is kept whole, in its
    original row order.

    Args:
        df: DataFrame with a ``category`` column.
        sample_size: Maximum number of rows to keep per category.
        random_state: Forwarded to ``DataFrame.sample`` for every category.
            Pass a value (e.g. an int) to make the draw reproducible; the
            default ``None`` keeps the draw unseeded. Note that an int seed is
            reused for each category's draw.

    Returns:
        A new DataFrame with the sampled rows of all categories concatenated
        and a fresh 0..n-1 index. If ``df`` contains no categories, an empty
        DataFrame with the same columns is returned.

    Raises:
        KeyError: If ``df`` has no ``category`` column.
    """
    sampled_df_list = []

    for category in df['category'].unique():
        category_df = df[df['category'] == category]
        if len(category_df) >= sample_size:
            sampled_df = category_df.sample(n=sample_size, random_state=random_state)
        else:
            # Not enough rows to sample from: keep every available row.
            sampled_df = category_df
        sampled_df_list.append(sampled_df)

    if not sampled_df_list:
        # pd.concat raises ValueError on an empty list, which would crash the
        # notebook whenever an upstream filter removes every row.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_df_list, ignore_index=True)

# Keep only tweets whose category confidence exceeds 0.2, then take up to 50 per category.
# NOTE: no seed is passed here, so re-running this cell may draw a different sample.
df_sampled_tweets = sample_categories(
    df_tweets.query("confidence > 0.2"),
    sample_size=50,
)

In [27]:
# Write only the cleaned text and its category to a spreadsheet for manual labelling.
(
    df_sampled_tweets
    .loc[:, ["cleaned_text", "category"]]
    .to_excel("balanced_for_labelling.xlsx", index=False)
)