In [1]:
import os
import sys

import joblib
from concurrent.futures import ThreadPoolExecutor

sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "_0_Constants_and_Utils"))


from category_utils import normalise_text
from database_utils import (
    form_connection_params,
    get_dataframe_from_query,
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chekm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chekm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chekm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Flag passed to both form_connection_params and get_dataframe_from_query;
# presumably selects a local database instance — confirm in database_utils.
local = True
# NOTE(review): the meaning of the second positional argument (True) is defined
# in database_utils.form_connection_params and is not visible in this notebook.
connection_params = form_connection_params(local, True)

In [3]:
# Load the serialized classifier, text vectoriser and label encoder that
# predict_categories relies on. Paths are relative to the working directory.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# deserializing — only load artefacts that come from a trusted source.
SVM_MODEL = joblib.load("svm_model.joblib")
VECTORIZER = joblib.load("tfidf_vectorizer.joblib")
ENCODER = joblib.load("label_encoder.joblib")

def predict_categories(batch):
    """Predict a category label for every text in ``batch``.

    The texts are vectorised with the module-level ``VECTORIZER``, classified
    with ``SVM_MODEL``, and the predicted labels are mapped back to their
    original names with ``ENCODER``.

    Args:
        batch: Collection of text strings to classify (callers in this
            notebook pass a list slice).

    Returns:
        list: The decoded labels produced for ``batch``. An empty sized
        batch yields an empty list without touching the model.
    """
    # Nothing to classify: return early instead of handing a zero-row matrix
    # to the estimators (scikit-learn-style estimators generally reject
    # zero-sample input). Unsized iterables fall through unchanged.
    if hasattr(batch, "__len__") and len(batch) == 0:
        return []

    # The vectoriser output is densified before prediction.
    # NOTE(review): kept deliberately — whether the model accepts sparse input
    # depends on how it was fitted, which is not visible here.
    tweet_vectors = VECTORIZER.transform(batch).toarray()
    predicted_labels = SVM_MODEL.predict(tweet_vectors)
    categories = ENCODER.inverse_transform(predicted_labels)
    return categories.tolist()

def get_category(df, text_column, batch_size=128, max_workers=4):
    """Label every row of ``df`` with a predicted category.

    The values of ``df[text_column]`` are cut into consecutive slices of at
    most ``batch_size`` items; each slice is handed to ``predict_categories``
    on a thread pool with ``max_workers`` threads.

    Args:
        df: Frame holding the texts. It is modified in place: a ``"category"``
            column is assigned on this very object.
        text_column: Name of the column containing the texts to classify.
        batch_size: Maximum number of texts per prediction call.
        max_workers: Number of worker threads in the pool.

    Returns:
        The same ``df`` object that was passed in, now carrying ``"category"``.
    """
    documents = df[text_column].tolist()

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for start in range(0, len(documents), batch_size):
            chunk = documents[start:start + batch_size]
            pending.append(pool.submit(predict_categories, chunk))

        # Results are read back in submission order, so the flattened labels
        # line up with the row order of the input column.
        predicted = [label for job in pending for label in job.result()]

    df["category"] = predicted
    return df

In [4]:
# Select tweets whose text contains '@Lufthansa' or '#Lufthansa', whose tweet_id
# does not appear in the Conversations table, and that were not written by
# user 124476322.
# NOTE(review): 124476322 is presumably the airline's own account ID — confirm
# and consider naming it as a constant.
# NOTE(review): `NOT IN (subquery)` matches no rows at all if the subquery can
# return NULL — verify that Conversations.tweet_id is never NULL.
conversations_query = """
SELECT *
FROM Tweets
WHERE (full_text LIKE '%@Lufthansa%' OR full_text LIKE '%#Lufthansa%')
  AND tweet_id NOT IN (SELECT tweet_id FROM Conversations)
  AND user_id != 124476322;
"""

# Run the query; tweet_id is requested as the index of the resulting frame.
df_no_conversation = get_dataframe_from_query(conversations_query, connection_params, local, index_col="tweet_id")

In [5]:
# Replace the raw tweet text with its normalised form. The column is overwritten
# in place, so re-running this cell applies normalise_text to text that has
# already been normalised.
# NOTE(review): confirm normalise_text is idempotent, or re-run the query cell first.
df_no_conversation["full_text"] = df_no_conversation["full_text"].apply(normalise_text)

In [6]:
# Classify the normalised texts in batches of 256 using 10 worker threads.
# get_category assigns the "category" column on df_no_conversation itself and
# returns that same object, so both names refer to one frame afterwards.
df_with_categories = get_category(df_no_conversation, "full_text", 256, 10)

In [7]:
# Drop the tweet text and the quote/reply reference columns before exporting.
to_save = df_with_categories.drop(columns=["full_text", "quoted_status_id", "replied_tweet_id"])

# Write the same CSV next to this notebook and into two sibling project folders.
# The relative paths are assembled with os.path.join so the separator matches
# the host OS; a hard-coded backslash is treated as part of the file name on
# POSIX systems and would create a oddly named file in the working directory.
to_save.to_csv("no_conversation.csv")
to_save.to_csv(os.path.join(os.pardir, "_8_PerformanceEvaluation", "no_conversation.csv"))
to_save.to_csv(os.path.join(os.pardir, "_9_Demo", "no_conversation.csv"))