# Imports

In [1]:
import os
import sys

import pandas as pd
from tqdm.notebook import tqdm
from typing import List, Tuple

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from concurrent.futures import ThreadPoolExecutor


sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "_0_Constants_and_Utils"))


from database_utils import get_dataframe_from_query, form_connection_params, connect_to_database, execute_queries
from category_utils import clean_mentions, get_batches, convert_to_list


# Constants

In [2]:
# Selects, for every conversation whose ConversationsCategory.category is still
# 'No Category', the conversation_id and the full_text of its first tweet
# (tweet_order = 1). The result is the set of texts classified further below.
QUERY_TEXT = """
SELECT 
    c.conversation_id, 
    t.full_text
FROM 
    Conversations AS c
JOIN 
    Tweets AS t 
ON 
    c.tweet_id = t.tweet_id
JOIN 
    ConversationsCategory AS cc
ON 
    c.conversation_id = cc.conversation_id
WHERE 
    c.tweet_order = 1
AND 
    cc.category = 'No Category';
"""

# Loading

In [3]:
# Keep local = False to query the online MySQL database.
local = False

# Build the project's connection settings, then override the account and the
# schema so that both point at "nezox2um_test".
# NOTE(review): the meaning of the second positional argument (True) is defined
# in database_utils.form_connection_params — confirm there.
connection_params = form_connection_params(local, True)
connection_params.update(
    {"user": "nezox2um_test", "database": "nezox2um_test"}
)

In [4]:
# Run QUERY_TEXT and keep the result as a frame indexed by conversation_id.
test_data = get_dataframe_from_query(
    QUERY_TEXT,
    connection_params,
    local,
    index_col="conversation_id",
)
test_data

Unnamed: 0_level_0,full_text
conversation_id,Unnamed: 1_level_1
1,"Thailand, Nepal, Japan… Laat je verwonderen do..."
2,"Thailand, Nepal, Japan… Laat je verwonderen do..."
3,"Thailand, Nepal, Japan… Laat je verwonderen do..."
4,"Thailand, Nepal, Japan… Laat je verwonderen do..."
5,"Thailand, Nepal, Japan… Laat je verwonderen do..."
...,...
493694,@airfrance j'ai mis une bombe dans un a avion.
493695,@Ryanair What if I make it into a Turban then?
493696,@AmericanAir Please help me!! I've fallen on ...
493697,@AmericanAir i was kidding thanks for the foll...


In [5]:
# Run the project's clean_mentions helper over every tweet and store the result
# next to the raw text. Judging by the rows displayed below it removes the
# leading @handle — see category_utils.clean_mentions for the exact rule.
test_data['cleaned_text'] = test_data['full_text'].apply(clean_mentions)

In [6]:
# Show full_text and cleaned_text side by side to eyeball the cleaning step.
test_data

Unnamed: 0_level_0,full_text,cleaned_text
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Thailand, Nepal, Japan… Laat je verwonderen do...","Thailand, Nepal, Japan… Laat je verwonderen do..."
2,"Thailand, Nepal, Japan… Laat je verwonderen do...","Thailand, Nepal, Japan… Laat je verwonderen do..."
3,"Thailand, Nepal, Japan… Laat je verwonderen do...","Thailand, Nepal, Japan… Laat je verwonderen do..."
4,"Thailand, Nepal, Japan… Laat je verwonderen do...","Thailand, Nepal, Japan… Laat je verwonderen do..."
5,"Thailand, Nepal, Japan… Laat je verwonderen do...","Thailand, Nepal, Japan… Laat je verwonderen do..."
...,...,...
493694,@airfrance j'ai mis une bombe dans un a avion.,j'ai mis une bombe dans un a avion.
493695,@Ryanair What if I make it into a Turban then?,What if I make it into a Turban then?
493696,@AmericanAir Please help me!! I've fallen on ...,Please help me!! I've fallen on one of your p...
493697,@AmericanAir i was kidding thanks for the foll...,i was kidding thanks for the follow tho


# Categorization

### Prepare the data for training and testing

In [7]:


# Step 1: Load the labelled data
# The first 200 rows are left out here; a later cell scores the model on them.
# .copy() detaches the slice from the frame returned by read_excel, so the
# column assignment in step 2 does not write into a view of it (this is what
# triggers pandas' SettingWithCopyWarning).
data = pd.read_excel('translated_and_categorized_cleaned.xlsx')
data = data[200:].copy()

# Step 2: Preprocess the text data (lower-casing only; predict_category applies
# the same lower-casing to the tweets it classifies)
data['text'] = data['text'].str.lower()

# Step 3: Convert text data to numerical format using TF-IDF
# NOTE(review): the vectorizer is fitted before the split, so its vocabulary and
# IDF weights also see the rows that end up in X_test; the report printed below
# may be slightly optimistic because of that.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# Dense features: the SVC is trained on dense input, and predict_category feeds
# it dense rows as well.
X = tfidf_vectorizer.fit_transform(data['text']).toarray()
y = data['Category']

# Step 4: Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Encode the labels
# The encoder is fitted on ALL labels rather than on y_train only. Several
# categories have just one or two examples; if such a category lands only in
# the test split, an encoder fitted on y_train raises ValueError (unseen label)
# in transform(y_test). When every category is present in y_train the resulting
# encoding is identical to the previous one.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Step 6: Create and train the SVM model with a linear kernel
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train_encoded)

# Step 7: Evaluate the model performance on the test set
y_pred = svm_model.predict(X_test)

# Report one row per class known to the encoder. zero_division=0 keeps the
# reported value (0.00) for classes that are never predicted or never occur in
# the test split, but states it explicitly instead of emitting one
# UndefinedMetricWarning per affected metric.
unique_classes = label_encoder.classes_
print(
    classification_report(
        y_test_encoded,
        y_pred,
        labels=range(len(unique_classes)),
        target_names=unique_classes,
        zero_division=0,
    )
)

def predict_category(tweet, vectorizer, model, label_encoder):
    """
    Classify a single tweet and return its category label.

    NOTE: the next notebook cell defines this function again with the same
    body; that later definition is the one in effect after a full run.

    Args:
        tweet: Text to classify; it is lower-cased before vectorisation.
        vectorizer: Fitted object whose ``transform`` accepts a list of texts
            and returns something with ``toarray()``.
        model: Fitted classifier exposing ``predict``.
        label_encoder: Encoder whose ``inverse_transform`` maps the model's
            predictions back to category labels.

    Returns:
        The decoded label for ``tweet``.
    """
    # The pipeline works on batches, so the tweet is wrapped in a 1-item list.
    features = vectorizer.transform([tweet.lower()]).toarray()
    encoded_prediction = model.predict(features)
    return label_encoder.inverse_transform(encoded_prediction)[0]


                                 precision    recall  f1-score   support

                 baggage issues       1.00      0.71      0.83        14
               booking problems       0.94      0.74      0.83        42
              check-in troubles       0.00      0.00      0.00         2
    customer service complaints       0.87      0.77      0.82        26
flight delays and cancellations       0.96      0.84      0.90        51
    flight information requests       1.00      0.52      0.69        21
   food and beverage complaints       1.00      0.50      0.67         2
           in-flight experience       0.73      0.98      0.84       149
                   lost luggage       0.62      0.45      0.53        11
     promotion and offer issues       1.00      0.80      0.89         5
              refund complaints       1.00      0.67      0.80        12
   safety and security concerns       0.00      0.00      0.00         1
seating and boarding challenges       0.98      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
def predict_category(tweet, vectorizer, model, label_encoder):
    """
    Classify a single tweet and return its category label.

    NOTE: this repeats the definition at the end of the training cell above
    and replaces it once this cell has run.

    Args:
        tweet: Text to classify; it is lower-cased before vectorisation.
        vectorizer: Fitted object whose ``transform`` accepts a list of texts
            and returns something with ``toarray()``.
        model: Fitted classifier exposing ``predict``.
        label_encoder: Encoder whose ``inverse_transform`` maps the model's
            predictions back to category labels.

    Returns:
        The decoded label for ``tweet``.
    """
    # The pipeline works on batches, so the tweet is wrapped in a 1-item list.
    features = vectorizer.transform([tweet.lower()]).toarray()
    encoded_prediction = model.predict(features)
    return label_encoder.inverse_transform(encoded_prediction)[0]

In [9]:
# Score the classifier on the first 200 rows of the spreadsheet, i.e. the rows
# the training cell skipped with data[200:].
df = pd.read_excel("translated_and_categorized_cleaned.xlsx")
test = df.iloc[:200].copy()

# One prediction per row, stored next to the hand-assigned Category.
test['our_guess'] = [
    predict_category(text, tfidf_vectorizer, svm_model, label_encoder)
    for text in test['text']
]

# Share of rows (in percent) where the prediction equals the label.
accuracy = test['Category'].eq(test['our_guess']).mean() * 100

print(f"Accuracy after fine-tuning: {accuracy:.2f}%")
test

Accuracy after fine-tuning: 95.29%


Unnamed: 0,text,translation,Category,our_guess
0,"Al hilo de la demostración de este fenómeno, m...","While demonstrating this phenomenon, a Ryanair...",in-flight experience,in-flight experience
1,@RachelGarvey1 Hi Rachel. As that price isn't ...,@RachelGarvey1 Hi Rachel. As that price isn't ...,booking problems,booking problems
2,RT @ameyaw112: Thank you @SingaporeAir for sho...,RT @ameyaw112: Thank you @SingaporeAir for sho...,in-flight experience,in-flight experience
3,RT @spotter_info: EJU35QV OE-IZC A320 easyJet...,RT @spotter_info: EJU35QV OE-IZC A320 easyJet...,flight information requests,flight information requests
4,RT @spotter_info: EZY71DN G-EZWD A320 easyJet...,RT @spotter_info: EZY71DN G-EZWD A320 easyJet...,flight information requests,flight information requests
...,...,...,...,...
1948,"@gillicious Hi Gillian, we're happy to hear th...","@gillicious Hi Gillian, we're happy to hear th...",booking problems,booking problems
1949,So I’m sick n tired of lazy ass @easyJet cabin...,So I’m sick n tired of lazy ass @easyJet cabin...,in-flight experience,in-flight experience
1950,AirFrance embarque les passagers par zone pour...,AirFrance boards passengers by zone to speed u...,seating and boarding challenges,seating and boarding challenges
1951,RT @DonnaKinna: So I’m sick n tired of lazy as...,RT @DonnaKinna: So I’m sick n tired of lazy as...,in-flight experience,in-flight experience


In [10]:
def process_batch(batch):
    """
    Predict the category of every text in ``batch`` with a single model call.

    The texts are lower-cased, vectorised together by the notebook-level
    ``tfidf_vectorizer``, classified by ``svm_model`` in one ``predict`` call
    and decoded by ``label_encoder``. This yields the same labels as running
    the pipeline once per text, without paying the per-call overhead for
    every single tweet.

    Args:
        batch: Iterable of text strings.

    Returns:
        List with one category label per input text, in input order.
    """
    texts = [item.lower() for item in batch]
    if not texts:
        # Nothing to classify: avoid handing an empty array to the
        # vectorizer and the model.
        return []

    # Dense rows, matching the dense features the model was trained on.
    features = tfidf_vectorizer.transform(texts).toarray()
    encoded = svm_model.predict(features)
    return list(label_encoder.inverse_transform(encoded))

def get_category(df, text_column, batch_size=128, max_workers=4):
    """
    Label every row of ``df`` by classifying ``df[text_column]`` in chunks.

    The texts are cut into consecutive chunks of ``batch_size`` items, each
    chunk is submitted to ``process_batch`` on a pool of ``max_workers``
    threads, and the per-chunk results are joined in submission order.

    Note: the labels are written into ``df`` itself as a new ``"category"``
    column, and that same frame is returned.

    Args:
        df: Frame holding the texts.
        text_column: Name of the column with the texts to classify.
        batch_size: Number of texts per submitted chunk.
        max_workers: Size of the thread pool.

    Returns:
        ``df`` with the added ``"category"`` column.
    """
    all_texts = df[text_column].tolist()
    chunks = [
        all_texts[start:start + batch_size]
        for start in range(0, len(all_texts), batch_size)
    ]

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(process_batch, chunk) for chunk in chunks]
        # Reading the results in submission order keeps labels aligned with rows.
        predicted = [label for job in pending for label in job.result()]

    df["category"] = predicted
    return df


def update_categories(
    batch: List[Tuple[str, str]], connection_params: dict, local: bool
) -> None:
    """
    Write a batch of categories to the ConversationsCategory table.

    Args:
        batch: List of (category, conversation_id) pairs. Each pair is expected
            to supply, in that order, the values for the two placeholders of
            the UPDATE statement.
        connection_params: Connection settings handed to ``connect_to_database``.
        local: Forwarded to ``connect_to_database``. When True the statement
            keeps its ``?`` placeholders; when False they are rewritten to
            ``%s``.
    """
    update_query = "UPDATE ConversationsCategory SET category = ? WHERE conversation_id = ?"
    if not local:
        # NOTE(review): presumably the driver used for the online MySQL
        # database expects %s-style placeholders — confirm in database_utils.
        update_query = update_query.replace("?", "%s")
    with connect_to_database(connection_params, local) as connection:
        # One (statement, parameter rows) entry; how the rows are executed is
        # up to database_utils.execute_queries.
        execute_queries(connection, [(update_query, batch)])

In [13]:
# Split the cleaned texts into batches for the update loop that follows.
# NOTE(review): 1_000 is presumably the number of rows per batch — confirm
# against category_utils.get_batches.
data_batches = get_batches(test_data[["cleaned_text"]], 1_000)

In [14]:
# For each batch: predict a category per text, then write the results back to
# the database before moving on to the next batch.
for batch in tqdm(data_batches, desc="Updating categories: "):
    df_categories = get_category(
        batch,
        text_column="cleaned_text",
        batch_size=256,
        max_workers=10,
    )
    update_categories(convert_to_list(df_categories), connection_params, local)

Updating categories:   0%|          | 0/491 [00:00<?, ?it/s]

KeyboardInterrupt: 