# Imports

In [2]:
import os
import sqlite3
import pandas as pd
import tensorflow as tf
from transformers import pipeline, XLMRobertaTokenizer, AutoTokenizer, TFAutoModelForSequenceClassification
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from collections import defaultdict
from typing import List, Tuple
from scipy.special import softmax
from sqlalchemy import create_engine
import re
import gc

2024-06-10 08:07:54.942564: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-10 08:07:54.942688: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-10 08:07:55.090637: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Constants

In [3]:
# USER, DATABASE = "nezox2um_test", "nezox2um_test"
QUERY_ALL = """
SELECT T.tweet_id, T.full_text
FROM Tweets T
INNER JOIN Conversations C
ON T.tweet_id = C.first_tweet_id
WHERE C.category IS NULL;
"""
#QUERY_ALL = """
#SELECT c.conversation_id, c.tweet_id, t.full_text
#FROM Conversations c
#INNER JOIN Tweets t ON c.tweet_id = t.tweet_id
#INNER JOIN ConversationsCategory cc ON c.conversation_id = cc.conversation_id
#WHERE c.tweet_order = 1 AND cc.category = 'No category';
#"""

DTYPES = {
"tweet_id": "object",
"full_text": "object",
}

COMPANY_NAME_TO_ID = {
    "Klm": "56377143",
    "Air France": "106062176",
    "British Airways": "18332190",
    "American Air": "22536055",
    "Lufthansa": "124476322",
    "Air Berlin": "26223583",
    "Air Berlin assist": "2182373406",
    "easyJet": "38676903",
    "Ryanair": "1542862735",
    "Singapore Airlines": "253340062",
    "Qantas": "218730857",
    "Etihad Airways": "45621423",
    "Virgin Atlantic": "20626359",
}

COMPANY_ID_TO_NAME = {v: k for k, v in COMPANY_NAME_TO_ID.items()}

# Helper functions

In [3]:
def get_local_data(query: str, path: str) -> pd.DataFrame:
    # Connect to the SQLite database using a context manager
    with sqlite3.connect(path) as connection:
        return pd.read_sql_query(query, connection,
                                   dtype=DTYPES,
                                   index_col='tweet_id')


# Loading

In [None]:
!cp "/kaggle/input/pre-cat/local_backup.db" "/kaggle/working/"

In [4]:
path =  "/kaggle/working/local_backup.db"

test_data = get_local_data(QUERY_ALL, path)

In [5]:
test_data

Unnamed: 0_level_0,full_text
tweet_id,Unnamed: 1_level_1
1244693824519184392,We in #Houston said goodbye to @KLM’s #QueenOf...
1244550514970329088,@Grenzmauer75 @elliotday @easyJet Exactly. Do ...
1243532085131743232,@FlySWISS @British_Airways @qatarairways Thank...
1244683000195022855,@RosamariaP3 Hola Rosa ✌ Siento que aún no hay...
1244691623289724928,@xdarc79 @hoetschenreuter @flavioArCab @Chapux...
...,...
491999025974218752,@Ryanair What if I make it into a Turban then?
452656989685178368,@AmericanAir Please help me!! I've fallen on ...
451124070730719233,@AmericanAir i was kidding thanks for the foll...
430790355962052608,"@AmericanAir phew, they finally turned on the ..."


In [6]:
# Airlines list
airlines = [
    'KLM', "British_Airways", "airfrance", "AmericanAir", "lufthansa", 
    "airberlinAssist", "easyJet", "Ryanair", "SingaporeAir", "Qantas",
    "EtihadAirways", "VirginAtlantic", "airberlin"
]

# Function to clean mentions not in the airlines list
def clean_mentions(text):
    # Regex pattern to find mentions
    mention_pattern = r'@([A-Za-z0-9_]+)'
    # Regex pattern to find URLs
    url_pattern = r'https?://\S+|www\.\S+'
    rt_pattern = r'^RT\s+'
    
    # Substitute mentions with an empty string
    text = re.sub(mention_pattern, '', text)
    # Substitute URLs with an empty string
    text = re.sub(url_pattern, '', text)
    text = re.sub(rt_pattern, "", text)
    
    return text.strip()

# Apply the cleaning function to the DataFrame
test_data['cleaned_text'] = test_data['full_text'].apply(clean_mentions)

In [7]:
test_data

Unnamed: 0_level_0,full_text,cleaned_text
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1244693824519184392,We in #Houston said goodbye to @KLM’s #QueenOf...,We in #Houston said goodbye to ’s #QueenOfTheS...
1244550514970329088,@Grenzmauer75 @elliotday @easyJet Exactly. Do ...,"Exactly. Do they plan to go bankrupt, pay thei..."
1243532085131743232,@FlySWISS @British_Airways @qatarairways Thank...,Thanks all for still flying. We are on our way...
1244683000195022855,@RosamariaP3 Hola Rosa ✌ Siento que aún no hay...,Hola Rosa ✌ Siento que aún no hayas recibido e...
1244691623289724928,@xdarc79 @hoetschenreuter @flavioArCab @Chapux...,Tal cual y mucho de ellos ya en plena pandemia...
...,...,...
491999025974218752,@Ryanair What if I make it into a Turban then?,What if I make it into a Turban then?
452656989685178368,@AmericanAir Please help me!! I've fallen on ...,Please help me!! I've fallen on one of your p...
451124070730719233,@AmericanAir i was kidding thanks for the foll...,i was kidding thanks for the follow tho
430790355962052608,"@AmericanAir phew, they finally turned on the ...","phew, they finally turned on the air... Wait, ..."


# Categorization

### Prepare the data for training and testing

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Step 1: Load the data
data = pd.read_csv('/kaggle/input/data-with-labels/translated_and_categorized_cleaned - Sheet1.csv')
data = data[150:]
# Step 2: Preprocess the text data
data['text'] = data['text'].str.lower()

# Step 3: Convert text data to numerical format using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['text']).toarray()
y = data['Category']

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Step 6: Create and train the SVM model with a linear kernel
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train_encoded)

# Step 7: Evaluate the model performance on the test set
y_pred = svm_model.predict(X_test)

# Ensuring the target names match the unique classes in the training set
unique_classes = label_encoder.classes_
print(classification_report(y_test_encoded, y_pred, labels=range(len(unique_classes)), target_names=unique_classes))

def predict_category(tweet, vectorizer, model, label_encoder):
    tweet_processed = tweet.lower()
    tweet_vectorized = vectorizer.transform([tweet_processed]).toarray()
    predicted_label = model.predict(tweet_vectorized)
    category = label_encoder.inverse_transform(predicted_label)
    return category[0]


                                 precision    recall  f1-score   support

                 baggage issues       1.00      0.25      0.40         4
               booking problems       0.67      0.20      0.31        10
              check-in troubles       0.00      0.00      0.00         1
    customer service complaints       1.00      0.20      0.33         5
flight delays and cancellations       1.00      0.71      0.83         7
    flight information requests       1.00      0.20      0.33         5
   food and beverage complaints       0.00      0.00      0.00         0
           in-flight experience       0.55      0.98      0.71        47
                   lost luggage       1.00      0.25      0.40         4
     promotion and offer issues       1.00      0.57      0.73         7
              refund complaints       1.00      0.67      0.80         3
   safety and security concerns       0.00      0.00      0.00         0
seating and boarding challenges       0.75      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
df = pd.read_csv("/kaggle/input/data-with-labels/translated_and_categorized_cleaned - Sheet1.csv")
test = df.head(100)
test['our_guess'] = test['text'].apply(lambda x: predict_category(x, tfidf_vectorizer, svm_model, label_encoder))
accuracy = (test['Category'] == test['our_guess']).mean()*100


print(test)
print(f"Accuracy after fine-tuning: {accuracy:.2f}%")

                                                 text  \
0   RT @sandeeprrao1991: BREAKING:-\nKLM to fly 3x...   
1   Thanks @British_Airways I really needed the ex...   
2   So @AmericanAir @EWRairport lied to an old lad...   
3   And had to pay extra £50 because bag wouldnt f...   
4   @AmericanAir, I should be on a flight to LA, i...   
..                                                ...   
95  @SingaporeAir Similar to what Emirates offers ...   
96  5* Lufthansa SALE: Cheap flights from Germany ...   
97  RT @asemota: Why are the @British_Airways plan...   
98  RT @AllDayIDreamOf: .@OonaDahl Dahl and Newman...   
99  Not the first time I'm seeing this complaint. ...   

                                          translation  \
0   RT @sandeeprrao1991: BREAKING:-\nKLM to fly 3x...   
1   Thanks @British_Airways I really needed the ex...   
2   So @AmericanAir @EWRairport lied to an old lad...   
3   And had to pay extra £50 because bag wouldnt f...   
4   @AmericanAir, I should be 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['our_guess'] = test['text'].apply(lambda x: predict_category(x, tfidf_vectorizer, svm_model, label_encoder))


### Fine-tuning the model

In [79]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import numpy as np

df_train = df.head[100:].to_dict(orient='records')

model_name = 'MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(candidate_labels), 
    ignore_mismatched_sizes=True
)

for layer in model.layers:
    layer.trainable = True

texts = [item['text'] for item in df_train]
tokenized_inputs = tokenizer(texts, padding=True, truncation=True, max_length = 512 , return_tensors="tf")

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform([item['Category'] for item in df_train])
labels = tf.convert_to_tensor(labels, dtype=tf.int32)


model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

history = model.fit(
    {'input_ids': tokenized_inputs['input_ids'], 'attention_mask': tokenized_inputs['attention_mask']},
    labels,
    epochs=5,
    batch_size=16,
    validation_split=0.1,  # Add validation split to monitor overfitting
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)]
)

model.save_pretrained("/kaggle/working/fine_tuned_classification")
tokenizer.save_pretrained("/kaggle/working/fine_tuned_tokenizer")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaForSequenceClassification: ['classifier.out_proj.weight', 'roberta.embeddings.position_ids', 'classifier.out_proj.bias']
- This IS expected if you are initializing TFXLMRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.
Some wei

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


('/kaggle/working/fine_tuned_tokenizer/tokenizer_config.json',
 '/kaggle/working/fine_tuned_tokenizer/special_tokens_map.json',
 '/kaggle/working/fine_tuned_tokenizer/sentencepiece.bpe.model',
 '/kaggle/working/fine_tuned_tokenizer/added_tokens.json',
 '/kaggle/working/fine_tuned_tokenizer/tokenizer.json')

In [24]:
from transformers import pipeline
import pandas as pd

# Define the candidate labels
candidate_labels = [
    "flight delays and cancellations",
    "booking problems",
    "check-in troubles",
    "customer service complaints",
    "seating and boarding challenges",
    "in-flight experience",
    "flight information requests",
    "refund complaints",
    "frequent flyer concerns",
    "safety and security concerns",
    "special assistance requests",
    "food and beverage complaints",
    "overbooking complaints",
    "technical difficulties",
    "promotion and offer issues",
    "lost luggage",
    "baggage issues"
]

# Load your pre-trained model and tokenizer using the pipeline
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli")

# Assuming 'df' is your DataFrame containing test data
df_test = df.head(100).to_dict(orient='records')

# Evaluate on test data
test_texts = [item['text'] for item in df_test]
test_labels = [item['Category'] for item in df_test]

# Perform predictions
predictions = classifier(test_texts, candidate_labels=candidate_labels, multi_label=False)

# Extract predicted labels
predicted_labels = [pred['labels'][0] for pred in predictions]

# Encode true and predicted labels to indices
label_encoder = LabelEncoder()
label_encoder.fit(candidate_labels)
test_labels_encoded = label_encoder.transform(test_labels)
predicted_labels_encoded = label_encoder.transform(predicted_labels)

# Calculate accuracy
accuracy = (predicted_labels_encoded == test_labels_encoded).mean() * 100
print(f"Accuracy before fine-tuning: {accuracy:.2f}%")


Accuracy before fine-tuning: 19.00%


In [25]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import pandas as pd

candidate_labels = [
    "flight delays and cancellations",
    "booking problems",
    "check-in troubles",
    "customer service complaints",
    "seating and boarding challenges",
    "in-flight experience",
    "flight information requests",
    "refund complaints",
    "frequent flyer concerns",
    "safety and security concerns",
    "special assistance requests",
    "food and beverage complaints",
    "overbooking complaints",
    "technical difficulties",
    "promotion and offer issues",
    "lost luggage",
    "baggage issues"
]

# Load your fine-tuned model and tokenizer
model_name = '/kaggle/working/fine_tuned_classification'
tokenizer_name = '/kaggle/working/fine_tuned_tokenizer'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(candidate_labels), 
    ignore_mismatched_sizes=True
)

# Assuming 'df' is your DataFrame containing test data
df_test = df.to_dict(orient='records')
df_test = df_test[:100]

# Evaluate on test data
test_texts = [item['text'] for item in df_test]
tokenized_test_inputs = tokenizer(test_texts, padding=True, truncation=True, max_length=512, return_tensors="tf")

test_labels = label_encoder.transform([item['Category'] for item in df_test])
test_labels = tf.convert_to_tensor(test_labels, dtype=tf.int32)

# Perform predictions
predictions = model.predict({'input_ids': tokenized_test_inputs['input_ids'], 'attention_mask': tokenized_test_inputs['attention_mask']})
predicted_labels = tf.argmax(predictions.logits, axis=1).numpy()

# Calculate accuracy
accuracy = (predicted_labels == test_labels.numpy()).mean()*100
print(f"Accuracy after fine-tuning: {accuracy:.2f}%")

All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

All the layers of TFXLMRobertaForSequenceClassification were initialized from the model checkpoint at /kaggle/working/fine_tuned_classification.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


Accuracy after fine-tuning: 0.00%


In [None]:
model_name = '/kaggle/working/fine_tuned_classification'
tokenizer_name = '/kaggle/working/fine_tuned_tokenizer'

# Check GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
gpus = tf.config.list_logical_devices('GPU')
strategy = tf.distribute.MirroredStrategy()

# Initialize tokenizer and pipeline
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

print(f"Maximum sequence length for the tokenizer: {tokenizer.model_max_length}")

device = 0 if tf.config.experimental.list_physical_devices('GPU') else -1
print(device)
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli", device=device)

#USE THIS WHEN MODEL IS NORMALLY FINE-TUNED
#classifier = pipeline("zero-shot-classification", model= model, tokenizer = tokenizer, device=device)

candidate_labels = [
    "flight delays and cancellations",
    "booking problems",
    "check-in troubles",
    "customer service complaints",
    "seating and boarding challenges",
    "in-flight experience",
    "flight information requests",
    "refund complaints",
    "frequent flyer concerns",
    "safety and security concerns",
    "special assistance requests",
    "food and beverage complaints",
    "overbooking complaints",
    "technical difficulties",
    "promotion and offer issues",
    "lost luggage",
    "baggage issues"
]

def classify_batch_conversations(conversations):
    if not conversations:  # Ensure there are conversations to process
        return [], []
    results = classifier(conversations, candidate_labels)
    labels = [result['labels'][0] for result in results]
    scores = [result['scores'][0] for result in results]
    return labels, scores

def update_text_local(batch: List[Tuple[str, str]], db_path: str) -> None:
    connection = connect_to_local_database(db_path)
    if connection is None:
        return
    try:
        cursor = connection.cursor()
        update_query = "UPDATE Conversations SET category = ? WHERE first_tweet_id = ?"
        cursor.executemany(update_query, batch)
        connection.commit()
    except sqlite3.Error as e:
        print(f"Error updating batch: {e}")
    finally:
        if cursor:
            cursor.close()
        if connection:
            connection.close()

def process_batch(batch: List[str]):
    if not batch:  # Ensure there are texts to process
        return []
    labels, scores = classify_batch_conversations(batch)
    return labels, scores

def clear_gpu_memory():
    tf.keras.backend.clear_session()
    try:
        tf.compat.v1.reset_default_graph()
    except AttributeError:
        pass
    gc.collect()

def apply_sentiment_analysis(df, text_column, batch_size=128, max_workers=4):
    texts = df[text_column].tolist()
    labels = []
    scores = []    

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            if batch:  # Ensure the batch is not empty
                batch_labels, batch_scores = executor.submit(process_batch, batch).result()
                labels.extend(batch_labels)
                scores.extend(batch_scores)
                clear_gpu_memory()
    df["category"] = labels
    df["confidence"] = scores
    return df

def get_batches(df: pd.DataFrame, batch_size: int = 1000) -> List[pd.DataFrame]:
    return [df.iloc[i : i + batch_size] for i in range(0, len(df), batch_size)]

def convert_to_list(df: pd.DataFrame) -> List[List]:
    tweet_ids = df.index.to_numpy()
    sentiments = df["category"].to_numpy()
    return list(zip(sentiments, tweet_ids))

def connect_to_local_database(db_path: str):
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        print(f"Error while connecting to SQLite: {e}")
    return None


In [13]:
test_data_copy = test_data.copy()
test_data_copy = test_data_copy.head(10)

In [14]:
for_batches = test_data_copy[test_data_copy['cleaned_text'].apply(len) >= 1]
data_batches = get_batches(for_batches[["cleaned_text"]], 1000)

In [None]:
# for_batches = test_data[test_data['cleaned_text'].apply(len) >= 1]
# data_batches = get_batches(for_batches[["cleaned_text"]], 1000)

In [17]:
for batch in tqdm(data_batches, desc="Updating text: "):
    df_sentiment = apply_sentiment_analysis(batch, "cleaned_text", 256, 2)
    update_text_local(convert_to_list(df_sentiment), path)

Updating text: 100%|██████████| 1/1 [00:06<00:00,  6.18s/it]


In [None]:
df_sentiment