In [1]:
import os
import sqlite3

from collections import defaultdict
from typing import List, Tuple

import mysql
import mysql.connector
import pandas as pd
from mysql.connector import Error
from tqdm import tqdm

In [2]:
class TrieNode:
    """Single node of the conversation trie.

    Attributes are documented here rather than inline:
      * ``children`` maps a tweet id to the next node; subscripting with an
        unseen id creates a fresh ``TrieNode`` for it.
      * ``is_end`` is a flag that starts out ``False``.
    """

    def __init__(self):
        self.is_end = False
        self.children = defaultdict(TrieNode)

class Trie:
    """Prefix tree whose paths are conversations (ordered tweet ids)."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, conversation):
        """Store ``conversation``, creating one node per tweet id as needed.

        The node reached after the last id gets ``is_end = True``.
        """
        cursor = self.root
        for step in conversation:
            # Subscripting creates the child node on first access.
            cursor = cursor.children[step]
        cursor.is_end = True

    def is_subset(self, conversation):
        """Tell whether ``conversation`` can be walked from the root.

        Despite the name this is a prefix test: it returns ``True`` when the
        ids, in order, follow an existing path starting at the root. The
        ``is_end`` flag is not consulted, and an empty conversation yields
        ``True``. The lookup never creates nodes.
        """
        cursor = self.root
        for step in conversation:
            following = cursor.children.get(step)
            if following is None:
                return False
            cursor = following
        return True

def trace_conversation(start_tweet_id: str, tweet_dict: dict):
    """Follow the reply chain upwards from ``start_tweet_id``.

    Args:
        start_tweet_id: Id of the tweet the walk starts from.
        tweet_dict: Mapping of tweet id to a record that provides the keys
            ``'user_id'`` and ``'replied_tweet_id'``.

    Returns:
        The visited tweet ids ordered from the oldest ancestor reached down
        to ``start_tweet_id``, provided exactly two distinct users wrote
        them. When a third user shows up, that tweet is dropped and the
        chain collected so far is returned. ``None`` is returned when the
        walk ends with any other number of users than two.
    """
    chain = []
    participants = set()
    visited = set()  # guards against reply cycles within this walk
    cursor = start_tweet_id

    # Stop on a missing id, an id unknown to tweet_dict, or a repeated id.
    while cursor and cursor in tweet_dict and cursor not in visited:
        record = tweet_dict[cursor]
        chain.append(cursor)
        visited.add(cursor)
        participants.add(record['user_id'])
        if len(participants) > 2:
            # The tweet just appended belongs to the third user: discard it.
            chain.pop()
            chain.reverse()
            return chain
        cursor = record['replied_tweet_id']

    if len(participants) != 2:
        return None
    return list(reversed(chain))

def extract_and_filter_conversations(df: pd.DataFrame):
    """Collect two-user conversations from a tweet table.

    Args:
        df: Frame indexed by tweet id with the columns
            ``tweet_creation_time``, ``user_id`` and ``replied_tweet_id``.

    Returns:
        A list of conversations, each a list of tweet ids as produced by
        ``trace_conversation``. A conversation is skipped when it is already
        a prefix of one collected earlier; tweets are visited newest first.
    """
    newest_first = df.sort_values("tweet_creation_time", ascending=False)
    newest_first.index = newest_first.index.astype(str)
    lookup = newest_first.to_dict('index')

    seen_paths = Trie()  # remembers every conversation kept so far
    kept = []

    # Only tweets that reply to something can start a trace.
    has_parent = newest_first['replied_tweet_id'].notnull()
    for reply_id in tqdm(newest_first[has_parent].index, desc="Extracting all conversations"):
        thread = trace_conversation(reply_id, lookup)
        if not thread:
            continue
        if seen_paths.is_subset(thread):
            continue
        seen_paths.insert(thread)
        kept.append(thread)

    return kept


def get_local_data(query: str, path: str, dtype: bool = True) -> pd.DataFrame:
    """Run ``query`` against the SQLite database at ``path``.

    Args:
        query: SQL statement to execute.
        path: Location of the SQLite database file.
        dtype: When true, the module-level ``DTYPES`` mapping is applied,
            ``tweet_id`` becomes the index, and the columns
            ``tweet_creation_time`` and ``user_creation_time`` are parsed
            with ``pd.to_datetime``; the query must therefore return those
            columns. When false, the raw query result is returned.

    Returns:
        The query result as a DataFrame.
    """
    connection = sqlite3.connect(path)
    try:
        # A sqlite3 connection used as a context manager only commits or
        # rolls back the transaction; it does not close the connection.
        # Closing explicitly avoids leaking a handle on every call.
        with connection:
            if dtype:
                df = pd.read_sql_query(query, connection,
                                       dtype=DTYPES,
                                       index_col='tweet_id')
                df['tweet_creation_time'] = pd.to_datetime(df['tweet_creation_time'])
                df['user_creation_time'] = pd.to_datetime(df['user_creation_time'])
            else:
                df = pd.read_sql_query(query, connection)
    finally:
        connection.close()

    return df


# def fetch_data(query: str, dtype: bool = True) -> pd.DataFrame:
#     engine = create_engine(f"mysql://{USER}:{PASSWORD}@{HOST}:3306/{DATABASE}")
#     if dtype:
#         return pd.read_sql_query(query, engine,
#                                  dtype=DTYPES, index_col='tweet_id')
#     return pd.read_sql_query(query, engine)



In [3]:
def check_given_var(env_var_str: str) -> str:
    """
    Check if the given environment variable is set and return its value.

    Args:
        env_var_str (str): The name of the environment variable to check.

    Returns:
        str: The value of the environment variable (may be an empty string).

    Raises:
        AssertionError: If the environment variable is not found.
    """

    env_var = os.getenv(env_var_str)
    # Raise explicitly instead of using an `assert` statement: asserts are
    # removed when Python runs with -O, which would let None slip through.
    if env_var is None:
        raise AssertionError(
            f"{env_var_str} is required but not found in environment variables"
        )
    return env_var


def check_env_vars() -> Tuple[str, str, str, str]:
    """
    Read the database settings from the environment.

    The variables are looked up in this order: DBL_USER, DBL_DATABASE,
    DBL_PASSWORD, DBL_HOST.

    Returns:
        Tuple[str, str, str, str]: ``(user, database, password, host)``.

    Raises:
        AssertionError: If any of the four variables is not set.
    """
    user = check_given_var("DBL_USER")
    database = check_given_var("DBL_DATABASE")
    password = check_given_var("DBL_PASSWORD")
    host = check_given_var("DBL_HOST")
    return user, database, password, host


# Database settings, read from the environment when this cell runs; the cell
# fails with an AssertionError if any DBL_* variable is missing.
# NOTE(review): in the visible part of the notebook these values are only
# referenced by the commented-out fetch_data helper - confirm they are still needed.
USER, DATABASE, PASSWORD, HOST = check_env_vars()
# USER, DATABASE = "nezox2um_test", "nezox2um_test"

# Every tweet joined with its author: one row per tweet, carrying the user's
# and the tweet's creation time, the text, the language and the id of the
# tweet being replied to.
QUERY_ALL = """
SELECT 
    Users.user_id AS user_id, 
    Users.creation_time AS user_creation_time, 
    Tweets.creation_time AS tweet_creation_time,
    Tweets.tweet_id,
    Tweets.full_text,
    Tweets.lang,
    Tweets.replied_tweet_id
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id;
"""


# Column dtypes handed to pd.read_sql_query by get_local_data. Ids are kept
# as "object" rather than a numeric dtype; "lang" is stored as a category.
DTYPES = {
"user_id": "object",
"tweet_id": "object",
"full_text": "object",
"lang": "category",
"replied_tweet_id": "object",
}

# Airline display name -> id string.
# NOTE(review): presumably these are the airlines' Twitter user ids - confirm.
COMPANY_NAME_TO_ID = {
    "Klm": "56377143",
    "Air France": "106062176",
    "British Airways": "18332190",
    "American Air": "22536055",
    "Lufthansa": "124476322",
    "Air Berlin": "26223583",
    "Air Berlin assist": "2182373406",
    "easyJet": "38676903",
    "Ryanair": "1542862735",
    "Singapore Airlines": "253340062",
    "Qantas": "218730857",
    "Etihad Airways": "45621423",
    "Virgin Atlantic": "20626359",
}

# Reverse lookup: id string -> airline display name.
COMPANY_ID_TO_NAME = {v: k for k, v in COMPANY_NAME_TO_ID.items()}

In [4]:
# Data source: the local SQLite backup, located in "data_processed" next to
# the parent of the current working directory.
# (Server alternative, currently disabled: test_data = fetch_data(QUERY_ALL))
path = os.path.join(os.path.dirname(os.getcwd()), "data_processed", "local_backup.db")

test_data = get_local_data(QUERY_ALL, path)

In [5]:
# Keep only the columns the conversation tracing reads: author, parent tweet
# and creation time (the tweet id is the index).
convo_special = test_data[["user_id", "replied_tweet_id", "tweet_creation_time"]]
convo_special

Unnamed: 0_level_0,user_id,replied_tweet_id,tweet_creation_time
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1131172858951024641,393374091,,2019-05-22 12:20:00+00:00
1130922003702177800,880417607865815040,1130615560910254080,2019-05-21 19:43:11+00:00
1131172864147808257,3420691215,,2019-05-22 12:20:01+00:00
1131172867985485824,394376606,1131032916232826881,2019-05-22 12:20:02+00:00
1131030279278063616,227687574,,2019-05-22 02:53:26+00:00
...,...,...,...
1244696703690772485,278698748,,2020-03-30 18:43:14+00:00
1244696708983984131,246520593,,2020-03-30 18:43:15+00:00
1244696710447800320,109284383,,2020-03-30 18:43:15+00:00
1244696713350217728,1223576386432126976,,2020-03-30 18:43:16+00:00


In [6]:
# List of conversations; each conversation is a list of tweet ids.
conversations = extract_and_filter_conversations(convo_special)

Extracting all conversations: 100%|██████████| 1795409/1795409 [00:08<00:00, 213357.09it/s]


In [7]:
# Flatten the conversations into (conversation number, tweet id) pairs.
# Conversation numbers start at 1 and follow the order of `conversations`.
data = []
for convo_num, convo in enumerate(conversations, start=1):
    data.extend((convo_num, tweet_id) for tweet_id in convo)
# Create a DataFrame with one row per tweet
df_conversations = pd.DataFrame(data, columns=['Conversation', 'Tweet_ID'])

# Display the flat table (the MultiIndex is only set after the merge in a later cell)
df_conversations

Unnamed: 0,Conversation,Tweet_ID
0,1,1244694453190897664
1,1,1244696682979303426
2,2,1244677304598609923
3,2,1244696641401163776
4,3,1244648694454026240
...,...,...
2712242,1064150,451125255294443521
2712243,1064151,430790355962052608
2712244,1064151,430792524043931648
2712245,1064152,248528541157834752


In [8]:

# Attach the tweet details from test_data to every (Conversation, Tweet_ID)
# row, then index the result by that pair. The left join keeps every row of
# df_conversations.
df_conversations_full = (
    df_conversations
    .merge(test_data, left_on='Tweet_ID', right_index=True, how='left')
    .set_index(['Conversation', 'Tweet_ID'])
)
df_conversations_full

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,user_creation_time,tweet_creation_time,full_text,lang,replied_tweet_id
Conversation,Tweet_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1244694453190897664,521835883,2012-03-12 01:11:22+00:00,2020-03-30 18:34:17+00:00,@nealrach @VirginAtlantic Siiiigh.... Still no...,en,1243885949697888263
1,1244696682979303426,20626359,2009-02-11 20:50:56+00:00,2020-03-30 18:43:09+00:00,@Jade_Velveteese Hi Jade. We have an ‘Away fro...,en,1244694453190897664
2,1244677304598609923,396021583,2011-10-22 16:35:05+00:00,2020-03-30 17:26:09+00:00,@VirginAtlantic Sod off your primary sharehold...,en,1244669964289806338
2,1244696641401163776,832964639436701696,2017-02-18 14:47:00+00:00,2020-03-30 18:42:59+00:00,"@Boyde11 @VirginAtlantic Get your facts right,...",en,1244677304598609923
3,1244648694454026240,1233410199500791809,2020-02-28 15:14:56+00:00,2020-03-30 15:32:27+00:00,@flavioArCab @Chapux0204 @chechiffss @aeronaut...,es,1244643427515535360
...,...,...,...,...,...,...,...
1064150,451125255294443521,22536055,2009-03-02 21:23:05+00:00,2014-04-01 22:33:37+00:00,@lanaupdates_ Your information has been forwar...,en,451124070730719233
1064151,430790355962052608,64327804,2009-08-10 03:34:27+00:00,2014-02-04 19:49:59+00:00,"@AmericanAir phew, they finally turned on the ...",en,
1064151,430792524043931648,22536055,2009-03-02 21:23:05+00:00,2014-02-04 19:58:36+00:00,@benjy_greenberg It looks like we'll have you ...,en,430790355962052608
1064152,248528541157834752,19911051,2009-02-02 15:17:02+00:00,2012-09-19 21:06:36+00:00,Un-fucking believable!\nThanks @BritishAirways...,en,


In [12]:
# Fetch the NLTK resources used by the noun-counting cell below.
import nltk
nltk.download('punkt')  # For tokenization
nltk.download('averaged_perceptron_tagger')  # For POS tagging

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chekm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Chekm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [13]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Running tally: lower-cased category word -> number of times it was seen as
# a noun. Filled by the loop further down in this cell.
noun_counts = {}
# Words of interest for the tally above.
# NOTE(review): several entries are listed more than once (e.g. 'meal',
# 'voucher', 'upgrade', 'refund', 'policy', 'error'); this does not affect the
# membership test used below.
# NOTE(review): entries containing a space or a hyphen ('boarding pass',
# 'check-in', ...) can never be counted, because ReturnNouns only keeps
# tokens for which str.isalpha() is true.
chosen_categories = [
    # General Travel and Flight Related Issues
    'flight', 'flights', 'delay', 'delays', 'cancellation', 'cancelations', 'cancel', 'missed', 'connection', 
    'connections', 'boarding', 'gate', 'terminal', 'security', 'check', 'check-in', 'ticket', 'tickets', 'booking', 
    'book', 'itinerary', 'reschedule', 'upgrade', 'upgrades', 'overbooked', 'overbooking', 'seat', 'seats', 'assignment', 
    'assign', 'seating', 'boarding pass', 'baggage', 'luggage', 'lost luggage', 'damaged luggage', 'carry-on', 'checked bag', 
    'baggage claim', 'baggage policy', 'fees', 'cost', 'extra cost', 'payment', 'payment issue', 'refund', 'refunds', 
    'compensation', 'voucher', 'vouchers', 'policy', 'change', 'changes', 'insurance', 'restriction', 'restrictions', 
    'travel document', 'passport', 'visa', 'id', 'driver license', 'entry', 'exit', 'countries', 'border', 'immigration', 
    'customs', 'quarantine', 'health', 'safety', 'emergency', 'evacuation', 'weather', 'storm', 'turbulence', 'mechanical issue', 
    'maintenance', 'technical issue', 'failure', 'glitch', 'malfunction', 'wifi', 'internet', 'entertainment', 'inflight', 
    'movie', 'movies', 'music', 'magazine', 'newspaper', 'meal', 'snack', 'beverage', 'drinks', 'water', 'juice', 'alcohol', 
    'coffee', 'tea', 'soft drink', 'food', 'meal', 'special meal', 'vegetarian', 'vegan', 'kosher', 'halal', 'allergy', 
    'child meal', 'gluten-free', 'lactose-free', 'diabetic', 'service', 'crew', 'staff', 'attitude', 'behavior', 'courtesy', 
    'politeness', 'rude', 'unfriendly', 'help', 'assistance', 'communication', 'response', 'contact', 'call center', 'support', 
    'help desk', 'customer service', 'agent', 'representative', 'complaint', 'complaints', 'issue', 'issues', 'problem', 'problems', 
    'inquiry', 'inquiries', 'feedback', 'review', 'survey', 'rating', 'recommendation', 'suggestion', 'comment', 'experience', 
    'appreciated', 'apology', 'compensate', 'voucher', 'vouchers', 'upgrade', 'upgrades', 'mileage', 'miles', 'points', 'membership', 
    'tier', 'status', 'gold', 'silver', 'platinum', 'elite', 'frequent flyer', 'loyalty', 'program', 'amenity', 'amenities', 'seating', 
    'legroom', 'space', 'recline', 'comfort', 'uncomfortable', 'tight', 'cramped', 'cold', 'hot', 'temperature', 'ventilation', 
    'cleanliness', 'hygiene', 'sanitation', 'dirty', 'clean', 'restroom', 'bathroom', 'toilet', 'lavatory', 'smell', 'odor', 
    'hygiene product', 'soap', 'towel', 'napkin', 'blanket', 'pillow', 'headphones', 'earphones', 'charging', 'outlet', 'outlets', 
    'socket', 'sockets', 'lost and found', 'item', 'items', 'claim', 'lost', 'found', 'delayed', 'missing', 'stolen', 'damage', 
    'damaged', 'broken', 'compensation', 'refund', 'policy', 'restrictions', 'website', 'webpage', 'online', 'offline', 'app', 
    'mobile', 'device', 'error', 'bug', 'fix', 'technical issue', 'log in', 'sign in', 'password', 'username', 'error', 'malfunction'
]

def ReturnNouns(sentence: str):
    """Return the noun tokens of ``sentence`` in their original order.

    The sentence is tokenized, tokens that are not purely alphabetic are
    dropped, the remainder is POS-tagged, and only tokens tagged NN, NNS,
    NNP or NNPS are returned (original casing preserved).
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    alphabetic = [tok for tok in word_tokenize(sentence) if tok.isalpha()]
    return [tok for tok, tag in pos_tag(alphabetic) if tag in noun_tags]

# Count, over every English tweet of the conversation table, how often each
# chosen category word occurs as a noun (case-insensitive).
df_test = df_conversations_full

# Membership is tested once per noun, so use a set instead of scanning the
# category list each time; the list itself is left untouched.
category_lookup = set(chosen_categories)

# Iterate over the English texts only. The previous version additionally
# tokenized every sentence a second time and computed an overlap flag that
# was never read; that dead work is gone.
english_texts = df_test.loc[df_test['lang'] == 'en', 'full_text']
for sentence in english_texts:
    nouns = ReturnNouns(sentence)
    for noun in nouns:
        key = noun.lower()
        if key in category_lookup:  # Only count nouns that are chosen categories
            noun_counts[key] = noun_counts.get(key, 0) + 1

print(noun_counts)


{'flights': 172511, 'contact': 30650, 'countries': 4055, 'entry': 1882, 'communication': 6896, 'refund': 70008, 'vouchers': 7331, 'flight': 558262, 'change': 30110, 'fees': 10413, 'problems': 8868, 'response': 35817, 'booking': 46647, 'service': 101579, 'problem': 25005, 'status': 21981, 'book': 35219, 'website': 28097, 'inquiries': 303, 'refunds': 9735, 'inflight': 1003, 'crew': 41119, 'staff': 48342, 'voucher': 42696, 'ticket': 47564, 'payment': 5215, 'cancellation': 11223, 'baggage': 39276, 'seats': 41640, 'policy': 20969, 'representative': 1405, 'health': 5452, 'online': 18001, 'silver': 1750, 'membership': 2172, 'tier': 1140, 'review': 3202, 'assistance': 20342, 'agent': 17137, 'check': 17643, 'help': 39104, 'courtesy': 1968, 'magazine': 1029, 'cost': 10536, 'quarantine': 1178, 'mobile': 661, 'emergency': 7302, 'delay': 47703, 'tickets': 20540, 'complaints': 3637, 'error': 8048, 'changes': 12502, 'music': 2714, 'id': 3999, 'restrictions': 8215, 'claim': 28981, 'reschedule': 631, '

In [14]:
noun_counts

{'flights': 172511,
 'contact': 30650,
 'countries': 4055,
 'entry': 1882,
 'communication': 6896,
 'refund': 70008,
 'vouchers': 7331,
 'flight': 558262,
 'change': 30110,
 'fees': 10413,
 'problems': 8868,
 'response': 35817,
 'booking': 46647,
 'service': 101579,
 'problem': 25005,
 'status': 21981,
 'book': 35219,
 'website': 28097,
 'inquiries': 303,
 'refunds': 9735,
 'inflight': 1003,
 'crew': 41119,
 'staff': 48342,
 'voucher': 42696,
 'ticket': 47564,
 'payment': 5215,
 'cancellation': 11223,
 'baggage': 39276,
 'seats': 41640,
 'policy': 20969,
 'representative': 1405,
 'health': 5452,
 'online': 18001,
 'silver': 1750,
 'membership': 2172,
 'tier': 1140,
 'review': 3202,
 'assistance': 20342,
 'agent': 17137,
 'check': 17643,
 'help': 39104,
 'courtesy': 1968,
 'magazine': 1029,
 'cost': 10536,
 'quarantine': 1178,
 'mobile': 661,
 'emergency': 7302,
 'delay': 47703,
 'tickets': 20540,
 'complaints': 3637,
 'error': 8048,
 'changes': 12502,
 'music': 2714,
 'id': 3999,
 'res