# Imports

In [None]:
import contextlib
import os
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycountry
import seaborn as sns
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from scipy.special import softmax
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from sqlalchemy import create_engine

# Constants

In [None]:
def check_given_var(env_var_str: str) -> str:
    """
    Check if the given environment variable is set and return its value.

    Args:
        env_var_str (str): The name of the environment variable to check.

    Returns:
        str: The value of the environment variable.

    Raises:
        AssertionError: If the environment variable is not found.
    """
    env_var = os.getenv(env_var_str)
    if env_var is None:
        # Raised explicitly instead of via ``assert`` so the check still runs
        # under ``python -O``, which strips assert statements. The exception
        # type is kept so existing callers are unaffected.
        raise AssertionError(
            f"{env_var_str} is required but not found in environment variables"
        )
    return env_var


def check_env_vars() -> "tuple[str, str, str, str]":
    """
    Read the database connection settings from the environment.

    The variables are read in the order ``DBL_USER``, ``DBL_DATABASE``,
    ``DBL_PASSWORD``, ``DBL_HOST``.

    Returns:
        tuple[str, str, str, str]: ``(user, database, password, host)``.

    Raises:
        Whatever ``check_given_var`` raises for the first variable that is
        not set.
    """
    # The annotation is quoted so it is never evaluated at definition time;
    # this keeps the function importable on interpreters where builtin
    # generics such as ``tuple[...]`` are not subscriptable.
    user = check_given_var("DBL_USER")
    database = check_given_var("DBL_DATABASE")
    password = check_given_var("DBL_PASSWORD")
    host = check_given_var("DBL_HOST")
    return user, database, password, host


# Database connection settings, read from the environment when this cell runs;
# check_env_vars() raises if any of them is missing.
USER, DATABASE, PASSWORD, HOST = check_env_vars()
# Manual override of the user/database names, left disabled:
# USER, DATABASE = "nezox2um_test", "nezox2um_test"

# Every tweet joined with its author's profile columns (one row per tweet).
# The column aliases match the keys of DTYPES.
QUERY_ALL = """
SELECT 
    Users.user_id AS user_id, 
    Users.creation_time AS user_creation_time, 
    Users.verified,
    Users.followers_count,
    Users.friends_count,
    Users.statuses_count,
    Users.default_profile,
    Users.default_profile_image,
    Tweets.creation_time AS tweet_creation_time,
    Tweets.tweet_id,
    Tweets.full_text,
    Tweets.lang,
    Tweets.country_code,
    Tweets.favorite_count,
    Tweets.retweet_count,
    Tweets.possibly_sensitive,
    Tweets.replied_tweet_id,
    Tweets.reply_count,
    Tweets.quoted_status_id,
    Tweets.quote_count
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id;
"""

# Self-join of Tweets: each reply (t1) paired with the tweet it replies to (t2).
# Replies whose parent tweet is not in the table are dropped by the inner join.
QUERY_REPLY = """
SELECT 
    t1.tweet_id AS tweet_id,
    t1.creation_time AS tweet_creation_time,
    t1.user_id AS user_id,
    t2.tweet_id AS original_tweet_id,
    t2.creation_time AS original_tweet_creation_time,
    t2.user_id AS original_user_id
FROM 
    Tweets t1
INNER JOIN 
    Tweets t2
ON 
    t1.replied_tweet_id = t2.tweet_id;
"""

# pandas dtype requested for each column returned by QUERY_ALL.
DTYPES = {
    # Columns selected from the Users table
    "user_id": "object",
    "user_creation_time": "datetime64[ns]",
    "verified": "bool",
    "followers_count": "int32",
    "friends_count": "int32",
    "statuses_count": "int32",
    "default_profile": "bool",
    "default_profile_image": "bool",
    # Columns selected from the Tweets table
    "tweet_creation_time": "datetime64[ns]",
    "tweet_id": "object",
    "full_text": "object",
    "lang": "category",
    "country_code": "category",
    "favorite_count": "int32",
    "retweet_count": "int32",
    "possibly_sensitive": "bool",
    "replied_tweet_id": "object",
    "reply_count": "int32",
    "quoted_status_id": "object",
    "quote_count": "int32",
}

# Airline display name -> user id (stored as a string) of the airline account.
COMPANY_NAME_TO_ID = {
    "Klm": "56377143",
    "Air France": "106062176",
    "British Airways": "18332190",
    "American Air": "22536055",
    "Lufthansa": "124476322",
    "Air Berlin": "26223583",
    "Air Berlin assist": "2182373406",
    "easyJet": "38676903",
    "Ryanair": "1542862735",
    "Singapore Airlines": "253340062",
    "Qantas": "218730857",
    "Etihad Airways": "45621423",
    "Virgin Atlantic": "20626359",
}

# Reverse lookup: user id -> airline display name.
COMPANY_ID_TO_NAME = {
    company_id: company_name
    for company_name, company_id in COMPANY_NAME_TO_ID.items()
}

# Helper functions

In [None]:
def blob_to_datetime(blob: str) -> pd.Timestamp:
    """Convert ``blob`` to a pandas timestamp using ``pd.to_datetime``."""
    parsed = pd.to_datetime(blob)
    return parsed



def get_local_data(query: str, path: str, dtype: bool = True) -> pd.DataFrame:
    """
    Run ``query`` against the SQLite database at ``path`` and return the rows.

    Args:
        query (str): SQL statement to execute.
        path (str): Location of the SQLite database.
        dtype (bool): When True, cast the columns according to ``DTYPES``,
            index the frame by ``tweet_id`` and parse ``tweet_creation_time``
            and ``user_creation_time`` with ``pd.to_datetime``. When False,
            return the query result without any of that post-processing.

    Returns:
        pd.DataFrame: The query result.
    """
    # A sqlite3 connection used as a context manager only commits or rolls
    # back the transaction; it does not close the connection. ``closing``
    # guarantees the handle is released, while the inner ``connection``
    # context keeps the original commit/rollback behaviour.
    with contextlib.closing(sqlite3.connect(path)) as connection, connection:
        if not dtype:
            return pd.read_sql_query(query, connection)

        datetime_columns = ("tweet_creation_time", "user_creation_time")
        # The datetime columns are left out of the cast and parsed afterwards.
        column_dtypes = {
            column: kind
            for column, kind in DTYPES.items()
            if column not in datetime_columns
        }
        df = pd.read_sql_query(
            query, connection, dtype=column_dtypes, index_col="tweet_id"
        )
        for column in datetime_columns:
            df[column] = pd.to_datetime(df[column])

    return df


def fetch_data(query: str, dtype: bool = True) -> pd.DataFrame:
    """
    Run ``query`` against the MySQL server described by the module constants.

    Args:
        query (str): SQL statement to execute.
        dtype (bool): When True, cast the columns according to ``DTYPES`` and
            index the frame by ``tweet_id``. When False, return the raw query
            result.

    Returns:
        pd.DataFrame: The query result.
    """
    # Imported here so the helper is self-contained.
    from sqlalchemy.engine import URL

    # Build the URL from its parts instead of string interpolation: characters
    # such as "@", "/" or ":" in the password would otherwise corrupt the URL.
    # NOTE(review): if DBL_PASSWORD was stored already percent-encoded to work
    # around this, it must now be stored in plain form.
    url = URL.create(
        drivername="mysql",
        username=USER,
        password=PASSWORD,
        host=HOST,
        port=3306,
        database=DATABASE,
    )
    engine = create_engine(url)
    try:
        if dtype:
            return pd.read_sql_query(
                query, engine, dtype=DTYPES, index_col="tweet_id"
            )
        return pd.read_sql_query(query, engine)
    finally:
        # A new engine (and connection pool) is created on every call, so
        # release its connections once the query has been read.
        engine.dispose()


def get_full_language_name(language_code: str,
                           default: str="Undefined Language") -> str:
    """
    Look up the full language name for a two-letter ISO 639-1 code.

    Parameters:
    language_code (str): The two-letter ISO 639-1 language code. The special
        value "Other languages" is returned unchanged.
    default (str): Value returned when the code is not found.

    Returns:
    str: The full name of the language, or ``default`` if the code is unknown.
    """
    if language_code != "Other languages":
        match = pycountry.languages.get(alpha_2=language_code, default=default)
        # On a miss the lookup hands back ``default`` itself.
        return match.name if match != default else match
    return language_code


def get_country_name(country_code: str, default: str="Unknown Country") -> str:
    """
    Look up the full country name for a two-letter ISO 3166-1 alpha-2 code.

    Parameters:
    country_code (str): The two-letter ISO 3166-1 alpha-2 country code.
    default (str): Value returned when the code is not found.

    Returns:
    str: The full name of the country, or ``default`` if the code is unknown.
    """
    match = pycountry.countries.get(alpha_2=country_code, default=default)
    # On a miss the lookup hands back ``default`` itself.
    return match.name if match != default else match

# Loading

In [None]:
# Two ways to load the data; only the local one is active.
# Server
# test_data = fetch_data(QUERY_ALL)
# df_reply = fetch_data(QUERY_REPLY, dtype=False).set_index("tweet_id")
# Local
# The SQLite backup is looked up relative to the current working directory:
# <two levels above cwd>/data_processed/local_backup.db
path =  os.path.join(
    os.path.dirname(
        os.path.dirname(
            os.getcwd()
            )
        ),
    "data_processed", "local_backup.db")
# Bare expression: no effect when run as a script; in a notebook it only
# displays when it is the last expression of the cell.
path
# Full Users x Tweets join, typed per DTYPES and indexed by tweet_id.
test_data = get_local_data(QUERY_ALL, path)

In [None]:
# NOTE: despite its name, ``avia_names`` holds the airlines' user *ids*
# (the values of COMPANY_NAME_TO_ID). The name is kept because the
# popularity cell below also reads it.
avia_names = set(COMPANY_NAME_TO_ID.values())

# Number of tweets authored by each airline account, most active first.
# ``isin`` replaces the per-row ``apply(lambda: any(...))`` membership test
# with a single vectorised lookup; the selected rows are the same.
replies_to_avia_companies_df = (
    test_data.loc[test_data["user_id"].isin(avia_names)]
    .reset_index()
    .groupby("user_id")
    .count()[["tweet_id"]]
    .sort_values("tweet_id", ascending=False)
    .reset_index()
)
# Show airline names instead of ids (unknown ids are left untouched).
replies_to_avia_companies_df["user_id"] = replies_to_avia_companies_df[
    "user_id"
].apply(lambda user_id: COMPANY_ID_TO_NAME.get(user_id, user_id))
replies_to_avia_companies_df = replies_to_avia_companies_df.set_index("user_id")
replies_to_avia_companies_df

## Companies' activity and popularity in social media

In [None]:
# Engagement totals (retweets, favourites, replies, quotes) summed over the
# tweets authored by each airline account. ``avia_names`` is the set of
# airline user ids; ``isin`` replaces the per-row ``apply(lambda: any(...))``
# membership test with a vectorised lookup selecting the same rows.
df_airlines_popularity = (
    test_data[test_data["user_id"].isin(avia_names)]
    .groupby("user_id")
    .agg(
        retweet_count=("retweet_count", "sum"),
        favorite_count=("favorite_count", "sum"),
        reply_count=("reply_count", "sum"),
        quote_count=("quote_count", "sum"),
    )
)
# Replace the user ids in the index with the airline names.
df_airlines_popularity.index = df_airlines_popularity.index.map(
    lambda user_id: COMPANY_ID_TO_NAME[user_id]
)
df_airlines_popularity.index.name = "Airlines"
df_airlines_popularity

## Information regarding users

In [None]:
# One row per user. Each entry is output column -> (source column, reducer).
user_aggregations = {
    # Profile columns, collapsed to a single value per user
    "user_creation_time": ("user_creation_time", "min"),
    "verified": ("verified", "min"),
    "followers_count": ("followers_count", "min"),
    "friends_count": ("friends_count", "min"),
    "statuses_count": ("statuses_count", "min"),
    "default_profile": ("default_profile", "min"),
    "default_profile_image": ("default_profile_image", "max"),
    # Time span covered by the user's tweets
    "first_tweet": ("tweet_creation_time", "min"),
    "last_tweet": ("tweet_creation_time", "max"),
    # Totals over the user's tweets
    "possibly_sensitive": ("possibly_sensitive", "sum"),
    "favorite_count": ("favorite_count", "sum"),
    "retweet_count": ("retweet_count", "sum"),
    "reply_count": ("reply_count", "sum"),
    "quote_count": ("quote_count", "sum"),
    "lang": ("lang", "first"),
}
df_users = test_data.groupby("user_id").agg(**user_aggregations)
df_users.head()

### Custom user "trustworthiness" classification

## Tweets information

# Conversations

In [None]:
# Newest tweets first, reduced to the columns needed to follow reply chains.
conversation_columns = ["user_id", "replied_tweet_id", "quoted_status_id"]
convo_special = test_data.sort_values(
    "tweet_creation_time", ascending=False
)[conversation_columns]
convo_special

In [None]:
# v1
from tqdm import tqdm
from collections import defaultdict

class TrieNode:
    """A single node of the conversation trie."""

    # Potentially one node per tweet is created, so skip the per-instance
    # ``__dict__`` to keep memory usage down.
    __slots__ = ("children", "is_end")

    def __init__(self):
        # Child nodes keyed by tweet id; created on first access.
        self.children = defaultdict(TrieNode)
        # True when an inserted conversation ends exactly at this node.
        self.is_end = False


class Trie:
    """Prefix tree over conversations, each a sequence of tweet ids."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, conversation):
        """Store ``conversation`` (an iterable of tweet ids) in the trie."""
        node = self.root
        for tweet_id in conversation:
            node = node.children[tweet_id]
        node.is_end = True

    def is_subset(self, conversation):
        """
        Return True if ``conversation`` is a prefix of a stored conversation.

        Despite the name this is a prefix test, not a general subset test:
        the ids must match a stored conversation from its first tweet on.
        An empty ``conversation`` is always considered contained.
        """
        node = self.root
        for tweet_id in conversation:
            # Membership is tested first so the defaultdict does not create
            # a node for an id that was never inserted.
            if tweet_id not in node.children:
                return False
            node = node.children[tweet_id]
        return True



def trace_conversation(start_tweet_id, tweet_dict):
    """
    Follow the reply chain upwards from ``start_tweet_id``.

    ``tweet_dict`` maps a tweet id to a record with the keys 'user_id' and
    'replied_tweet_id'. The walk stops at a falsy id, at an id missing from
    ``tweet_dict``, or when an id repeats (reply cycle).

    Returns the visited tweet ids ordered from the oldest ancestor to
    ``start_tweet_id`` when exactly two distinct users took part, otherwise
    None (including as soon as a third user shows up).
    """
    chain = []
    participants = set()
    visited = set()  # ids already walked in this chain; guards against cycles
    cursor = start_tweet_id

    while cursor and cursor in tweet_dict and cursor not in visited:
        record = tweet_dict[cursor]
        chain.append(cursor)
        participants.add(record['user_id'])
        visited.add(cursor)
        if len(participants) > 2:
            # A third participant: not an exclusive two-party conversation.
            return None
        cursor = record['replied_tweet_id']

    if len(participants) != 2:
        return None
    return list(reversed(chain))


def extract_and_filter_conversations(df):
    """
    Collect the two-user reply chains found in ``df``.

    ``df`` is indexed by tweet id and has the columns read by
    ``trace_conversation``. Every tweet with a non-null 'replied_tweet_id' is
    traced in the row order of ``df``; a chain is dropped when it is already
    contained (as a prefix) in a chain kept earlier.

    Returns a list of conversations, each a list of tweet ids.
    """
    lookup = df.to_dict('index')
    seen_chains = Trie()  # remembers kept chains for the containment check
    kept = []

    # Only tweets that are replies can start a trace.
    reply_ids = df[df['replied_tweet_id'].notnull()].index
    for reply_id in tqdm(reply_ids, desc="Extracting all conversations"):
        chain = trace_conversation(reply_id, lookup)
        if not chain or seen_chains.is_subset(chain):
            continue
        seen_chains.insert(chain)
        kept.append(chain)

    return kept

In [None]:
# List of conversations (each a list of tweet ids), traced from the
# newest-first table built above.
conversations = extract_and_filter_conversations(convo_special)

In [None]:
# One (conversation number, tweet id) row per tweet; numbering starts at 1.
data = [
    (convo_num, tweet_id)
    for convo_num, convo in enumerate(conversations, start=1)
    for tweet_id in convo
]
# Flat table; the MultiIndex is set after merging in the tweet columns.
df_conversations = pd.DataFrame(data, columns=['Conversation', 'Tweet_ID'])

df_conversations

In [None]:

# Attach the tweet/user columns of test_data (indexed by tweet id) to every
# conversation row, then index the result by (Conversation, Tweet_ID).
df_conversations_full = (
    df_conversations
    .merge(test_data, how='left', left_on='Tweet_ID', right_index=True)
    .set_index(['Conversation', 'Tweet_ID'])
)
df_conversations_full


In [None]:
# Keep every row of the conversations in which Lufthansa posted at least once.
lufthansa_rows = df_conversations_full[
    df_conversations_full['user_id'] == COMPANY_NAME_TO_ID["Lufthansa"]
]
lufthansa_conversations = lufthansa_rows.index.get_level_values('Conversation')
in_lufthansa_conversation = (
    df_conversations_full.index
    .get_level_values('Conversation')
    .isin(lufthansa_conversations)
)
airline_conversation = df_conversations_full.loc[in_lufthansa_conversation]
airline_conversation

In [None]:
# Renumber the remaining conversations 1..k in order of first appearance and
# index the table by (New_Conversation, Tweet_ID).
airline_conversation = airline_conversation.reset_index()
conversation_codes = pd.factorize(airline_conversation['Conversation'])[0]
airline_conversation['New_Conversation'] = conversation_codes + 1
airline_conversation = (
    airline_conversation
    .set_index(['New_Conversation', 'Tweet_ID'])
    .sort_index(level='New_Conversation')
)
airline_conversation

# Sentiment Analysis

In [None]:
#Remove the error message
# transformers_logging.set_verbosity_error()
# logging.getLogger("transformers").setLevel(logging.ERROR)

# Load the tokenizer and model once. Both are loaded from ``model_name`` so
# they cannot drift apart if the checkpoint is changed.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# Run on the first GPU when one is visible, otherwise on the CPU.
# ``tf.test.is_gpu_available()`` is deprecated; listing the physical devices
# is the documented replacement.
if tf.config.list_physical_devices('GPU'):
    device = '/GPU:0'
else:
    device = '/CPU:0'

# Class index produced by the model -> sentiment label used in the results.
labels = {
    0: 'negative',
    1: 'neutral',
    2: 'positive',
}

def process_batch(texts):
    """
    Classify the sentiment of a batch of texts with the pre-trained model.

    Parameters:
    texts (list): A list of texts to analyze.

    Returns:
    list: One label per input text, in input order. Each label is the entry
    of ``labels`` for the class the model scores highest (only the top class
    is returned, not a ranking). An empty input yields an empty list.
    """
    if len(texts) == 0:
        # Nothing to tokenize or classify.
        return []

    encoded_input = tokenizer(texts, return_tensors='tf', padding=True, truncation=True)
    with tf.device(device):
        output = model(encoded_input)
    # The first element of the model output holds the per-class scores.
    logits = output[0].numpy()
    # Softmax is monotonic, so the most probable class is simply the one with
    # the largest logit; no normalisation or full sort is needed. In the
    # unlikely event of an exact tie the lowest class index is chosen.
    top_classes = np.argmax(logits, axis=1)
    return [labels[int(class_index)] for class_index in top_classes]

def apply_sentiment_analysis(df, text_column, batch_size=64, max_workers=14):
    """
    Add a 'sentiment' column to ``df`` using the pre-trained transformer model.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the text column to analyze.
        It is modified in place.
    text_column (str): The name of the column in the DataFrame containing the text to analyze.
    batch_size (int): The number of texts to process in each batch. Default is 64.
    max_workers (int): Accepted for backward compatibility and ignored.
        Default is 14.

    Returns:
    pd.DataFrame: The same ``df`` object, with an additional 'sentiment' column
    holding one label per row as returned by ``process_batch``.

    Batches are processed one after another. The previous implementation
    submitted each batch to a thread pool but waited for its result before
    submitting the next one, so it was sequential as well; the pool has been
    removed to make that explicit.
    NOTE(review): real parallelism would need confirmation that the shared
    tokenizer and model can be called from several threads at once.
    """
    texts = df[text_column].tolist()
    results = []

    for start in range(0, len(texts), batch_size):
        results.extend(process_batch(texts[start:start + batch_size]))

    df['sentiment'] = results
    return df


In [None]:
# Show full tweet texts instead of truncating them in the output.
pd.set_option('display.max_colwidth', None)
full_text_df = airline_conversation[['full_text']].copy()
# ``head`` returns a slice of ``full_text_df``. Copy it so that adding the
# 'sentiment' column writes to an independent frame and does not trigger
# pandas' SettingWithCopyWarning.
airline_conversation_sample = full_text_df.head(100).copy()
airline_conversation_sample = apply_sentiment_analysis(airline_conversation_sample, 'full_text')
airline_conversation_sample
