In [1]:
import os
import sqlite3
import pandas as pd
from tqdm import tqdm
import re 

def get_local_data(query: str, path: str) -> pd.DataFrame:
    # Connect to the SQLite database using a context manager
    with sqlite3.connect(path) as connection:
        # Read the data into a DataFrame
        df = pd.read_sql_query(query, connection,
                                   dtype={k: v for k, v in DTYPES.items() if k not in ("tweet_creation_time", "user_creation_time")},
                                   index_col='tweet_id')
        df['tweet_creation_time'] = pd.to_datetime(df['tweet_creation_time'])
        df['user_creation_time'] = pd.to_datetime(df['user_creation_time'])
    
    return df

QUERY_ALL = """
SELECT 
    Users.user_id AS user_id, 
    Users.creation_time AS user_creation_time, 
    Tweets.replied_tweet_id,
    Tweets.creation_time AS tweet_creation_time,
    Tweets.tweet_id as tweet_id,
    Tweets.full_text
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id;
"""

DTYPES = {
"user_id": "object",
"user_creation_time": "datetime64[ns]",
"tweet_creation_time": "datetime64[ns]",
"tweet_id": "object",
"full_text": "object",
}



path =  os.path.join(
        os.path.dirname(
            os.getcwd()
            ),
    "data_processed", "local_backup.db")
test_data = get_local_data(QUERY_ALL, path)

test_data = test_data.reset_index()

# DONE!!!!!!!!!

airlines_id = ['56377143','106062176','18332190','22536055','124476322','26223583',
               '2182373406','38676903','1542862735','253340062','218730857','20626359',
               '45621423']

# List of airlines to search for in text
airlines = ['KLM', 'Lufthansa', 'American Air', 'RyanAir', 'British Airways', 'EasyJet', 
            'Air France', 'Singapore Airlines', 'Qantas', 'Etihad Airways', 'Air Berlin',
            'BritishAirways', 'AirFrance', 'SingaporeAirlines', 'EtihadAirways', 'AirBerlin',
            'AmericanAir', 'VirginAtlantic', 'Virgin Atlantic']




In [2]:

def create_pattern(airline):
    # Escape the airline name
    escaped_name = re.escape(airline)
    # Replace escaped spaces with a pattern that matches either space or underscore
    pattern = escaped_name.replace(r'\ ', r'[_ ]')
    # Add an optional "@" at the beginning and allow for case insensitivity
    return f'@?{pattern}'

# Create regex patterns for all airlines
patterns = [create_pattern(airline) for airline in airlines]
keywords = r'\b(?:' + '|'.join(patterns) + r')\b'

# Step 1: Check for mentions of any airline, case-insensitive
test_data['contains_airline'] = test_data['full_text'].str.contains(keywords, case=False, regex=True)

# Step 2: Filter out tweets where contains_airline is True
filtered_data = test_data[~test_data['contains_airline']]

# Step 3: Filter out tweets that are replies to airlines
filtered_data = filtered_data[(filtered_data['replied_tweet_id'] == '') | (~filtered_data['replied_tweet_id'].isin(airlines_id))]
filtered_data = filtered_data[~filtered_data['user_id'].isin(airlines_id)]


# Print counts
print(f"Mentions that CONTAIN AIRLINE: {test_data['contains_airline'].sum()}")
print(f"Non-mentions that DO NOT CONTAIN AIRLINE: {len(test_data) - test_data['contains_airline'].sum()}")

print(len(test_data), 'no of tweets in test_data')
print(len(filtered_data), 'no of tweets in filtered data')

# Display the output columns
print(filtered_data[['tweet_id', 'user_id', 'replied_tweet_id', 'full_text', 'contains_airline']])

Mentions that CONTAIN AIRLINE: 4469547
Non-mentions that DO NOT CONTAIN AIRLINE: 1678558
6148105 no of tweets in test_data
1285687 no of tweets in filtered data
                    tweet_id              user_id     replied_tweet_id  \
2        1131172864147808257           3420691215                 None   
17       1131166315400577024           2383996850                 None   
18       1131173222039412736           2900385661                 None   
21       1131118866082480129             30231490                 None   
25       1131173335545593856            224916583                 None   
...                      ...                  ...                  ...   
6148065  1244696446760296448            437502803  1244694218108538881   
6148069  1244696488455921670            355187133                 None   
6148072  1244002553349758977            129936509                 None   
6148081  1244696565203316742  1179195262625026048                 None   
6148100  1244696703690772

In [3]:

# Airlines list
airlines = [
    'KLM', 'Lufthansa', 'American Air', 'RyanAir', 'British Airways', 'EasyJet', 
    'Air France', 'Singapore Airlines', 'Qantas', 'Etihad Airways', 'Air Berlin',
    'BritishAirways', 'AirFrance', 'SingaporeAirlines', 'EtihadAirways', 'AirBerlin',
    'AmericanAir', 'VirginAtlantic', 'Virgin Atlantic'
]

# Function to clean mentions not in the airlines list
def clean_mentions(text):
    # Regex pattern to find mentions
    mention_pattern = r'@([A-Za-z0-9_]+)'
    mentions = re.findall(mention_pattern, text)
    
    # Check if each mention is in the airlines list
    valid_mentions = [f"@{mention}" for mention in mentions if mention in airlines]
    
    # Replace invalid mentions in the text
    for mention in mentions:
        if mention not in airlines:
            text = text.replace(f"@{mention}", "")
    
    return text

# Apply the cleaning function to the DataFrame
test_data['cleaned_text'] = test_data['full_text'].apply(clean_mentions)

# Display the original and cleaned tweets
print("Original Tweets:")
print(test_data['full_text'])
print("\nCleaned Tweets:")
print(test_data['cleaned_text'])


Original Tweets:
0          La ruta de easyJet entre Londres y Menorca tra...
1          @goody_tracy Here’s a list of some of @JonesDa...
2          RT @bttr_as1: @goody_tracy Here’s a list of so...
3                                           @British_Airways
4          Nice change by @AmericanAir. Bikes now pay sta...
                                 ...                        
6148100    RT @jfergo86: Me parece a mí o el avión es más...
6148101    Today’s random pic of the day is the one of Vo...
6148102    RT @SchipholWatch: @spbverhagen @markduursma @...
6148103    RT @wiltingklaas: Tweede Kamer stemt over vlie...
6148104    @easyJet My refund is being process since two ...
Name: full_text, Length: 6148105, dtype: object

Cleaned Tweets:
0          La ruta de easyJet entre Londres y Menorca tra...
1           Here’s a list of some of  clients. They shoul...
2          RT :  Here’s a list of some of  clients. They ...
3                                                           
4  

In [4]:
test_data[['tweet_id', 'cleaned_text']]

Unnamed: 0,tweet_id,cleaned_text
0,1131172858951024641,La ruta de easyJet entre Londres y Menorca tra...
1,1130922003702177800,Here’s a list of some of clients. They shoul...
2,1131172864147808257,RT : Here’s a list of some of clients. They ...
3,1131172867985485824,
4,1131030279278063616,Nice change by @AmericanAir. Bikes now pay sta...
...,...,...
6148100,1244696703690772485,RT : Me parece a mí o el avión es más grande q...
6148101,1244696708983984131,Today’s random pic of the day is the one of Vo...
6148102,1244696710447800320,RT : @KLM Nog niet aan de orde? Als in: e...
6148103,1244696713350217728,RT : Tweede Kamer stemt over vliegtaks https:/...


In [27]:



def check_given_var(env_var_str: str) -> str:
    """
    Check if the given environment variable is set and return its value.

    Args:
        env_var_str (str): The name of the environment variable to check.

    Returns:
        str: The value of the environment variable.

    Raises:
        AssertionError: If the environment variable is not found.
    """

    env_var = os.getenv(env_var_str)
    assert (
        env_var is not None
    ), f"{env_var_str} is required but not found in environment variables"
    return env_var


def check_env_vars() -> (str, str, str, str):  # type: ignore
    user = check_given_var("DBL_USER")
    database = check_given_var("DBL_DATABASE")
    password = check_given_var("DBL_PASSWORD")
    host = check_given_var("DBL_HOST")
    return user, database, password, host

from mysql.connector import Error
import mysql.connector
from typing import Tuple, List

def connect_to_database(user: str, database: str, password: str, host: str):
    """
    Establish a connection to the database.

    Args:
        user: The database user.
        database: The name of the database.
        password: The password for the database.
        host: The database host.

    Returns:
        A connection object to the MySQL database.
    """
    try:
        connection = mysql.connector.connect(
            user=user, password=password, host=host, database=database
        )
        if connection.is_connected():
            return connection
    except Error as e:
        print(f"Error while connecting to MySQL: {e}")
    return None


def get_batches(df: pd.DataFrame, batch_size: int = 1000) -> List[pd.DataFrame]:
    """
    Split DataFrame into batches of DataFrames with specified batch size.

    Args:
        df: The DataFrame containing tweet data.
        batch_size: The size of each batch.

    Returns:
        A list of DataFrames, each containing a batch of rows.
    """
    return [df.iloc[i : i + batch_size] for i in range(0, len(df), batch_size)]


def update_text(
    batch: Tuple[Tuple[float, str]], user: str, database: str, password: str, host: str
) -> None:
    """
    Create and insert batches of tweets into the database in parallel.

    Args:
        batches_list: Tuple of (sentiment, tweet_id) pairs.
        user: The database user.
        database: The name of the database.
        password: The password for the database.
        host: The database host.
    """
    connection = connect_to_database(user, database, password, host)
    if connection is None:
        return
    cursor = connection.cursor()
    update_query = "UPDATE Tweets SET full_text = %s WHERE tweet_id = %s"
    cursor.executemany(update_query, batch)
    connection.commit()
    cursor.close()
    connection.close()

    print("Sentiment values updated successfully")


import sqlite3
def connect_to_local_database(db_path: str):
    """
    Establish a connection to the local SQLite database.

    Args:
        db_path: The path to the SQLite database file.

    Returns:
        A connection object to the SQLite database.
    """
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        print(f"Error while connecting to SQLite: {e}")
    return None


def update_text_local(
    batch: List[Tuple[str, str]], db_path: str
) -> None:
    """
    Update full_text values for a batch of data in the local SQLite database.

    Args:
        batch: List of (full_text, tweet_id) pairs.
        db_path: The path to the SQLite database file.
    """
    connection = connect_to_local_database(db_path)
    if connection is None:
        return
    try:
        cursor = connection.cursor()
        update_query = "UPDATE Tweets SET full_text = ? WHERE tweet_id = ?"
        cursor.executemany(update_query, batch)
        connection.commit()
    except sqlite3.Error as e:
        print(f"Error updating batch: {e}")
    finally:
        if cursor:
            cursor.close()
        if connection:
            connection.close()

def convert_to_list(df: pd.DataFrame) -> List[List]:
    """
    Convert DataFrame with tweet_id as index to a list of lists containing sentiment and tweet_id.

    Args:
        df: The DataFrame with tweet_id as index and sentiment as a column.

    Returns:
        A list of lists containing tweet_id and sentiment.
    """
    tweet_ids = df.index.to_numpy()
    sentiments = df["cleaned_text"].to_numpy()
    return tuple(zip(sentiments, tweet_ids))


In [6]:
to_load = test_data[['tweet_id', 'cleaned_text']].set_index("tweet_id")

In [7]:
data_batches = get_batches(to_load, 10_000)


In [13]:
convert_to_list(data_batches[0])

(('La ruta de easyJet entre Londres y Menorca transporta a más de 19.000 pasajeros en un año https://t.co/Rqy606KVna https://t.co/buWgtqYwCD',
  '1131172858951024641'),
 (' Here’s a list of some of  clients. They should know Jones Day encouraged McGann to break the law:\n \n@AmericanAir \n {surprise face}\n\n \n\n \n \n \n \n \n\nMore to come. Please RT',
  '1130922003702177800'),
 ('RT :  Here’s a list of some of  clients. They should know Jones Day encouraged McGann to break the law:\n…',
  '1131172864147808257'),
 ('', '1131172867985485824'),
 ('Nice change by @AmericanAir. Bikes now pay standard checked bag fees. 👍 made a similar change 2 years ago. \n\nYour turn  https://t.co/w4pZcSHVwX',
  '1131030279278063616'),
 ('RT : Nice change by @AmericanAir. Bikes now pay standard checked bag fees. 👍 made a similar change 2 years ago. \n\nYo…',
  '1131172909463027720'),
 ('BREAKING:-\nKLM to fly 3x weekly btw  and  from winter schedule 2018/19 using Boeing 787-9 aircraft.\n  ',
  '1131170

In [28]:
for batch in tqdm(data_batches, desc="Updating text: "):
    update_text_local(convert_to_list(batch), os.path.join(
        os.path.dirname(
            os.getcwd()
        ),
    "data_processed", "local_backup.db"))

Updating text: 100%|██████████| 615/615 [00:31<00:00, 19.64it/s]
