In [26]:
import os
import sqlite3
import pandas as pd

def get_local_data(query: str, path: str) -> pd.DataFrame:
    # Connect to the SQLite database using a context manager
    with sqlite3.connect(path) as connection:
        # Read the data into a DataFrame
        df = pd.read_sql_query(query, connection,
                                   dtype={k: v for k, v in DTYPES.items() if k not in ("tweet_creation_time", "user_creation_time")},
                                   index_col='tweet_id')
        df['tweet_creation_time'] = pd.to_datetime(df['tweet_creation_time'])
        df['user_creation_time'] = pd.to_datetime(df['user_creation_time'])
    
    return df

QUERY_ALL = """
SELECT 
    Users.user_id AS user_id, 
    Users.creation_time AS user_creation_time, 
    Tweets.replied_tweet_id,
    Tweets.creation_time AS tweet_creation_time,
    Tweets.tweet_id as tweet_id,
    Tweets.full_text
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id;
"""

DTYPES = {
"user_id": "object",
"user_creation_time": "datetime64[ns]",
"tweet_creation_time": "datetime64[ns]",
"tweet_id": "object",
"full_text": "object",
}



path =  os.path.join(
        os.path.dirname(
            os.getcwd()
            ),
    "data_processed", "local_backup.db")
path
test_data = get_local_data(QUERY_ALL, path)

test_data = test_data.reset_index()

# DONE!!!!!!!!!

airlines_id = ['56377143','106062176','18332190','22536055','124476322','26223583',
               '2182373406','38676903','1542862735','253340062','218730857','20626359',
               '45621423']

# List of airlines to search for in text
airlines = ['KLM', 'Lufthansa', 'American Air', 'RyanAir', 'British Airways', 'EasyJet', 
            'Air France', 'Singapore Airlines', 'Qantas', 'Etihad Airways', 'Air Berlin',
            'BritishAirways', 'AirFrance', 'SingaporeAirlines', 'EtihadAirways', 'AirBerlin',
            'AmericanAir', 'VirginAtlantic', 'Virgin Atlantic']




In [32]:
import re 

def create_pattern(airline):
    # Escape the airline name
    escaped_name = re.escape(airline)
    # Replace escaped spaces with a pattern that matches either space or underscore
    pattern = escaped_name.replace(r'\ ', r'[_ ]')
    # Add an optional "@" at the beginning and allow for case insensitivity
    return r'@?' + pattern

# Create regex patterns for all airlines
patterns = [create_pattern(airline) for airline in airlines]
keywords = r'\b(?:' + '|'.join(patterns) + r')\b'

# Step 1: Check for mentions of any airline, case-insensitive
test_data['contains_airline'] = test_data['full_text'].str.contains(keywords, case=False, regex=True)

# Step 2: Filter out tweets where contains_airline is True
filtered_data = test_data[~test_data['contains_airline']]

# Step 3: Filter out tweets that are replies to airlines
filtered_data = filtered_data[(filtered_data['replied_tweet_id'] == '') | (~filtered_data['replied_tweet_id'].isin(airlines_id))]
filtered_data = filtered_data[~filtered_data['user_id'].isin(airlines_id)]


# Print counts
print(f"Mentions that CONTAIN AIRLINE: {test_data['contains_airline'].sum()}")
print(f"Non-mentions that DO NOT CONTAIN AIRLINE: {len(test_data) - test_data['contains_airline'].sum()}")

print(len(test_data), 'no of tweets in test_data')
print(len(filtered_data), 'no of tweets in filtered data')

# Display the output columns
print(filtered_data[['tweet_id', 'user_id', 'replied_tweet_id', 'full_text', 'contains_airline']])

Mentions that CONTAIN AIRLINE: 4469547
Non-mentions that DO NOT CONTAIN AIRLINE: 1678558
6148105 no of tweets in test_data
1285687 no of tweets in filtered data
                    tweet_id              user_id     replied_tweet_id  \
2        1131172864147808257           3420691215                 None   
17       1131166315400577024           2383996850                 None   
18       1131173222039412736           2900385661                 None   
21       1131118866082480129             30231490                 None   
25       1131173335545593856            224916583                 None   
...                      ...                  ...                  ...   
6148065  1244696446760296448            437502803  1244694218108538881   
6148069  1244696488455921670            355187133                 None   
6148072  1244002553349758977            129936509                 None   
6148081  1244696565203316742  1179195262625026048                 None   
6148100  1244696703690772

In [31]:
import pandas as pd
import re


# Airlines list
airlines = [
    'KLM', 'Lufthansa', 'American Air', 'RyanAir', 'British Airways', 'EasyJet', 
    'Air France', 'Singapore Airlines', 'Qantas', 'Etihad Airways', 'Air Berlin',
    'BritishAirways', 'AirFrance', 'SingaporeAirlines', 'EtihadAirways', 'AirBerlin',
    'AmericanAir', 'VirginAtlantic', 'Virgin Atlantic'
]

# Function to clean mentions not in the airlines list
def clean_mentions(text):
    # Regex pattern to find mentions
    mention_pattern = r'@([A-Za-z0-9_]+)'
    mentions = re.findall(mention_pattern, text)
    
    # Check if each mention is in the airlines list
    valid_mentions = [f"@{mention}" for mention in mentions if mention in airlines]
    
    # Replace invalid mentions in the text
    for mention in mentions:
        if mention not in airlines:
            text = text.replace(f"@{mention}", "")
    
    return text

# Apply the cleaning function to the DataFrame
test_data['cleaned_text'] = test_data['full_text'].apply(clean_mentions)

# Display the original and cleaned tweets
print("Original Tweets:")
print(test_data['full_text'])
print("\nCleaned Tweets:")
print(test_data['cleaned_text'])


Original Tweets:
0          La ruta de easyJet entre Londres y Menorca tra...
1          @goody_tracy Here’s a list of some of @JonesDa...
2          RT @bttr_as1: @goody_tracy Here’s a list of so...
3                                           @British_Airways
4          Nice change by @AmericanAir. Bikes now pay sta...
                                 ...                        
6148100    RT @jfergo86: Me parece a mí o el avión es más...
6148101    Today’s random pic of the day is the one of Vo...
6148102    RT @SchipholWatch: @spbverhagen @markduursma @...
6148103    RT @wiltingklaas: Tweede Kamer stemt over vlie...
6148104    @easyJet My refund is being process since two ...
Name: full_text, Length: 6148105, dtype: object

Cleaned Tweets:
0          La ruta de easyJet entre Londres y Menorca tra...
1           Here’s a list of some of  clients. They shoul...
2          RT :  Here’s a list of some of  clients. They ...
3                                                           
4  