In [None]:
import os
import re
import sqlite3

import pandas as pd

def get_local_data(query: str, path: str) -> pd.DataFrame:
    """Run *query* against the SQLite database at *path* and return the rows.

    Columns named in the module-level ``DTYPES`` mapping are cast to the
    dtype given there, except the two timestamp columns, which are parsed
    with ``pd.to_datetime`` after the read. The result is indexed by
    ``tweet_id``.

    Args:
        query: SQL statement to execute. Its result must contain
            ``tweet_id``, ``tweet_creation_time`` and ``user_creation_time``.
        path: Filesystem path of the SQLite database file.

    Returns:
        DataFrame holding the query result, indexed by ``tweet_id``.
    """
    timestamp_columns = ("tweet_creation_time", "user_creation_time")
    column_dtypes = {
        column: dtype
        for column, dtype in DTYPES.items()
        if column not in timestamp_columns
    }

    connection = sqlite3.connect(path)
    try:
        # Using the connection as a context manager only commits or rolls
        # back the transaction; it does not close the connection, hence the
        # explicit close() in the finally clause.
        with connection:
            df = pd.read_sql_query(
                query,
                connection,
                dtype=column_dtypes,
                index_col='tweet_id',
            )
    finally:
        connection.close()

    # Timestamps are converted here rather than through the dtype mapping.
    for column in timestamp_columns:
        df[column] = pd.to_datetime(df[column])

    return df

# Selects every tweet joined to its author row in Users. Only tweets whose
# user_id has a matching Users row are returned (INNER JOIN).
QUERY_ALL = """
SELECT
    Users.user_id AS user_id,
    Users.creation_time AS user_creation_time,
    Tweets.replied_tweet_id,
    Tweets.creation_time AS tweet_creation_time,
    Tweets.tweet_id AS tweet_id,
    Tweets.full_text
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id;
"""

# Target dtype per result column. The two *_creation_time entries are listed
# for completeness; get_local_data leaves them out of the dtype cast and
# converts those columns with pd.to_datetime instead.
DTYPES = dict(
    user_id="object",
    user_creation_time="datetime64[ns]",
    tweet_creation_time="datetime64[ns]",
    tweet_id="object",
    full_text="object",
)



# The database file is expected one directory above the current working
# directory, under data_processed/.
path = os.path.join(os.path.dirname(os.getcwd()), "data_processed", "local_backup.db")

# Load every tweet with its author, then turn the tweet_id index back into an
# ordinary column so it can be selected by name later on.
test_data = get_local_data(QUERY_ALL, path).reset_index()

# DONE!!!!!!!!!

# IDs of the airline accounts, kept as strings. Further down they are matched
# against the user_id and replied_tweet_id columns.
airlines_id = [
    '56377143', '106062176', '18332190', '22536055',
    '124476322', '26223583', '2182373406', '38676903',
    '1542862735', '253340062', '218730857', '20626359',
    '45621423',
]

# Airline names to look for in tweet text, in both spaced and run-together
# spellings.
airlines = [
    'KLM',
    'Lufthansa',
    'American Air',
    'RyanAir',
    'British Airways',
    'EasyJet',
    'Air France',
    'Singapore Airlines',
    'Qantas',
    'Etihad Airways',
    'Air Berlin',
    'BritishAirways',
    'AirFrance',
    'SingaporeAirlines',
    'EtihadAirways',
    'AirBerlin',
    'AmericanAir',
    'VirginAtlantic',
    'Virgin Atlantic',
]


def create_pattern(airline: str) -> str:
    """Build a regex fragment matching *airline* with an optional leading "@".

    The name is escaped so it is matched literally, and each space in it may
    also be matched by an underscore. The fragment carries no flags: it is
    not case-insensitive by itself, so the caller must request that when
    matching.

    Args:
        airline: Airline name or handle as plain text.

    Returns:
        Regex source string, e.g. ``@?Air[_ ]France`` for ``"Air France"``.
    """
    # Escape each space-separated part on its own and join the parts with the
    # separator class, so the result does not depend on how re.escape renders
    # a space character.
    escaped_parts = (re.escape(part) for part in airline.split(' '))
    return r'@?' + r'[_ ]'.join(escaped_parts)

# Create regex patterns for all airlines and combine them into a single
# alternation anchored on word boundaries.
patterns = [create_pattern(airline) for airline in airlines]
keywords = r'\b(?:' + '|'.join(patterns) + r')\b'

# Step 1: Check for mentions of any airline, case-insensitive.
# na=False treats tweets with missing text as "no mention". Without it those
# rows would hold NaN, and the `~` negation / boolean indexing below would
# raise.
test_data['contains_airline'] = test_data['full_text'].str.contains(
    keywords, case=False, regex=True, na=False
)

# Step 2: Filter out tweets where contains_airline is True
filtered_data = test_data[~test_data['contains_airline']]

# Step 3: Filter out tweets that are replies to airlines or written by them.
# NOTE(review): airlines_id is matched against user_id below, so it appears to
# hold user ids; comparing replied_tweet_id (a tweet id) with it may never
# match. Confirm against the schema whether a replied-to *user* id column
# should be used instead.
not_reply_to_airline = (
    (filtered_data['replied_tweet_id'] == '')
    | ~filtered_data['replied_tweet_id'].isin(airlines_id)
)
not_written_by_airline = ~filtered_data['user_id'].isin(airlines_id)
filtered_data = filtered_data[not_reply_to_airline & not_written_by_airline]

# Number of tweets removed by steps 2 and 3 together. The summary line below
# reads this name, which was previously never assigned.
difference = len(test_data) - len(filtered_data)

# Print counts
mention_count = test_data['contains_airline'].sum()
print(f"Mentions that CONTAIN AIRLINE: {mention_count}")
print(f"Non-mentions that DO NOT CONTAIN AIRLINE: {len(test_data) - mention_count}")

print(len(test_data), 'no of tweets in test_data')
print(len(filtered_data), 'no of tweets in filtered data')
print(f"Difference in number of tweets between test_data and filtered_data (no removed tweets due to mention of airline, or reply, or by the airline): {difference}")

# Display the output columns
print(filtered_data[['tweet_id', 'user_id', 'replied_tweet_id', 'full_text', 'contains_airline']])