In [1]:
import contextlib
import os
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycountry
import seaborn as sns

In [2]:
# Joins Users to Tweets on user_id and returns one row per matching
# user/tweet pair. Both tables expose a `creation_time` column, so each is
# aliased (user_creation_time / tweet_creation_time) to keep them apart in
# the result.
QUERY_ALL = """
SELECT 
    Users.user_id AS user_id, 
    Users.creation_time AS user_creation_time, 
    Tweets.creation_time AS tweet_creation_time,
    Tweets.tweet_id,
    Tweets.full_text,
    Tweets.lang,
    Tweets.country_code,
    Tweets.sentiment_score
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id;
"""

# Fetches every column of the Conversations table unfiltered.
QUERY_CONVERSATIONS = """
SELECT * FROM Conversations;
"""

# Column -> dtype mapping applied by pandas when the users/tweets query is
# loaded. Identifier and text columns are kept as generic objects, the
# language/country codes as categoricals, and the sentiment score as a
# 32-bit float.
DTYPES = dict(
    user_id="object",
    tweet_id="object",
    full_text="object",
    lang="category",
    country_code="category",
    sentiment_score="float32",
)

# Airline display name -> account id.
# NOTE(review): the ids look like the airlines' Twitter user ids and are
# stored as strings, presumably to match the "object" dtype of user_id.
COMPANY_NAME_TO_ID = dict(
    [
        ("Klm", "56377143"),
        ("Air France", "106062176"),
        ("British Airways", "18332190"),
        ("American Air", "22536055"),
        ("Lufthansa", "124476322"),
        ("Air Berlin", "26223583"),
        ("Air Berlin assist", "2182373406"),
        ("easyJet", "38676903"),
        ("Ryanair", "1542862735"),
        ("Singapore Airlines", "253340062"),
        ("Qantas", "218730857"),
        ("Etihad Airways", "45621423"),
        ("Virgin Atlantic", "20626359"),
    ]
)

# Reverse lookup: account id -> airline display name.
COMPANY_ID_TO_NAME = dict(
    zip(COMPANY_NAME_TO_ID.values(), COMPANY_NAME_TO_ID.keys())
)

In [3]:
def get_full_language_name(language_code: str,
                           default: str="Undefined Language") -> str:
    """
    Convert a two-letter language code (ISO 639-1) to its full language name.

    Parameters:
    language_code (str): The two-letter ISO 639-1 language code. The literal
        label "Other languages" is passed through unchanged.
    default (str): Value returned when the code cannot be resolved.

    Returns:
    str: The full name of the language, or `default` when the code is not a
        string (e.g. None/NaN standing in for a missing value) or is not a
        known ISO 639-1 code.
    """
    # Missing values are not valid lookup keys; report them as "not found"
    # instead of letting the lookup raise on a non-string argument.
    if not isinstance(language_code, str):
        return default
    # Aggregated bucket label rather than a real code: hand it back as is.
    if language_code == "Other languages":
        return language_code
    language = pycountry.languages.get(alpha_2=language_code, default=None)
    return default if language is None else language.name


def get_country_name(country_code: str, default: str="Unknown Country") -> str:
    """
    Convert a two-letter country code (ISO 3166-1 alpha-2) to its full country name.

    Parameters:
    country_code (str): The two-letter ISO 3166-1 alpha-2 country code.
    default (str): Value returned when the code cannot be resolved.

    Returns:
    str: The full name of the country, or `default` when the code is not a
        string (e.g. None/NaN standing in for a missing value) or is not a
        known ISO 3166-1 alpha-2 code.
    """
    # Missing values are not valid lookup keys; report them as "not found"
    # instead of letting the lookup raise on a non-string argument.
    if not isinstance(country_code, str):
        return default
    country = pycountry.countries.get(alpha_2=country_code, default=None)
    return default if country is None else country.name

def get_tweets_with_users(query: str, path: str) -> pd.DataFrame:
    """
    Run `query` against the SQLite database at `path` and return the result
    as a DataFrame indexed by `tweet_id`.

    Parameters:
    query (str): SQL to execute. The result must contain a `tweet_id` column
        plus `tweet_creation_time` and `user_creation_time` columns, which
        are parsed as dates.
    path (str): Filesystem path of the SQLite database.

    Returns:
    pd.DataFrame: The query result with dtypes taken from DTYPES.
    """
    # sqlite3's own context manager only commits or rolls back the open
    # transaction; it does not close the connection. contextlib.closing
    # guarantees the handle is released once the frame has been read.
    with contextlib.closing(sqlite3.connect(path)) as connection:
        return pd.read_sql_query(query, connection,
                                 dtype=DTYPES,
                                 parse_dates=["tweet_creation_time", "user_creation_time"],
                                 index_col='tweet_id')


def get_conversations(query: str, path: str) -> pd.DataFrame:
    """
    Run `query` against the SQLite database at `path` and return the result
    as a DataFrame indexed by (`conversation_id`, `tweet_order`).

    Parameters:
    query (str): SQL to execute. The result must contain `conversation_id`,
        `tweet_order` and `tweet_id` columns.
    path (str): Filesystem path of the SQLite database.

    Returns:
    pd.DataFrame: The query result; `conversation_id` and `tweet_id` are kept
        as objects and `tweet_order` is cast to int16 before indexing.
    """
    # sqlite3's own context manager only commits or rolls back the open
    # transaction; it does not close the connection. contextlib.closing
    # guarantees the handle is released once the frame has been read.
    with contextlib.closing(sqlite3.connect(path)) as connection:
        return pd.read_sql_query(query, connection,
                                 dtype={"conversation_id": "object", "tweet_order": "int16", "tweet_id": "object"},
                                 index_col=["conversation_id", "tweet_order"])

In [4]:
# The database is looked up one level above the current working directory,
# at data_processed/local_backup.db.
path = os.path.join(os.path.dirname(os.getcwd()), "data_processed", "local_backup.db")

# sqlite3.connect() creates an empty database when the file is missing, so
# the problem would otherwise only surface later as an obscure query error
# (and leave a stray empty file behind). Fail early with a clear message.
if not os.path.isfile(path):
    raise FileNotFoundError("SQLite database not found: " + path)

df_tweets_and_users = get_tweets_with_users(QUERY_ALL, path)
df_conversations = get_conversations(QUERY_CONVERSATIONS, path)

In [5]:
# Show the joined users/tweets frame as this cell's output.
df_tweets_and_users

Unnamed: 0_level_0,user_id,user_creation_time,tweet_creation_time,full_text,lang,country_code,sentiment_score
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1131172858951024641,393374091,2011-10-18 12:55:25+00:00,2019-05-22 12:20:00+00:00,La ruta de easyJet entre Londres y Menorca tra...,es,un,-0.037224
1130922003702177800,880417607865815040,2017-06-29 13:28:09+00:00,2019-05-21 19:43:11+00:00,@goody_tracy Here’s a list of some of @JonesDa...,en,un,-0.045324
1131172864147808257,3420691215,2015-08-13 19:18:07+00:00,2019-05-22 12:20:01+00:00,RT @bttr_as1: @goody_tracy Here’s a list of so...,en,un,-0.051741
1131172867985485824,394376606,2011-10-20 00:02:49+00:00,2019-05-22 12:20:02+00:00,@British_Airways,und,un,-0.033292
1131030279278063616,227687574,2010-12-17 14:37:53+00:00,2019-05-22 02:53:26+00:00,Nice change by @AmericanAir. Bikes now pay sta...,en,un,-0.047510
...,...,...,...,...,...,...,...
1244696703690772485,278698748,2011-04-07 19:55:35+00:00,2020-03-30 18:43:14+00:00,RT @jfergo86: Me parece a mí o el avión es más...,es,un,-0.386010
1244696708983984131,246520593,2011-02-02 23:06:38+00:00,2020-03-30 18:43:15+00:00,Today’s random pic of the day is the one of Vo...,en,un,0.872379
1244696710447800320,109284383,2010-01-28 15:09:19+00:00,2020-03-30 18:43:15+00:00,RT @SchipholWatch: @spbverhagen @markduursma @...,nl,un,-0.553437
1244696713350217728,1223576386432126976,2020-02-01 11:59:19+00:00,2020-03-30 18:43:16+00:00,RT @wiltingklaas: Tweede Kamer stemt over vlie...,nl,un,-0.043661


In [6]:
# Show the conversations frame, indexed by (conversation_id, tweet_order),
# as this cell's output.
df_conversations

Unnamed: 0_level_0,Unnamed: 1_level_0,tweet_id
conversation_id,tweet_order,Unnamed: 2_level_1
1,1,1244694453190897664
1,2,1244696682979303426
2,1,1244677304598609923
2,2,1244696641401163776
3,1,1244644204132909060
...,...,...
493694,3,452657442057646080
493695,1,451124070730719233
493695,2,451125255294443521
493696,1,430790355962052608
