In [2]:
import os
from typing import List, Tuple
import sqlite3
import mysql
import mysql.connector
import pandas as pd
from json5 import host
from tqdm.notebook import tqdm
from mysql.connector import Error

In [3]:
def check_given_var(env_var_str: str) -> str:
    """
    Check if the given environment variable is set and return its value.

    Args:
        env_var_str (str): The name of the environment variable to check.

    Returns:
        str: The value of the environment variable.

    Raises:
        AssertionError: If the environment variable is not found.
    """

    env_var = os.getenv(env_var_str)
    assert (
        env_var is not None
    ), f"{env_var_str} is required but not found in environment variables"
    return env_var


def check_env_vars() -> (str, str, str, str):  # type: ignore
    user = check_given_var("DBL_USER")
    database = check_given_var("DBL_DATABASE")
    password = check_given_var("DBL_PASSWORD")
    host = check_given_var("DBL_HOST")
    return user, database, password, host

def connect_to_database(user: str, database: str, password: str, host: str):
    """
    Establish a connection to the database.

    Args:
        user: The database user.
        database: The name of the database.
        password: The password for the database.
        host: The database host.

    Returns:
        A connection object to the MySQL database.
    """
    try:
        connection = mysql.connector.connect(
            user=user, password=password, host=host, database=database
        )
        if connection.is_connected():
            return connection
    except Error as e:
        print(f"Error while connecting to MySQL: {e}")
    return None

In [4]:
db_name = r"C:/Users/20232075\Desktop/local_backup.db"

def extract_local_tweets(db_name: str) -> Tuple[pd.DataFrame]:
    connection = sqlite3.connect(db_name)
    cursor = connection.cursor()
    
    extract_data_1 = """SELECT Tweets.tweet_id, Tweets.full_text, Tweets.lang, Tweets.creation_time as tweet_creation_time,
                                   Tweets.country_code, Tweets.favorite_count, Tweets.retweet_count, Tweets.possibly_sensitive,
                                   Tweets.replied_tweet_id, Tweets.reply_count, Tweets.quoted_status_id, Tweets.quote_count,
                                   Tweets.sentiment_score, Users.user_id, Users.verified, Users.followers_count, Users.friends_count,
                                   Users.statuses_count, Users.creation_time as user_creation_time, Users.default_profile,
                                   Users.default_profile_image
                            FROM Tweets
                            INNER JOIN Users ON Users.user_id = Tweets.user_id;"""
    
    extract_data_2 = """SELECT Conversations.tweet_id, Conversations.conversation_id,  Conversations.tweet_order,
                                   ConversationsCategory.category, ConversationsCategory.confidence
                            FROM Conversations
                            INNER JOIN ConversationsCategory ON Conversations.conversation_id = ConversationsCategory.conversation_id;"""
    
    columns_1 = ['tweet_id', 'full_text', 'lang', 'creation_time', 'country_code', 'favorite_count', 'retweet_count',
                   'possibly_sensitive', 'replied_tweet_id', 'reply_count', 'quoted_status_id', 'quote_count', 'sentiment_score',
                   'user_id', 'verified', 'followers_count', 'friends_count', 'statuses_count', 'user_creation_time',
                   'default_profile', 'default_profile_image']
    cursor.execute(extract_data_1)
    data_1 = cursor.fetchall()
    df1 = pd.DataFrame(data_1, columns=columns_1)
    cursor.close()
    connection.close()

    columns_2 = ['tweet_id', 'conversation_id', 'tweet_order', 'category', 'confidence']
    connection = sqlite3.connect(db_name)
    cursor = connection.cursor()
    cursor.execute(extract_data_2)
    data_2 = cursor.fetchall()
    df2 = pd.DataFrame(data_2, columns=columns_2)
        
    cursor.close()
    connection.close()
    
    
    return df1, df2

In [5]:
# tweets_df = extract_local_data(db_name)
users_tweets_df, conversations_df= extract_local_tweets(db_name)

In [6]:
len(users_tweets_df)

6148105

In [7]:
users_tweets_df

Unnamed: 0,tweet_id,full_text,lang,creation_time,country_code,favorite_count,retweet_count,possibly_sensitive,replied_tweet_id,reply_count,...,quote_count,sentiment_score,user_id,verified,followers_count,friends_count,statuses_count,user_creation_time,default_profile,default_profile_image
0,1131172858951024641,La ruta de easyJet entre Londres y Menorca tra...,es,2019-05-22 12:20:00+00:00,un,0,0,0,,0,...,0,-0.037224,393374091,1,44323,845,73224,2011-10-18 12:55:25+00:00,0,0
1,1130922003702177800,@goody_tracy Here’s a list of some of @JonesDa...,en,2019-05-21 19:43:11+00:00,un,23,33,0,1130615560910254080,2,...,3,-0.045324,880417607865815040,0,2025,2541,22517,2017-06-29 13:28:09+00:00,1,0
2,1131172864147808257,RT @bttr_as1: @goody_tracy Here’s a list of so...,en,2019-05-22 12:20:01+00:00,un,0,0,0,,0,...,0,-0.051741,3420691215,0,1260,1468,38581,2015-08-13 19:18:07+00:00,1,0
3,1131172867985485824,@British_Airways,und,2019-05-22 12:20:02+00:00,un,0,0,0,1131032916232826881,0,...,0,-0.033292,394376606,0,92,215,385,2011-10-20 00:02:49+00:00,1,0
4,1131030279278063616,Nice change by @AmericanAir. Bikes now pay sta...,en,2019-05-22 02:53:26+00:00,un,287,32,0,,11,...,15,-0.047510,227687574,0,34198,1605,17701,2010-12-17 14:37:53+00:00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6148100,1244696703690772485,RT @jfergo86: Me parece a mí o el avión es más...,es,2020-03-30 18:43:14+00:00,un,0,0,0,,0,...,0,-0.386010,278698748,0,1187,1635,48146,2011-04-07 19:55:35+00:00,0,0
6148101,1244696708983984131,Today’s random pic of the day is the one of Vo...,en,2020-03-30 18:43:15+00:00,un,0,0,0,,0,...,0,0.872379,246520593,0,32,53,672,2011-02-02 23:06:38+00:00,1,0
6148102,1244696710447800320,RT @SchipholWatch: @spbverhagen @markduursma @...,nl,2020-03-30 18:43:15+00:00,un,0,0,0,,0,...,0,-0.553437,109284383,0,33,689,1460,2010-01-28 15:09:19+00:00,0,0
6148103,1244696713350217728,RT @wiltingklaas: Tweede Kamer stemt over vlie...,nl,2020-03-30 18:43:16+00:00,un,0,0,0,,0,...,0,-0.043661,1223576386432126976,0,182,411,3798,2020-02-01 11:59:19+00:00,1,0


In [8]:
len(conversations_df)

1346566

In [9]:

conversations_df

Unnamed: 0,tweet_id,conversation_id,tweet_order,category,confidence
0,1244694453190897664,1,1,technical difficulties,0.111812
1,1244696682979303426,1,2,technical difficulties,0.111812
2,1244677304598609923,2,1,booking problems,0.145168
3,1244696641401163776,2,2,booking problems,0.145168
4,1244644204132909060,3,1,booking problems,0.152093
...,...,...,...,...,...
1346561,452657442057646080,493694,3,check-in troubles,0.148291
1346562,451124070730719233,493695,1,check-in troubles,0.122705
1346563,451125255294443521,493695,2,check-in troubles,0.122705
1346564,430790355962052608,493696,1,technical difficulties,0.116650


In [11]:
def convert_tweets_list(df_1: pd.DataFrame, df_2: pd.DataFrame) -> List[Tuple]:
    """
    Convert DataFrame with tweet_id as index to a list of tuples containing sentiment_score and tweet_id.

    Args:
        df: The DataFrame with tweet_id as index and sentiment_score as a column.

    Returns:
        A list of tuples containing sentiment_score and tweet_id.
    """
    tweets_users = df_1[['tweet_id', 'full_text', 'lang', 'creation_time', 'country_code', 'favorite_count', 'retweet_count',
                   'possibly_sensitive', 'replied_tweet_id', 'reply_count', 'quoted_status_id', 'quote_count', 'sentiment_score',
                   'user_id', 'verified', 'followers_count', 'friends_count', 'statuses_count', 'user_creation_time',
                   'default_profile', 'default_profile_image']].values.tolist()
    conversations_list = df_2[['tweet_id', 'conversation_id', 'tweet_order', 'category', 'confidence']].values.tolist()
    return tweets_users, conversations_list
tweets_users_list, conversations_list = convert_tweets_list(users_tweets_df, conversations_df)

In [12]:
len(tweets_users_list)

6148105

In [13]:
len(conversations_list)

1346566

In [14]:
def batch_list(data, batch_size: int):
    """
    Splits a list into smaller batches of a specified size.

    :param data: List of tuples containing sentiment_score and tweet_id.
    :param batch_size: Size of each batch.
    :return: A generator that yields batches of the specified size.
    """
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

batch_tweets_df = batch_list(tweets_users_list, 10_000)
batch_conversations_df = batch_list(conversations_list, 10_000)



In [15]:
def drop_values(user: str, database: str, password: str, host: str):
    deletion_tweets = """DELETE FROM Tweets;"""
    deletion_users = """DELETE FROM Users;"""
    drop_Conversations = """DROP TABLE Conversations;"""
    drop_ConversationsCategory = """DROP TABLE ConversationsCategory;"""
    create_Conversations = """CREATE TABLE IF NOT EXISTS Conversations (
    conversation_id INT NOT NULL,
    tweet_order INT NOT NULL,
    tweet_id VARCHAR(20) NOT NULL,
    PRIMARY KEY (conversation_id, tweet_order)
    );"""
    create_ConversationsCategory = """CREATE TABLE IF NOT EXISTS ConversationsCategory (
    conversation_id INT NOT NULL,
    category ENUM("flight delays and cancellations", "booking problems", "check-in troubles",
    "customer service complaints", "seating and boarding challenges", "in-flight experience",
    "flight information requests", "refund complaints", "safety and security concerns",
    "special assistance requests", "food and beverage complaints", "technical difficulties",
    "promotion and offer issues", "lost luggage", "baggage issues") NOT NULL,
    confidence FLOAT NOT NULL,
    PRIMARY KEY (conversation_id)
    );"""
    connection = connect_to_database(user, database, password, host)
    cursor = connection.cursor()
    cursor.execute(deletion_tweets)
    cursor.execute(deletion_users)
    cursor.execute(drop_Conversations)
    cursor.execute(drop_ConversationsCategory)
    cursor.execute(create_Conversations)
    cursor.execute(create_ConversationsCategory)
    cursor._cnx.commit()
    cursor.close()
    connection.close()

In [17]:
def upload_sentiment(user: str, database: str, password: str, host: str, list_tweets, list_conversations):
    insertion_tweets = """
    INSERT IGNORE INTO Tweets(tweet_id, user_id, full_text, lang, creation_time, country_code, favorite_count,
    retweet_count, possibly_sensitive, replied_tweet_id, reply_count, quoted_status_id, quote_count, sentiment_score)
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
    """

    insertion_user = """
    INSERT IGNORE INTO Users(user_id, verified, followers_count, friends_count,
    statuses_count, creation_time, default_profile, default_profile_image)
    VALUES(%s, %s, %s, %s, %s, %s, %s, %s);
    """
    insertion_conversations = """
    INSERT IGNORE INTO Conversations(conversation_id, tweet_order, tweet_id)
    VALUES(%s, %s, %s);
    """
    insertion_conversations_category = """
    INSERT IGNORE INTO ConversationsCategory(conversation_id, category, confidence)
    VALUES(%s, %s, %s);
    """
    for batch in tqdm(list_tweets):
        batch_users = []
        batch_tweets = []
        for data in batch:
            tweet_data = [data[0], data[13], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9], data[10], data[11], data[12]]
            user_data = [data[13], data[14], data[15], data[16], data[17], data[18], data[19], data[20]]
            batch_tweets.append(tweet_data)
            batch_users.append(user_data)
        connection = connect_to_database(user, database, password, host)
        cursor = connection.cursor()
        cursor.executemany(insertion_user, batch_users)
        cursor.executemany(insertion_tweets, batch_tweets)
        cursor._cnx.commit()
        cursor.close()
        connection.close()
        
    for batch in tqdm(list_conversations):
        batch_conversations = []
        batch_conversations_category = []
        for data in batch:
            conversations_category_data = [data[1], data[3], data[4]]
            conversations_data = [data[1], data[2], data[0],]
            batch_conversations_category.append(conversations_category_data)
            batch_conversations.append(conversations_data)
        connection = connect_to_database(user, database, password, host)
        cursor = connection.cursor()
        cursor.executemany(insertion_conversations_category, batch_conversations_category)
        cursor.executemany(insertion_conversations, batch_conversations)
        cursor._cnx.commit()
        cursor.close()
        connection.close()



In [18]:
user, database, password, host = check_env_vars()
drop_values(user, database, password, host)

In [19]:
upload_sentiment(user, database, password, host, batch_tweets_df, batch_conversations_df)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [20]:
connection = connect_to_database(user, database, password, host)
alter_conversations ="""ALTER TABLE Conversations
ADD FOREIGN KEY (tweet_id) REFERENCES Tweets(tweet_id);"""
cursor = connection.cursor()
cursor.execute(alter_conversations)
cursor.close()
connection.close()

In [21]:
connection = connect_to_database(user, database, password, host)
alter_conversations ="""ALTER TABLE Conversations
ADD FOREIGN KEY (conversation_id) REFERENCES ConversationsCategory(conversation_id);"""
cursor = connection.cursor()
cursor.execute(alter_conversations)
cursor.close()
connection.close()