In [1]:
import os
import sys
import pandas as pd
from tqdm.notebook import tqdm
import conversation_algorithm


sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "_0_Constants_and_Utils"))

from viz_constants import COMPANY_ID_TO_NAME
from database_utils import get_dataframe_from_query, form_connection_params, split_into_batches, execute_queries, connect_to_database

In [2]:
# Tweet id, author id and replied-to tweet id for every tweet whose author
# is present in Users, ordered newest first.
QUERY_CONVO_EXTRACT = """
SELECT 
    Tweets.tweet_id,
    Users.user_id AS user_id, 
    Tweets.replied_tweet_id
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id
ORDER BY Tweets.creation_time DESC;
"""


# Every ID column is read with the generic "object" dtype.
DTYPES = dict.fromkeys(("user_id", "tweet_id", "replied_tweet_id"), "object")

# Unique company IDs, collected into a set
company_ids = {*COMPANY_ID_TO_NAME.keys()}

In [3]:
# Set local = False if you want to query the online MySQL database
local = True
# Build the connection settings for the backend selected above.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible in this notebook — confirm against database_utils.
connection_params = form_connection_params(local, True)

In [4]:
# Run the extraction query and keep the result as a DataFrame.
# NOTE(review): the trailing "tweet_id" argument presumably names the index
# column (the displayed frame is indexed by tweet_id) — confirm against
# database_utils.
test_data = get_dataframe_from_query(QUERY_CONVO_EXTRACT, connection_params, local, DTYPES, "tweet_id")
test_data

Unnamed: 0_level_0,user_id,replied_tweet_id
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1244696713350217728,1223576386432126976,
1244696713765564416,56784613,
1244696708983984131,246520593,
1244696710447800320,109284383,
1244696703690772485,278698748,
...,...,...
773181150,10812972,
773176947,10812972,
773176924,10812972,
773176134,10812972,


In [5]:
# Keep just the author and replied-to columns (the index is left as-is)
# as the input for conversation extraction.
convo_special = test_data.loc[:, ["user_id", "replied_tweet_id"]]
convo_special

Unnamed: 0_level_0,user_id,replied_tweet_id
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1244696713350217728,1223576386432126976,
1244696713765564416,56784613,
1244696708983984131,246520593,
1244696710447800320,109284383,
1244696703690772485,278698748,
...,...,...
773181150,10812972,
773176947,10812972,
773176924,10812972,
773176134,10812972,


In [6]:
# Build the conversations from the reply links; the result is consumed below
# as a list of lists of tweet IDs.
# NOTE(review): how company_ids affects which reply chains are kept is decided
# inside conversation_algorithm — confirm there.
conversations = conversation_algorithm.extract_conversations(convo_special, company_ids)

Extracting conversations:   0%|          | 0/1795409 [00:00<?, ?it/s]

In [7]:
# Preview only the first few conversations: echoing the whole list would embed
# hundreds of thousands of entries in the saved notebook output.
conversations[:15]

[['1244694453190897664', '1244696682979303426'],
 ['1242875007270891523', '1244696352090656770'],
 ['1244663027452071936', '1244696298638450696'],
 ['1244550514970329088', '1244553548668579852', '1244696257781805056'],
 ['1244683000195022855', '1244696213552758787'],
 ['1244542518987014144', '1244542697366532096', '1244696138512556033'],
 ['1244695110639632386', '1244696125833175041'],
 ['1244542518987014144', '1244696104341471234'],
 ['1239668797218402305',
  '1241034899156545542',
  '1241147545096765443',
  '1242202731919675397',
  '1242339956263182336',
  '1244683643416698880',
  '1244689590549647361',
  '1244695969918222338'],
 ['1241498515039272960', '1244695952151240705'],
 ['1238774159498522624', '1244695827269984256'],
 ['1244689059072573440', '1244695718792699905'],
 ['1244692213369577473', '1244695542711693312', '1244695647271485448'],
 ['1244679105427181570', '1244689147559804928', '1244695585833332737'],
 ['1244690625586434049', '1244692785690787841', '1244695187529629696']

# Uploading conversations

In [8]:
# Flatten the conversations into (conversation_id, tweet_order, tweet_id)
# records; both the conversations and the tweets inside each one are
# numbered starting from 1.
rows = [
    (conv_id, order, tweet_id)
    for conv_id, conv in enumerate(conversations, start=1)
    for order, tweet_id in enumerate(conv, start=1)
]

# One row per tweet, ready for upload
df = pd.DataFrame(rows, columns=['conversation_id', 'tweet_order', 'tweet_id'])
df

Unnamed: 0,conversation_id,tweet_order,tweet_id
0,1,1,1244694453190897664
1,1,2,1244696682979303426
2,2,1,1242875007270891523
3,2,2,1244696352090656770
4,3,1,1244663027452071936
...,...,...,...
1259526,458722,3,452657442057646080
1259527,458723,1,451124070730719233
1259528,458723,2,451125255294443521
1259529,458724,1,430790355962052608


In [9]:
def upload_data(conversations: pd.DataFrame, connection_params: dict, local: bool, batch_size: int) -> None:
    """Upload conversation rows plus one placeholder category per conversation.

    Each row of ``conversations`` is inserted into the ``Conversations`` table,
    and every distinct conversation id additionally gets a row in
    ``ConversationsCategory`` with the category ``"Undefined category"``.
    Both inserts ignore rows that already exist.

    Args:
        conversations: Frame whose columns are, in this order,
            conversation_id, tweet_order and tweet_id; rows are bound to the
            statement placeholders positionally.
        connection_params: Settings forwarded to ``connect_to_database``.
        local: When True the statements are sent as written
            (``INSERT OR IGNORE`` with ``?`` placeholders). When False they
            are rewritten to ``INSERT IGNORE`` with ``%s`` placeholders.
        batch_size: Batch size forwarded to ``split_into_batches``.

    Raises:
        Whatever ``connect_to_database`` or ``execute_queries`` raises; the
        connection is closed before the error propagates.
    """
    insertion_conversations = """
      INSERT OR IGNORE INTO Conversations(conversation_id, tweet_order, tweet_id)
      VALUES(?, ?, ?);
    """
    insert_category = """
      INSERT OR IGNORE INTO ConversationsCategory(conversation_id, category)
      VALUES(?, ?);
    """
    if not local:
        # The non-local backend takes "%s" placeholders and "INSERT IGNORE".
        insertion_conversations = insertion_conversations.replace("?", "%s").replace(" OR", "")
        insert_category = insert_category.replace("?", "%s").replace(" OR", "")

    conversations_upload = conversations.values.tolist()
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # category batches are identical from run to run (a set gives no such
    # ordering guarantee).
    unique_conversation_ids = dict.fromkeys(row[0] for row in conversations_upload)
    categories_to_upload = [[conv_id, "Undefined category"] for conv_id in unique_conversation_ids]

    connection = connect_to_database(connection_params, local)
    try:
        for batch in tqdm(split_into_batches(conversations_upload, batch_size),
                          desc="Uploading conversations: "):
            execute_queries(connection, [(insertion_conversations, batch)])
        for batch in tqdm(split_into_batches(categories_to_upload, batch_size),
                          desc="Uploading categories: "):
            execute_queries(connection, [(insert_category, batch)])
    finally:
        # Release the connection even when one of the batch uploads fails.
        connection.close()

    

In [10]:
# Batch size handed to upload_data for both the conversation and category uploads
batch_size = 100_000
upload_data(
    conversations=df,
    connection_params=connection_params,
    local=local,
    batch_size=batch_size,
)

Uploading conversations: : 0it [00:00, ?it/s]

Uploading categories: : 0it [00:00, ?it/s]