In [1]:
# it loads everything and then it extracts conversation from the scratch
# TODO: optimise it
import os
import sqlite3

from collections import defaultdict
from typing import List, Tuple

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
class TrieNode:
    def __init__(self):
        self.children = defaultdict(TrieNode)
        self.is_end = False

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, conversation):
        node = self.root
        for tweet_id in conversation:
            node = node.children[tweet_id]
        node.is_end = True

    def is_subset(self, conversation):
        node = self.root
        for tweet_id in conversation:
            if tweet_id not in node.children:
                return False
            node = node.children[tweet_id]
        return True

def trace_conversation(start_tweet_id: str, tweet_dict: dict):
    convo = []
    current_tweet_id = start_tweet_id
    users_in_conversation = set()
    local_processed_tweet_ids = set()  # Local set to track the current conversation
    while current_tweet_id:
        if current_tweet_id not in tweet_dict or current_tweet_id in local_processed_tweet_ids:
            break
        tweet_info = tweet_dict[current_tweet_id]
        convo.append(current_tweet_id)
        users_in_conversation.add(tweet_info['user_id'])
        local_processed_tweet_ids.add(current_tweet_id)
        if len(users_in_conversation) > 2:
            return convo[:-1][::-1]  # As soon as the third user appears, we delete his tweet and return
        current_tweet_id = tweet_info['replied_tweet_id']
    return convo[::-1] if len(users_in_conversation) == 2 else None

def extract_and_filter_conversations(df: pd.DataFrame):
    df = df.sort_values("tweet_creation_time", ascending=False)
    df.index = df.index.astype(str)
    tweet_dict = df.to_dict('index')
    conversations = []
    trie = Trie()  # Initialize trie for subset checks

    # Start tracing conversations from tweets that are replies
    for tweet_id in tqdm(df[df['replied_tweet_id'].notnull()].index, desc="Extracting all conversations"):
        if conversation := trace_conversation(tweet_id, tweet_dict):
            if not trie.is_subset(conversation):
                trie.insert(conversation)
                conversations.append(conversation)

    return conversations


def get_local_data(query: str, path: str, dtype: bool = True) -> pd.DataFrame:
    # Connect to the SQLite database using a context manager
    with sqlite3.connect(path) as connection:
        # Read the data into a DataFrame
        if dtype:
            df = pd.read_sql_query(query, connection,
                                   dtype=DTYPES,
                                   index_col='tweet_id')
            df['tweet_creation_time'] = pd.to_datetime(df['tweet_creation_time'])
            df['user_creation_time'] = pd.to_datetime(df['user_creation_time'])
        else:
            df = pd.read_sql_query(query, connection)
    
    return df

In [3]:
QUERY_ALL = """
SELECT 
    Users.user_id AS user_id, 
    Users.creation_time AS user_creation_time, 
    Users.verified,
    Users.followers_count,
    Users.friends_count,
    Users.statuses_count,
    Users.default_profile,
    Users.default_profile_image,
    Tweets.creation_time AS tweet_creation_time,
    Tweets.tweet_id,
    Tweets.full_text,
    Tweets.lang,
    Tweets.country_code,
    Tweets.favorite_count,
    Tweets.retweet_count,
    Tweets.possibly_sensitive,
    Tweets.replied_tweet_id,
    Tweets.reply_count,
    Tweets.quoted_status_id,
    Tweets.quote_count,
    Tweets.sentiment_score
FROM Users
INNER JOIN Tweets ON Users.user_id = Tweets.user_id;
"""


DTYPES = {
"user_id": "object",
"verified": "bool",
"followers_count": "int32",
"friends_count": "int32",
"statuses_count": "int32",
"default_profile": "bool",
"default_profile_image": "bool",
"tweet_id": "object",
"full_text": "object",
"lang": "category",
"country_code": "category",
"favorite_count": "int32",
"retweet_count": "int32",
"possibly_sensitive": "bool",
"replied_tweet_id": "object",
"reply_count": "int32",
"quoted_status_id": "object",
"quote_count": "int32",
"sentiment_score": "float32",
}

In [4]:
path = os.path.join(
        os.path.dirname(
            os.getcwd()
        ),
    "data_processed", "local_backup.db")
test_data = get_local_data(QUERY_ALL, path)

In [5]:
test_data

Unnamed: 0_level_0,user_id,user_creation_time,verified,followers_count,friends_count,statuses_count,default_profile,default_profile_image,tweet_creation_time,full_text,lang,country_code,favorite_count,retweet_count,possibly_sensitive,replied_tweet_id,reply_count,quoted_status_id,quote_count,sentiment_score
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1131172858951024641,393374091,2011-10-18 12:55:25+00:00,True,44323,845,73224,False,False,2019-05-22 12:20:00+00:00,La ruta de easyJet entre Londres y Menorca tra...,es,un,0,0,False,,0,,0,-0.037224
1130922003702177800,880417607865815040,2017-06-29 13:28:09+00:00,False,2025,2541,22517,True,False,2019-05-21 19:43:11+00:00,@goody_tracy Here’s a list of some of @JonesDa...,en,un,23,33,False,1130615560910254080,2,,3,-0.045324
1131172864147808257,3420691215,2015-08-13 19:18:07+00:00,False,1260,1468,38581,True,False,2019-05-22 12:20:01+00:00,RT @bttr_as1: @goody_tracy Here’s a list of so...,en,un,0,0,False,,0,,0,-0.051741
1131172867985485824,394376606,2011-10-20 00:02:49+00:00,False,92,215,385,True,False,2019-05-22 12:20:02+00:00,@British_Airways,und,un,0,0,False,1131032916232826881,0,,0,-0.033292
1131030279278063616,227687574,2010-12-17 14:37:53+00:00,False,34198,1605,17701,False,False,2019-05-22 02:53:26+00:00,Nice change by @AmericanAir. Bikes now pay sta...,en,un,287,32,False,,11,,15,-0.047510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1244696703690772485,278698748,2011-04-07 19:55:35+00:00,False,1187,1635,48146,False,False,2020-03-30 18:43:14+00:00,RT @jfergo86: Me parece a mí o el avión es más...,es,un,0,0,False,,0,1244398934522392576,0,-0.386010
1244696708983984131,246520593,2011-02-02 23:06:38+00:00,False,32,53,672,True,False,2020-03-30 18:43:15+00:00,Today’s random pic of the day is the one of Vo...,en,un,0,0,False,,0,,0,0.872379
1244696710447800320,109284383,2010-01-28 15:09:19+00:00,False,33,689,1460,False,False,2020-03-30 18:43:15+00:00,RT @SchipholWatch: @spbverhagen @markduursma @...,nl,un,0,0,False,,0,,0,-0.553437
1244696713350217728,1223576386432126976,2020-02-01 11:59:19+00:00,False,182,411,3798,True,False,2020-03-30 18:43:16+00:00,RT @wiltingklaas: Tweede Kamer stemt over vlie...,nl,un,0,0,False,,0,,0,-0.043661


In [6]:
convo_special = test_data[["user_id", "replied_tweet_id", "tweet_creation_time"]]
convo_special

Unnamed: 0_level_0,user_id,replied_tweet_id,tweet_creation_time
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1131172858951024641,393374091,,2019-05-22 12:20:00+00:00
1130922003702177800,880417607865815040,1130615560910254080,2019-05-21 19:43:11+00:00
1131172864147808257,3420691215,,2019-05-22 12:20:01+00:00
1131172867985485824,394376606,1131032916232826881,2019-05-22 12:20:02+00:00
1131030279278063616,227687574,,2019-05-22 02:53:26+00:00
...,...,...,...
1244696703690772485,278698748,,2020-03-30 18:43:14+00:00
1244696708983984131,246520593,,2020-03-30 18:43:15+00:00
1244696710447800320,109284383,,2020-03-30 18:43:15+00:00
1244696713350217728,1223576386432126976,,2020-03-30 18:43:16+00:00


In [7]:
conversations = extract_and_filter_conversations(convo_special)

Extracting all conversations:   0%|          | 0/1795409 [00:00<?, ?it/s]

In [8]:
conversations

[['1244694453190897664', '1244696682979303426'],
 ['1244677304598609923', '1244696641401163776'],
 ['1244648694454026240',
  '1244684854316367872',
  '1244694204565139459',
  '1244695385978867713',
  '1244696494881542144'],
 ['1244344799647449089', '1244696491580628993'],
 ['1244593729312362497', '1244696406570475525'],
 ['1244644204132909060', '1244696371900436481'],
 ['1242875007270891523', '1244696352090656770'],
 ['1244648694454026240',
  '1244684854316367872',
  '1244694204565139459',
  '1244695385978867713',
  '1244696333673521158'],
 ['1244663027452071936', '1244696298638450696'],
 ['1244584879095939073', '1244696279806087172'],
 ['1244693824519184392', '1244696279197847555'],
 ['1244550514970329088', '1244553548668579852', '1244696257781805056'],
 ['1243532085131743232', '1244696235019304960'],
 ['1244648694454026240',
  '1244684854316367872',
  '1244694204565139459',
  '1244695385978867713',
  '1244696230875336712'],
 ['1244683000195022855', '1244696213552758787'],
 ['12446916

In [9]:
data = []
for convo_num, convo in enumerate(conversations, start=1):
    data.extend((convo_num, tweet_id) for tweet_id in convo)
# Create a DataFrame
df_conversations = pd.DataFrame(data, columns=['Conversation', 'Tweet_ID'])

# Set MultiIndex
df_conversations

Unnamed: 0,Conversation,Tweet_ID
0,1,1244694453190897664
1,1,1244696682979303426
2,2,1244677304598609923
3,2,1244696641401163776
4,3,1244648694454026240
...,...,...
2712242,1064150,451125255294443521
2712243,1064151,430790355962052608
2712244,1064151,430792524043931648
2712245,1064152,248528541157834752


In [10]:

# Merge the conversation DataFrame with the test_data DataFrame
df_conversations_full = df_conversations.merge(test_data, left_on='Tweet_ID',
                                               right_index=True, how='left')

# Set the MultiIndex again with Conversation and Tweet_ID
# df_conversations_full.set_index(['Conversation', 'Tweet_ID'], inplace=True)
df_conversations_full

Unnamed: 0,Conversation,Tweet_ID,user_id,user_creation_time,verified,followers_count,friends_count,statuses_count,default_profile,default_profile_image,...,lang,country_code,favorite_count,retweet_count,possibly_sensitive,replied_tweet_id,reply_count,quoted_status_id,quote_count,sentiment_score
0,1,1244694453190897664,521835883,2012-03-12 01:11:22+00:00,False,172,330,3511,True,False,...,en,un,0,0,False,1243885949697888263,0,,0,0.292771
1,1,1244696682979303426,20626359,2009-02-11 20:50:56+00:00,True,598992,358,202940,False,False,...,en,un,0,0,False,1244694453190897664,0,,0,0.190686
2,2,1244677304598609923,396021583,2011-10-22 16:35:05+00:00,False,288,556,9657,True,False,...,en,un,0,0,False,1244669964289806338,0,,0,-0.909571
3,2,1244696641401163776,832964639436701696,2017-02-18 14:47:00+00:00,False,9,86,370,True,False,...,en,un,0,0,False,1244677304598609923,0,,0,0.058692
4,3,1244648694454026240,1233410199500791809,2020-02-28 15:14:56+00:00,False,28,430,573,True,False,...,es,un,0,0,False,1244643427515535360,0,,0,-0.442713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2712242,1064150,451125255294443521,22536055,2009-03-02 21:23:05+00:00,True,1556816,106922,2096661,False,False,...,en,un,3,1,False,451124070730719233,0,,0,0.246197
2712243,1064151,430790355962052608,64327804,2009-08-10 03:34:27+00:00,False,217,573,5675,True,False,...,en,un,0,1,False,,1,,0,0.155998
2712244,1064151,430792524043931648,22536055,2009-03-02 21:23:05+00:00,True,1556816,106922,2096661,False,False,...,en,un,0,1,False,430790355962052608,0,,0,0.486518
2712245,1064152,248528541157834752,19911051,2009-02-02 15:17:02+00:00,True,246933,789,18849,False,False,...,en,un,1,5,False,,6,,0,-0.975850


In [11]:
# Group by 'Conversation' and 'user_id' to get the first and last sentiment scores
grouped = df_conversations_full.groupby(['Conversation', 'user_id'])

# Calculate the first and last sentiment scores
first_last_sentiments = grouped['sentiment_score'].agg(['first', 'last']).reset_index()

# Calculate the change in sentiment score
first_last_sentiments['change_in_sentiment'] = first_last_sentiments['last'] - first_last_sentiments['first']
# df_conversations_full = df_conversations_full.merge(first_last_sentiments[['Conversation', 'user_id', 'change_in_sentiment']], on=['Conversation', 'user_id'], how='left')
first_last_sentiments[["Conversation", "user_id", "change_in_sentiment"]]

Unnamed: 0,Conversation,user_id,change_in_sentiment
0,1,20626359,0.000000
1,1,521835883,0.000000
2,2,396021583,0.000000
3,2,832964639436701696,0.000000
4,3,1233410199500791809,0.468702
...,...,...,...
2128299,1064150,701977520,0.000000
2128300,1064151,22536055,0.000000
2128301,1064151,64327804,0.000000
2128302,1064152,19911051,0.000000


In [12]:
average_sentiment_per_user = df_conversations_full.groupby(['Conversation', 'user_id']).agg(
    tweets_number=("sentiment_score", "count"),
    average_sentiment=("sentiment_score", "mean"),
    min_sentiment=("sentiment_score", "min"),
    max_sentiment=("sentiment_score", "max"),
).reset_index()

# Merge first_last_sentiments with average_sentiment_per_user
merged_df = pd.merge(average_sentiment_per_user, first_last_sentiments[['Conversation', 'user_id', 'change_in_sentiment']],
                     on=['Conversation', 'user_id'], how='left')

In [13]:
test = merged_df.set_index(["Conversation", "user_id"])

In [17]:
test.query("tweets_number > 2")

Unnamed: 0_level_0,Unnamed: 1_level_0,tweets_number,average_sentiment,min_sentiment,max_sentiment,change_in_sentiment
Conversation,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,1233410199500791809,3,-0.202300,-0.442713,0.025988,0.468702
8,1233410199500791809,3,-0.198697,-0.442713,0.036797,0.479510
14,1233410199500791809,3,-0.495133,-0.852509,-0.190175,-0.409796
22,20626359,4,0.101074,-0.370357,0.632406,0.213359
22,258132793,4,-0.794619,-0.879518,-0.726897,0.057900
...,...,...,...,...,...,...
1063759,5404442,3,-0.030344,-0.041435,-0.011450,0.026698
1063992,18332190,3,0.714224,0.359253,0.940507,-0.483659
1064112,18332190,4,-0.161716,-0.324632,-0.016905,-0.058002
1064112,29837073,4,0.478334,0.080313,0.684486,-0.392765
