# Notebook to create temporal similarity based on the data in the MongoDB database

### Handle imports

In [None]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import datetime
from matplotlib import pyplot as plt
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from dtaidistance import dtw

from tqdm.notebook import tqdm

### Function to extract relevant users from the clusters in the textClust mongoDB database

In [None]:
def extract_relevant_users_from_clusters(source_uuid, cluster_id, timestamp):
    """Return the distinct users whose tweets belong to a textClust cluster.

    The newest document for ``cluster_id`` is read from the
    ``mc_<source_uuid>`` collection; the tweets it references are then looked
    up in ``texts_<source_uuid>`` and their embedded user objects collected.

    Args:
        source_uuid: Suffix of the MongoDB collection names to query.
        cluster_id: Value of the ``id`` field of the cluster document.
        timestamp: Upper time bound formatted as ``"%Y-%m-%dT%H:%M:%S"``;
            only tweets at or before this time are considered.

    Returns:
        A ``pandas.DataFrame`` with one row per user, de-duplicated on
        ``id_str`` and built from at most the 5000 newest matching tweets.
        An empty DataFrame is returned when the cluster does not exist or no
        tweet matches.

    Raises:
        ValueError: If ``timestamp`` does not match the expected format.
    """
    # The file imports the ``datetime`` *module*, so the class has to be
    # addressed as ``datetime.datetime``.
    upper_bound = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")

    # The context manager closes the client once the cursor is consumed.
    with MongoClient("mongodb://localhost:27017/") as connection:
        db = connection.textclustDB

        # Newest document of the cluster; only its tweet ids are needed.
        cluster = db[f"mc_{source_uuid}"].find_one(
            {"id": cluster_id},
            sort=[("timestamp", -1)],
            projection={"_id": 0, "textids": 1},
        )
        if cluster is None or not cluster.get("textids"):
            return pd.DataFrame()

        # ``general.time`` is compared against both a datetime and a string
        # -- presumably the collection stores both representations (TODO
        # confirm against the ingestion code).
        cursor = db[f"texts_{source_uuid}"].find(
            {
                "$and": [
                    {"general.text_id": {"$in": cluster["textids"]}},
                    {
                        "$or": [
                            {"general.time": {"$lte": upper_bound}},
                            {"general.time": {"$lte": timestamp.replace("T", " ")}},
                        ]
                    },
                ]
            },
            sort=[("general.time", -1)],
            projection={
                "_id": 0,
                "user": "$specific.user",
            },
        ).limit(5000)
        user_records = [doc["user"] for doc in cursor]

    users = pd.DataFrame(user_records)
    if users.empty:
        # Nothing to de-duplicate; avoids a KeyError on the missing column.
        return users
    return users.drop_duplicates(["id_str"], ignore_index=True)


In [None]:
# Users of cluster 52525 as of the given point in time.
users = extract_relevant_users_from_clusters(
    source_uuid="8273444c-abdd-4410-829a-970846ebd00e",
    cluster_id=52525,
    timestamp="2022-02-25T22:41:49",
)

## Approach to use the MongoDB database to acquire the tweets

### Load last tweets of the users in the cluster from the MongoDB database

In [None]:
def extract_tweets_per_user(source_uuid, user, timestamp):
    """Load the stored tweets of one user up to ``timestamp``.

    Args:
        source_uuid: Suffix of the ``texts_<source_uuid>`` collection.
        user: Mapping-like object (e.g. a DataFrame row) whose ``'id'`` entry
            is matched against ``specific.user.id``.
        timestamp: Upper time bound formatted as ``"%Y-%m-%dT%H:%M:%S"``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``user_screen_name``,
        ``user_id``, ``id``, ``text`` and ``created_at``; it has no columns
        when no tweet matches.

    Raises:
        ValueError: If ``timestamp`` does not match the expected format.
    """
    # The file imports the ``datetime`` *module*, so the class has to be
    # addressed as ``datetime.datetime``.
    upper_bound = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")

    # The context manager closes the client once the cursor is consumed.
    with MongoClient("mongodb://localhost:27017/") as connection:
        collection = connection.textclustDB[f"texts_{source_uuid}"]
        # ``general.time`` is compared against both a datetime and a string
        # -- presumably the collection stores both representations (TODO
        # confirm against the ingestion code).
        cursor = collection.find(
            {
                "$and": [
                    {"specific.user.id": user['id']},
                    {
                        "$or": [
                            {"general.time": {"$lte": upper_bound}},
                            {"general.time": {"$lte": timestamp.replace("T", " ")}},
                        ]
                    },
                ]
            },
            projection={
                "_id": 0,
                "user_screen_name": "$specific.user.screen_name",
                "user_id": "$specific.user.id_str",
                "id": "$specific.user.id",
                "text": "$general.text",
                "created_at": "$general.time",
            },
        )
        records = list(cursor)

    return pd.DataFrame(records)

### Execute the method for every user

In [None]:
# Fetch the tweets of every user first and concatenate once at the end;
# concatenating inside the loop would copy the growing frame on each pass.
user_frames = [
    extract_tweets_per_user("8273444c-abdd-4410-829a-970846ebd00e", user, "2022-02-25T22:41:49")
    for _, user in tqdm(users.iterrows(), total=len(users))
]

# The leading empty frame keeps the expected columns even when no user
# (or no tweet) was found.
tweets = pd.concat(
    [
        pd.DataFrame(columns=["user_screen_name", "user_id", "id", "text", "created_at"]),
        *user_frames,
    ],
    ignore_index=True,
)

### Adapt timestamp types

In [None]:

# Parse ``created_at`` into pandas datetimes and truncate to the full minute.
# NOTE(review): assumes all values are timezone-naive or share one timezone.
tweets['created_at'] = pd.to_datetime(tweets['created_at']).dt.floor('min')

### Create 24 hour time frame based on the date of the newest tweet

In [None]:
# ``start`` is the newest tweet; ``end`` lies exactly one day before it.
# Only tweets strictly newer than ``end`` are kept.
start = tweets["created_at"].max()
end = start - pd.Timedelta(days=1)
tweets = tweets.loc[tweets["created_at"] > end]

### Create a DataFrame representing the user behavior as a time series with 1-minute steps

In [None]:
def create_tweet_time_series_for_user(user_df, end_timestamp, start_timestamp):
    """Count tweets per user in consecutive one-minute bins.

    Args:
        user_df: DataFrame with a ``user_screen_name`` column and a datetime
            ``created_at`` column.
        end_timestamp: Newest timestamp to cover (note that it is passed
            *before* ``start_timestamp``).
        start_timestamp: Oldest timestamp to cover.

    Returns:
        A DataFrame indexed by ``user_screen_name`` with one column per
        minute bin (labelled ``0 .. n-1``, oldest first) holding the number
        of tweets of that user in that minute. Both the first and the last
        minute are included.
    """
    end = end_timestamp.replace(second=0, microsecond=0)
    start = start_timestamp.replace(second=0, microsecond=0)

    # Bins are half-open ``[t, t + 1min)``. One extra edge past ``end`` is
    # needed so tweets in the final minute land in a bin instead of being
    # dropped; it also makes the ``start == end`` case work.
    bin_edges = pd.date_range(start=start, end=end + pd.Timedelta(minutes=1), freq='1min')
    bins = pd.cut(
        user_df['created_at'],
        bins=bin_edges,
        right=False,
        labels=list(range(len(bin_edges) - 1)),
    )

    # ``observed=False`` keeps empty minutes as columns so every user gets a
    # series of the same length.
    groups = user_df.groupby(['user_screen_name', bins], observed=False)
    return groups.size().unstack(fill_value=0)

In [None]:
# Per-user tweet counts over the whole span covered by ``tweets``.
time_series = create_tweet_time_series_for_user(
    user_df=tweets,
    end_timestamp=tweets["created_at"].max(),
    start_timestamp=tweets["created_at"].min(),
)

# Keep only users with at least 10 tweets in the one-day window.
# Users with only a few tweets would get a low distance to other users,
# as there are not many warping operations needed.
time_series = time_series.loc[time_series.sum(axis=1) > 9]

### Create similarity matrix for users with DTW

In [None]:
def calculate_dtw_distance(x, y):
    """Return the DTW distance between two series.

    Both inputs are cast to ``double`` and passed to ``dtw.distance`` with
    ``window=2`` and ``use_c=True``.
    """
    first_series = x.astype('double')
    second_series = y.astype('double')
    return dtw.distance(first_series, second_series, window=2, use_c=True)

In [None]:
# Pairwise DTW distances between all users: ``pdist`` yields the condensed
# form, ``squareform`` expands it into a square matrix labelled by user.
result = pd.DataFrame(
    squareform(pdist(time_series, metric=calculate_dtw_distance)),
    index=time_series.index.values,
    columns=time_series.index.values,
)

### Transform distances into similarities

In [None]:
# Work on an explicit copy: writing through ``result.values`` relies on it
# being a writable view of the frame.
distances = result.to_numpy(dtype=float, copy=True)
max_distance = distances.max()

# Put the maximum on the diagonal so that every user's similarity to
# itself becomes 0 below.
np.fill_diagonal(distances, max_distance)
result = pd.DataFrame(distances, index=result.index, columns=result.columns)

if max_distance > 0:
    similarity = 1 - result / max_distance
else:
    # All series are identical: avoid 0/0 and treat every distinct pair as
    # fully similar, keeping the zero diagonal of the regular case.
    similarity = pd.DataFrame(
        1.0 - np.eye(len(result)), index=result.index, columns=result.columns
    )


### Plot the time series of individual users in the DataFrame

In [None]:
# Plot one user's per-minute tweet counts and label the x-axis of the
# resulting axes.
time_series.loc['XXX'].plot(
    figsize=(8, 5),
    xlabel="Time bins in minutes",
    fontsize=14,
).set_xlabel('Time bins in minutes', fontsize=14)

In [None]:
# Plot one user's per-minute tweet counts and label the x-axis of the
# resulting axes.
time_series.loc['XXX'].plot(
    figsize=(8, 5),
    xlabel="Time bins in minutes",
    fontsize=14,
).set_xlabel('Time bins in minutes', fontsize=14)

### Create similarity graph

In [None]:
# Full weighted graph: one node per user, edge weights taken from the
# similarity matrix.
G = nx.from_pandas_adjacency(similarity)

# Filtered copy: drop every edge whose weight is below the threshold, then
# remove the nodes that are left without any edge.
threshold = 0.9
F = G.copy()
F.remove_edges_from(
    [(u, v) for u, v, weight in F.edges(data="weight") if weight < threshold]
)
F.remove_nodes_from(list(nx.isolates(F)))

# Draw the filtered graph on a large canvas.
fig = plt.figure(num=1, figsize=(30, 20), dpi=60)
nx.draw(F, with_labels=True, font_size=24, node_size=1000)
plt.show()


### Sample random edge

This is useful for picking two connected users in the graph so that their profiles can be inspected manually.

In [None]:
import random

# ``random.sample`` requires a sequence, so the edge view of ``F`` is
# materialised into a list before sampling one edge.
random.sample(list(F.edges()), 1)