In [1]:
import astrodetection
import glob
import pandas as pd
import os
import numpy as np

In [None]:
# Folder scanned for "*.json" files by the loading cell below.
# Placeholder value: replace it with the real location before running the notebook.
path = "PATH_TO_FOLDER"

In [3]:
# Read every JSON file found directly under `path` into its own DataFrame,
# keyed by the bare file name (directory part stripped).
df_dict = {}
for json_path in glob.glob(f"{path}/*.json"):
    tweets = pd.read_json(json_path)
    # d3lta compatibility: the index has to hold strings
    tweets.index = tweets.index.astype(str)
    df_dict[os.path.basename(json_path)] = tweets

In [4]:
def preprocess_df(df, min_length=100, excluded_usernames=('grok',)):
    """Filter a tweets frame down to the rows kept for the analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'tweet' column (text) and a 'username' column.
    min_length : int, default 100
        A row is kept only when its tweet has strictly more characters
        than this value.
    excluded_usernames : str or iterable of str, default ('grok',)
        Rows whose 'username' equals one of these values are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) holding the surviving
        rows, with its index converted to strings.
    """
    # A bare string would be rejected by Series.isin, so wrap it.
    if isinstance(excluded_usernames, str):
        excluded_usernames = [excluded_usernames]

    long_enough = df['tweet'].str.len() > min_length
    allowed_author = ~df['username'].isin(list(excluded_usernames))

    # Explicit copy: the index is reassigned right after, and that must never
    # reach back into the caller's frame.
    kept = df.loc[long_enough & allowed_author].copy()
    kept.index = kept.index.astype(str)

    return kept

In [5]:
# Swap every loaded frame for its preprocessed version, keeping the same keys.
df_dict = {
    file_name: preprocess_df(frame)
    for file_name, frame in df_dict.items()
}

In [7]:
# Per-file results, both keyed by the JSON file name.
dict_matches, dict_scores = {}, {}

for n_file, tweets in df_dict.items():
    # The frame's own 'emb' column is handed over as the embeddings.
    prepared, embeddings = astrodetection.prepare_input_data(
        tweets, embeddings=tweets['emb']
    )

    # semantic_faiss is given the text under the column name 'original'.
    renamed = prepared.rename(columns={'tweet': 'original'})
    # The second return value is not used by any cell shown in this notebook.
    matches, clusters = astrodetection.semantic_faiss(
        renamed,
        min_size_txt=0,
        df_embeddings_use=embeddings,
        threshold_grapheme=0.8,
        threshold_language=0.715,
        threshold_semantic=0.9,
    )
    dict_matches[n_file] = matches

    # Metrics are computed from the preprocessed frame itself (not from the
    # output of prepare_input_data) together with the matches found above.
    dict_scores[n_file] = astrodetection.compute_bot_likelihood_metrics(
        tweets, matches=matches
    )

>>> Start prepare_dataset
Done.

Removing 0 short texts over 3464 sentences...
Done.
<<< End prepare_dataset, Took: 9.7231 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0143 sec
>>> Start find_matches


  0%|          | 0/35 [00:00<?, ?it/s]

<<< End find_matches, Took: 3.1344 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 1.2434 sec
>>> Start prepare_dataset
Done.

Removing 0 short texts over 4270 sentences...
Done.
<<< End prepare_dataset, Took: 11.6766 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0118 sec
>>> Start find_matches


  0%|          | 0/43 [00:00<?, ?it/s]

<<< End find_matches, Took: 9.3723 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 4.5779 sec
>>> Start prepare_dataset
Done.

Removing 0 short texts over 3463 sentences...
Done.
<<< End prepare_dataset, Took: 9.9415 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0043 sec
>>> Start find_matches


  0%|          | 0/35 [00:00<?, ?it/s]

<<< End find_matches, Took: 0.7422 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 0.0688 sec
>>> Start prepare_dataset
Done.

Removing 0 short texts over 611 sentences...
Done.
<<< End prepare_dataset, Took: 2.1362 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0017 sec
>>> Start find_matches


  0%|          | 0/7 [00:00<?, ?it/s]

<<< End find_matches, Took: 1.5081 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 0.9705 sec
>>> Start prepare_dataset
Done.

Removing 0 short texts over 1364 sentences...
Done.
<<< End prepare_dataset, Took: 4.4585 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0040 sec
>>> Start find_matches


  0%|          | 0/14 [00:00<?, ?it/s]

<<< End find_matches, Took: 7.8111 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 3.1419 sec
>>> Start prepare_dataset
Done.

Removing 0 short texts over 1406 sentences...
Done.
<<< End prepare_dataset, Took: 4.9047 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0079 sec
>>> Start find_matches


  0%|          | 0/15 [00:00<?, ?it/s]

<<< End find_matches, Took: 1.3183 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 0.7227 sec
>>> Start prepare_dataset
Done.

Removing 0 short texts over 220 sentences...
Done.
<<< End prepare_dataset, Took: 0.6524 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0004 sec
>>> Start find_matches


  0%|          | 0/3 [00:00<?, ?it/s]

<<< End find_matches, Took: 0.2661 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 0.0417 sec
>>> Start prepare_dataset
Done.

Removing 0 short texts over 917 sentences...
Done.
<<< End prepare_dataset, Took: 2.5723 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0011 sec
>>> Start find_matches


  0%|          | 0/10 [00:00<?, ?it/s]

<<< End find_matches, Took: 1.3378 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 0.6104 sec
>>> Start prepare_dataset
Done.

Removing 0 short texts over 2153 sentences...
Done.
<<< End prepare_dataset, Took: 5.2120 sec
>>> Start create_index_cosine
<<< End create_index_cosine, Took: 0.0056 sec
>>> Start find_matches


  0%|          | 0/22 [00:00<?, ?it/s]

<<< End find_matches, Took: 0.6525 sec
>>> Start compute_duplicate_types
<<< End compute_duplicate_types, Took: 0.1280 sec


In [8]:
# orient='index' turns each key of dict_scores (a file name) into one row;
# the columns come from whatever compute_bot_likelihood_metrics returned per file.
df_scores = pd.DataFrame.from_dict(dict_scores, orient='index')

In [9]:
def colora_cella(val, valore_riferimento):
    """Return the CSS background rule for a single score cell.

    Parameters
    ----------
    val : scalar
        The cell value to classify.
    valore_riferimento : scalar
        Reference value the cell is compared against.

    Returns
    -------
    str
        '' for a missing value; salmon when val exceeds twice the reference,
        yellow when it only exceeds the reference, light green otherwise.
    """
    # Missing values stay unstyled.
    if pd.isna(val):
        return ''

    # The checks run from the strictest threshold down; the order matters.
    alert_threshold = 2 * valore_riferimento
    if val > alert_threshold:
        return 'background-color: salmon'
    if val > valore_riferimento:
        return 'background-color: yellow'
    return 'background-color: lightgreen'


# Reference value per metric column: the larger of the scores obtained by the
# two rows 'lundi_semaine.json' and 'OmarSy.json'.
valori_riferimento = {
    column: max(
        df_scores.loc['lundi_semaine.json', column],
        df_scores.loc['OmarSy.json', column],
    )
    for column in df_scores.columns
}

def applica_colorazione(row):
    """Build the list of CSS rules for one row of the scores table.

    Parameters
    ----------
    row : pandas.Series
        One row, iterated as (column name, value) pairs.

    Returns
    -------
    list of str
        One entry per column, in row order. 'number_of_tweets' always gets ''
        (no styling); every other column is coloured by colora_cella against
        that column's entry in valori_riferimento.
    """
    return [
        '' if column == 'number_of_tweets'
        else colora_cella(value, valori_riferimento[column])
        for column, value in row.items()
    ]

# axis=1 makes the Styler call applica_colorazione once per row; the result is
# a Styler wrapping df_scores, not a new DataFrame.
df_scores_colorato = df_scores.style.apply(applica_colorazione, axis=1)

In [10]:
# Bare last expression so the notebook renders the styled scores table.
df_scores_colorato

Unnamed: 0,copypasta_score (%),top_users_post_percent (%),top_users_count,zero_followers_and_following (%),no_image_and_description (%),default_handle_score (%),number_of_tweets
punaises_de_lit.json,9.5,5.66,29,0.06,5.28,8.34,3464
penurie.json,12.95,12.15,25,0.42,1.87,5.48,4270
lundi_semaine.json,4.79,5.31,30,0.09,2.11,7.62,3463
ZFE.json,40.43,11.13,5,0.16,1.8,4.75,611
trogneux_1000.json,23.02,6.16,12,0.22,3.96,12.76,1364
OmarSy.json,9.96,7.75,10,0.21,1.49,4.77,1406
elnet.json,15.45,12.27,2,0.45,4.09,15.45,220
cadmium_long.json,23.56,7.85,7,0.0,2.84,11.78,917
shogun.json,9.57,23.78,10,0.05,0.88,6.18,2153


In [11]:
# Build the network for a single dataset from its matches and its preprocessed
# tweets. NOTE(review): the exact contract of create_network is not visible
# here; the recorded output is a Sigma widget wrapping an nx.DiGraph.
astrodetection.create_network(dict_matches['cadmium_long.json'], df_dict['cadmium_long.json'])

Sigma(nx.DiGraph with 216 nodes and 479 edges)