In [None]:
import os
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import paired_cosine_distances

# Fetch the NLTK 'stopwords' corpus that preprocess_text() reads from.
nltk.download('stopwords')
# Register tqdm's pandas integration so that .progress_apply() is available.
tqdm.pandas()

_ENGLISH_STOP_WORDS = None  # lazily-filled cache used by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Lower-case ``text`` and keep only alphanumeric, non-stop-word tokens.

    Parameters
    ----------
    text : str or None
        Raw text. It is split on whitespace after lower-casing; ``None`` is
        treated as an empty document.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list, which is
        loaded on first use and then reused for every later call.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. A token survives only if
        ``str.isalnum()`` is true for it, so tokens carrying punctuation
        (e.g. ``"hello,"``) are dropped entirely rather than stripped.
    """
    if text is None:
        return ""
    if stop_words is None:
        # Building the stop-word set on every call was the dominant cost when
        # this function is applied row by row; fetch the cached set instead.
        stop_words = _english_stop_words()
    return " ".join(
        token
        for token in text.lower().split()
        if token.isalnum() and token not in stop_words
    )

In [None]:
# Read every entry of SHARD_DIR as a parquet shard; sorting the listing keeps
# the concatenated row order the same from run to run.
SHARD_DIR = 'parallel_shards'
dfs = [pd.read_parquet(os.path.join(SHARD_DIR, f)) for f in sorted(os.listdir(SHARD_DIR))]
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each shard's own index labels.
df = pd.concat(dfs, ignore_index=True)

# Self-join on (username, url): every pair of rows sharing both keys.
# NOTE(review): the self-merge carries every non-key column twice (_x / _y);
# if memory becomes a problem, merge only the columns that are actually needed.
df_pairs = df.merge(df, on=['username', 'url'], how='inner')
# id_x < id_y keeps each unordered pair once and drops rows paired with
# themselves. .copy() makes the result an independent frame so that adding
# columns to it later does not raise SettingWithCopyWarning.
df_pairs = df_pairs[df_pairs['id_x'] < df_pairs['id_y']].copy()

# Translate tweet ids into positional row numbers of `df`, so the pairs can be
# used to index any matrix whose rows follow the order of `df`.
idx_map = {v: i for i, v in enumerate(df['id'])}
xidx = df_pairs['id_x'].map(idx_map)
yidx = df_pairs['id_y'].map(idx_map)

In [None]:
# Cap on how many tweets a single user contributes to the TF-IDF fit, and the
# seed that makes the per-user sample (and therefore the fitted vocabulary and
# IDF weights) reproducible across runs.
MAX_TWEETS_PER_USER = 300
SAMPLE_SEED = 42

# Draw at most MAX_TWEETS_PER_USER rows per username; users with fewer rows
# contribute all of them.
tweet_sample = df.groupby('username', group_keys=False).apply(
    lambda user_tweets: user_tweets.sample(
        min(len(user_tweets), MAX_TWEETS_PER_USER),
        random_state=SAMPLE_SEED,
    )
)
preprocessed_tweets = tweet_sample['clean_tweet'].progress_apply(preprocess_text)

# Fit the vectorizer on the capped sample only; tfidf_matrix holds the
# sample's own TF-IDF rows.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_tweets)

In [None]:
# Vectorize every tweet in `df` with the already-fitted vectorizer. Rows of
# `text` follow the row order of `df`, which is what xidx / yidx index into.
preprocessed_all = df['clean_tweet'].progress_apply(preprocess_text)
text = vectorizer.transform(preprocessed_all)
x, y = text[xidx], text[yidx]

# paired_cosine_distances returns cosine *distance* (1 - cosine similarity).
# The column is named "similarity", so convert before storing it.
# NOTE(review): a tweet that preprocesses to an empty string gets an all-zero
# TF-IDF row; confirm how pairs involving such tweets should be scored.
# .assign builds a new frame instead of writing into the filtered one, which
# avoids SettingWithCopyWarning.
df_pairs = df_pairs.assign(tfidf_similarity=1.0 - paired_cosine_distances(x, y))
df_pairs[['tfidf_similarity', 'id_x', 'id_y', 'url', 'username']].to_parquet('tfidf.parquet')