In [30]:
import pandas as pd
from datasketch import MinHash, MinHashLSH
import re
import csv
import string
from tqdm import tqdm
tqdm.pandas(mininterval=1.0)
from TweetNormalizer import normalizeTweet
import spacy


In [31]:
df = pd.read_csv('sample_200k-130k_text.csv')  # Assuming the texts are in a column named 'text'
original_df = pd.read_csv('sample_200k-130k_full.csv')
# Check the first few rows to inspect the data and find the column with texts
df["text"].to_list()

['📊Interesting chart: we can clearly see how long-term holders been accumulating Bitcoins during the Summer-Autumn dump.   At the moment total supply held by holders is at almost All Time High.#Btc #100xgem #cryptocurrencies #HODL #ToTheMoon https://t.co/NusqceFbnv',
 "Bitcoin's 'Elon Musk pump' rally to $48K was exclusively driven by whales https://t.co/F6zGnXnBh0 Laranjal do Elon",
 "#Bitcoin over $50K this morning...is this the #DeadCatBounce that will go parabolic? Yesterday, $SQ announced they purchased Bitcoin...did #TSLA buy more in the 40s too? I continue to be bullish on Bitcoin which is why I'm bullish on $MARA, $RIOT, $MSTR, and $TSLA.",
 'Who cares about China’s ban on #Bitcoin.   Can’t ban crypto - moving on....',
 'bitcoin tries to take direction again. ⬆️⬇️ I think after some correction it will be up again. $btc #btc 💬',
 'The # Skyllz crew is creating the blockchain based and distributed skill validation platform to showcase track and boost human talents acro',
 'Saying

In [32]:
df["text"] = df["text"].progress_apply(normalizeTweet)

100%|██████████| 129991/129991 [00:29<00:00, 4450.16it/s]


In [33]:
df["text"].to_list()

[':bar_chart: Interesting chart : we can clearly see how long-term holders been accumulating Bitcoins during the Summer-Autumn dump . At the moment total supply held by holders is at almost All Time High . #Btc #100xgem #cryptocurrencies #HODL #ToTheMoon HTTPURL',
 "Bitcoin 's ' Elon Musk pump ' rally to $ 48K was exclusively driven by whales HTTPURL Laranjal do Elon",
 "#Bitcoin over $ 50K this morning ... is this the #DeadCatBounce that will go parabolic ? Yesterday , $ SQ announced they purchased Bitcoin ... did #TSLA buy more in the 40s too ? I continue to be bullish on Bitcoin which is why I 'm bullish on $ MARA , $ RIOT , $ MSTR , and $ TSLA .",
 "Who cares about China 's ban on #Bitcoin . Ca n't ban crypto - moving on ...",
 'bitcoin tries to take direction again . :up_arrow: :down_arrow: I think after some correction it will be up again . $ btc #btc :speech_balloon:',
 'The # Skyllz crew is creating the blockchain based and distributed skill validation platform to showcase trac

In [34]:
def preprocess(text):
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text.lower()  # Convert text to lowercase

In [35]:
df['text'] = df['text'].apply(preprocess)  # Preprocess all texts

In [36]:
df["text"].to_list()

['barchart interesting chart  we can clearly see how longterm holders been accumulating bitcoins during the summerautumn dump  at the moment total supply held by holders is at almost all time high  btc 100xgem cryptocurrencies hodl tothemoon httpurl',
 'bitcoin s  elon musk pump  rally to  48k was exclusively driven by whales httpurl laranjal do elon',
 'bitcoin over  50k this morning  is this the deadcatbounce that will go parabolic  yesterday   sq announced they purchased bitcoin  did tsla buy more in the 40s too  i continue to be bullish on bitcoin which is why i m bullish on  mara   riot   mstr  and  tsla ',
 'who cares about china s ban on bitcoin  ca nt ban crypto  moving on ',
 'bitcoin tries to take direction again  uparrow downarrow i think after some correction it will be up again   btc btc speechballoon',
 'the  skyllz crew is creating the blockchain based and distributed skill validation platform to showcase track and boost human talents acro',
 'saying bitcoin is nt real b

In [37]:
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase and remove extra spaces
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    return text

In [38]:
df['text'] = df['text'].apply(preprocess_text)  # Preprocess all texts

In [39]:
# Load the spaCy model (make sure to install it first: python -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm")

In [40]:
def nlp_preprocess(text):
    doc = nlp(text)

    # Remove tokens that are recognized as dates, times, or cardinal numbers
    filtered_tokens = [token.text for token in doc if token.ent_type_ not in {"DATE", "TIME", "CARDINAL", "MONEY", "PERCENT", "QUANTITY", "ORDINAL"}]

    # Reconstruct the cleaned text
    return " ".join(filtered_tokens)

In [41]:
df['text'] = df['text'].progress_apply(nlp_preprocess)  # Preprocess all texts

100%|██████████| 129991/129991 [13:00<00:00, 166.62it/s]


In [42]:
df["text"].to_list()

['barchart interesting chart we can clearly see how longterm holders been accumulating bitcoins during the summerautumn dump at the moment total supply held by holders is at almost all time high btc xgem cryptocurrencies hodl tothemoon httpurl',
 'bitcoin s elon musk pump rally to k was exclusively driven by whales httpurl laranjal do elon',
 'bitcoin over k is this the deadcatbounce that will go parabolic sq announced they purchased bitcoin did tsla buy more in the s too i continue to be bullish on bitcoin which is why i m bullish on mara riot mstr and tsla',
 'who cares about china s ban on bitcoin ca nt ban crypto moving on',
 'bitcoin tries to take direction again uparrow downarrow i think after some correction it will be up again btc btc speechballoon',
 'the skyllz crew is creating the blockchain based and distributed skill validation platform to showcase track and boost human talents acro',
 'saying bitcoin is nt real because it s not tangible is like saying the internet is nt r

In [46]:
def find_similar_tweets(output_path, threshold=0.7, num_perm=128):
    """Find and remove similar tweets from a CSV file"""
    
    texts = df['text'].tolist()
    
    # Create LSH index
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    
    # Create MinHash objects and index them
    minhashes = []
    for idx, text in enumerate(texts):
        # Preprocess text
        cleaned = preprocess_text(text)
        
        # Create MinHash
        m = MinHash(num_perm=num_perm)
        # Use 3-grams to capture phrases
        tokens = cleaned.split()
        for i in range(len(tokens) - 2):
            m.update(' '.join(tokens[i:i+3]).encode('utf-8'))
        
        # Store MinHash and add to LSH
        minhashes.append(m)
        lsh.insert(idx, m)
    
    # Find duplicates
    duplicates = set()
    for idx in range(len(texts)):
        # Find similar items
        result = lsh.query(minhashes[idx])
        # Remove self-match and keep first occurrence
        matches = [i for i in result if i != idx]
        if matches:
            duplicates.update(matches)
    
    # Filter out duplicates, keeping first occurrences
    unique_indices = [i for i in range(len(texts)) if i not in duplicates]
    unique_df = original_df.iloc[unique_indices]
    
    # Save results
    unique_df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)
    print(f"Original count: {len(original_df)}")
    print(f"Unique count: {len(unique_df)}")
    print(f"Removed duplicates: {len(original_df) - len(unique_df)}")

In [47]:
find_similar_tweets('sample_200k-130k_duplicates2.csv', threshold=0.9)

KeyboardInterrupt: 

In [45]:
Original count: 129991
Unique count: 109242
Removed duplicates: 20749

SyntaxError: invalid syntax (2807752422.py, line 1)

In [None]:
filename = 'sample_200k-130k_duplicates2.csv'
unsorted_df = pd.read_csv(filename)
unsorted_df = unsorted_df.reset_index(drop=True)
unsorted_df["id"] = unsorted_df.index
unsorted_df.to_csv(filename, index=False)