In [62]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model and tokenizer
model_name = "svalabs/twitter-xlm-roberta-crypto-spam"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [21]:
import torch.nn.functional as F

def is_spam(tweet):
    """Returns True if the tweet is classified as spam, otherwise False."""
    inputs = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to GPU if available

    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)  # Convert logits to probabilities

    spam_prob = probs[0][1].item()  # Probability of spam class
    return spam_prob > 0.5  # Threshold (adjust if needed)

In [4]:
from tweetnlp import Emotion
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, first, count, lit, date_format, to_timestamp, input_file_name, regexp_replace, year, to_date, pandas_udf, rand
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, BooleanType, ArrayType, LongType
from tqdm import tqdm   # Library for making progress bars
import pandas as pd
import fasttext

2025-03-19 16:52:48,992	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-03-19 16:52:49,513	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [5]:
spark = SparkSession.builder \
    .appName("LanguageDetection") \
    .config("spark.executor.heartbeatInterval", "60s") \
    .config("spark.network.timeout", "600s") \
    .getOrCreate()

initialDF = spark.read.option("recursiveFileLookup", "true").csv(
    "10_000_tweet_subset.csv", 
    header=True,  # Use the first row as column names
    inferSchema=True,  # Infer data types
    multiLine=True,  # Handle newlines within fields
    escape='\\',  # Escape character for double quotes
    quote='"',  # Define the quote character
    sep=",",  # Specify the correct delimiter
    mode="PERMISSIVE"  # Handle malformed rows gracefully
)

In [6]:
initialDF.count()

10000

In [40]:
df = initialDF.toPandas()

In [41]:
fasttext_model = fasttext.load_model("lid.176.ftz")

def is_english(text):
    # fastText returns predictions with a label like '__label__en'
    labels, _ = fasttext_model.predict(text)
    return labels[0] == '__label__en'



In [42]:
tqdm.pandas(mininterval=1.0)  # Enable progress bar for Pandas

In [44]:
df["text"] = df["text"].str.replace("\n", " ")

In [45]:
df = df[df["text"].progress_apply(is_english)]

100%|██████████| 10000/10000 [00:00<00:00, 19660.90it/s]


In [46]:
len(df)

8189

In [None]:
from transformers import pipeline

# Load spam detection model
spam_classifier = pipeline(
    "text-classification",
    model="mrm8488/bert-tiny-finetuned-sms-spam-detection",
    tokenizer="mrm8488/bert-tiny-finetuned-sms-spam-detection",
    truncation=True,  # Ensures text is cut to fit within 512 tokens
    max_length=512     # Explicitly sets the max token length
)

In [56]:
df['text'].head(100).tolist()

['# Bitcoin is becoming a nightmare for divorce lawyers http://j.mp/2o5QyOnpic.twitter.com/hPHdKOMvpv',
 'Altcoins See Red as Bitcoin Capitulation Drags Down Market https://t.co/YLetpV6ChA',
 "@Crypto_DashXRP This is where we have to be subjective to information we receive. Because they only want to cherry pick one point in time to justify their article. Look at the big picture and see if you see any correlation over the span of BTC's history with DJI and SPX. Which is almost none. https://t.co/Wp4qLnO61L",
 '#BTC #BTCFX #XRP #XEM #BNB #ETH #LTC #MONA #LSK #FCT #BCH #WAVES #ADA #TRX #BTCB #BCHABC #DASH #XMR #ZEC #DCR #DGD #BTG #REP #Libra #NEO #XZC #ZEN #ETC #EOS #MCO #ATOM #QTUM #LINK #HC #GAS #GXS #WTC #GVT #OMG  こちらのURLから登録で取引手数料6ヶ月間10%割! https://t.co/vDVXxogr6W https://t.co/oRsQ0ot8WA',
 '$GXS. Push! Keep on rising! $BTC market on #Binance. Current Price: Ƀ 0.00019160 Sharing = Pushing!',
 '@PENTA_live https://t.co/h6nsurCNJQ #wonderland100m #Bitcoin #Ethereum #crytocurrency #MATIC

In [74]:
# For testing to go faster I will get a smaller df
df_300 = df.head(300)
df_300

Unnamed: 0,date,id,text,hashtags,is_retweet,retweets,likes,replies,language,user_name,user_id,user_location,prediction
2,2018-02-16 17:09:00,964532112878551040,# Bitcoin is becoming a nightmare for divorce ...,,,7.0,15.0,,,,,,LABEL_1
3,2019-11-23 00:11:45,1198016243963432960,Altcoins See Red as Bitcoin Capitulation Drags...,,,0.0,0.0,0.0,,Penny Crypto Club,PennyCryptoClub,,LABEL_1
4,2019-11-17 22:10:50,1196173874674552832,@Crypto_DashXRP This is where we have to be su...,,,0.0,0.0,0.0,,Cid Vicious,CidVicious2,,LABEL_1
6,2019-11-01 11:41:46,1190217360285691904,#BTC #BTCFX #XRP #XEM #BNB #ETH #LTC #MONA #LS...,,,0.0,0.0,0.0,,仮想都市,458ny,,LABEL_1
7,2019-06-30 00:30:35,1145097255667396608,$GXS. Push! Keep on rising! $BTC market on #Bi...,,,0.0,0.0,0.0,,The Pump Advisor,thepumpadvisor,,LABEL_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,2018-01-16 21:13:00,953359509547872257,Is Global Front on Bitcoin Regulation Possible...,,,0.0,0.0,,,,,,LABEL_1
349,2019-07-09 13:19:45,1148552313201545216,@CCNMarkets #securypto so it's time to conside...,,,0.0,0.0,0.0,,13,mr_name13,,LABEL_0
350,2017-11-23 17:29:00,933734168432869376,Could Milton Friedmans Least Bad Tax Work for ...,,,1.0,0.0,,,,,,LABEL_1
351,2019-05-10 12:19:11,1126823998971699200,New to Elliott Wave ? Then get started and rea...,,,0.0,0.0,0.0,,AlienOvichO,AlienOvichO,,LABEL_1


In [78]:
# Classify tweets
df_300["prediction"] = df_300["text"].progress_apply(lambda x: spam_classifier(x)[0]["label"])

# Keep only non-spam tweets
df_filtered = df_300[df_300["prediction"] == "LABEL_0"].drop(columns=["prediction"])
df_spam = df_300[df_300["prediction"] == "LABEL_1"].drop(columns=["prediction"])

100%|██████████| 300/300 [00:01<00:00, 194.97it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_300["prediction"] = df_300["text"].progress_apply(lambda x: spam_classifier(x)[0]["label"])


In [79]:
df_300

Unnamed: 0,date,id,text,hashtags,is_retweet,retweets,likes,replies,language,user_name,user_id,user_location,prediction
2,2018-02-16 17:09:00,964532112878551040,# Bitcoin is becoming a nightmare for divorce ...,,,7.0,15.0,,,,,,LABEL_1
3,2019-11-23 00:11:45,1198016243963432960,Altcoins See Red as Bitcoin Capitulation Drags...,,,0.0,0.0,0.0,,Penny Crypto Club,PennyCryptoClub,,LABEL_1
4,2019-11-17 22:10:50,1196173874674552832,@Crypto_DashXRP This is where we have to be su...,,,0.0,0.0,0.0,,Cid Vicious,CidVicious2,,LABEL_1
6,2019-11-01 11:41:46,1190217360285691904,#BTC #BTCFX #XRP #XEM #BNB #ETH #LTC #MONA #LS...,,,0.0,0.0,0.0,,仮想都市,458ny,,LABEL_1
7,2019-06-30 00:30:35,1145097255667396608,$GXS. Push! Keep on rising! $BTC market on #Bi...,,,0.0,0.0,0.0,,The Pump Advisor,thepumpadvisor,,LABEL_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,2018-01-16 21:13:00,953359509547872257,Is Global Front on Bitcoin Regulation Possible...,,,0.0,0.0,,,,,,LABEL_1
349,2019-07-09 13:19:45,1148552313201545216,@CCNMarkets #securypto so it's time to conside...,,,0.0,0.0,0.0,,13,mr_name13,,LABEL_0
350,2017-11-23 17:29:00,933734168432869376,Could Milton Friedmans Least Bad Tax Work for ...,,,1.0,0.0,,,,,,LABEL_1
351,2019-05-10 12:19:11,1126823998971699200,New to Elliott Wave ? Then get started and rea...,,,0.0,0.0,0.0,,AlienOvichO,AlienOvichO,,LABEL_1


In [82]:
df_filtered['text'].head(100).tolist()

['Any cryptocurrency experts out there? Have I missed the # Bitcoin bubble?',
 'Y is it @ EmilyKDrewry # Forbes is starting to Sound more like # EntertainmentTonight ? @ Forbes Regular Misporrayal of # Bitcoin # PonziSch',
 'Crypto Twitter is just a circlejerk about Bitcoin.',
 '@aeternity @Dieter75 @giacomozucco @CraigRood @asanso but all these arguments don\'t really matter bc what matters is the price&amp;your shitcoin is currently making new all time lows every single day&amp;there is no other reality than this. you already stole enough btc from your "investors".now it\'s time for your shitcoin to take a good nap',
 '@JoshMcGruff Why Bitcoin Cash chose BCH instead of BCC - as originally planned',
 '@deezy_BTC Sol is shit outta luck',
 "@tysontrades I remember in 2013 people said how ridiculous $1000 a bitcoin was. Could never happen blablabla... I'm not delusional but not trying to predict the future either.",
 'orly 100% has gays that "honestly, work" her when she proposes somethi

In [83]:
df_spam['text'].head(100).tolist()

['# Bitcoin is becoming a nightmare for divorce lawyers http://j.mp/2o5QyOnpic.twitter.com/hPHdKOMvpv',
 'Altcoins See Red as Bitcoin Capitulation Drags Down Market https://t.co/YLetpV6ChA',
 "@Crypto_DashXRP This is where we have to be subjective to information we receive. Because they only want to cherry pick one point in time to justify their article. Look at the big picture and see if you see any correlation over the span of BTC's history with DJI and SPX. Which is almost none. https://t.co/Wp4qLnO61L",
 '#BTC #BTCFX #XRP #XEM #BNB #ETH #LTC #MONA #LSK #FCT #BCH #WAVES #ADA #TRX #BTCB #BCHABC #DASH #XMR #ZEC #DCR #DGD #BTG #REP #Libra #NEO #XZC #ZEN #ETC #EOS #MCO #ATOM #QTUM #LINK #HC #GAS #GXS #WTC #GVT #OMG  こちらのURLから登録で取引手数料6ヶ月間10%割! https://t.co/vDVXxogr6W https://t.co/oRsQ0ot8WA',
 '$GXS. Push! Keep on rising! $BTC market on #Binance. Current Price: Ƀ 0.00019160 Sharing = Pushing!',
 '@PENTA_live https://t.co/h6nsurCNJQ #wonderland100m #Bitcoin #Ethereum #crytocurrency #MATIC

In [None]:
# Apply spam detection
df_300["is_spam"] = df_300["text"].progress_apply(is_spam)

# Keep only non-spam tweets
df_filtered = df_300[df_300["is_spam"] == False].drop(columns=["is_spam"])
df_spam = df_300[df_300["is_spam"] == True].drop(columns=["is_spam"])

# Save the cleaned dataset
df_filtered.to_csv("filtered_tweets.csv", index=False)

100%|██████████| 300/300 [00:30<00:00,  9.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_300["is_spam"] = df_300["text"].progress_apply(is_spam)


In [50]:
df_filtered['text'].head(100).tolist()  # Show 10 rows without truncation

['Altcoins See Red as Bitcoin Capitulation Drags Down Market https://t.co/YLetpV6ChA',
 "@Crypto_DashXRP This is where we have to be subjective to information we receive. Because they only want to cherry pick one point in time to justify their article. Look at the big picture and see if you see any correlation over the span of BTC's history with DJI and SPX. Which is almost none. https://t.co/Wp4qLnO61L",
 '@PENTA_live https://t.co/h6nsurCNJQ #wonderland100m #Bitcoin #Ethereum #crytocurrency #MATIC #Christmas #WorkFromHome #ElonMusk #ShibainuCoin #shiba #shibainu #Eternals #love #finance #FightForVotingRights #prius #TheYearOfGIFs #marchand #durk #prius #twitchstreamers #adam cole #AEWDyanmite',
 '19-year-old bitcoin millionaire: Heres how much you should invest in c http://zentrade.online/19-year-old-bitcoin-millionaire-heres-how-much',
 'With #BlueSparrow, we are building the Web3 system of tomorrow that is more efficient, faster, more rewarding, and levels the playing field for ever

In [51]:
df_spam['text'].head(100).tolist()  # Show 10 rows without truncation

['# Bitcoin is becoming a nightmare for divorce lawyers http://j.mp/2o5QyOnpic.twitter.com/hPHdKOMvpv',
 '#BTC #BTCFX #XRP #XEM #BNB #ETH #LTC #MONA #LSK #FCT #BCH #WAVES #ADA #TRX #BTCB #BCHABC #DASH #XMR #ZEC #DCR #DGD #BTG #REP #Libra #NEO #XZC #ZEN #ETC #EOS #MCO #ATOM #QTUM #LINK #HC #GAS #GXS #WTC #GVT #OMG  こちらのURLから登録で取引手数料6ヶ月間10%割! https://t.co/vDVXxogr6W https://t.co/oRsQ0ot8WA',
 '$GXS. Push! Keep on rising! $BTC market on #Binance. Current Price: Ƀ 0.00019160 Sharing = Pushing!',
 '#Drife #Crypto https://t.co/kigAY37K13',
 'Any cryptocurrency experts out there? Have I missed the # Bitcoin bubble?',
 'Want to Change your life Join Now  https://t.co/2NuQB72yfM  #BITMEX #BTC #ADA #XRP #BCH #EOS #X10 #X20 $RCN $POE $BCN $LOOM $QTUM $SNM $AGI $WINGS $AION $DATA $DNT $OMG $XEM $VIB $ZEN $INS $SC $MDA $STORM',
 '5 Reasons Bitcoin (BTC) Price Has Dropped Below $8,000 https://t.co/x7Ckir4XZX',
 'Y is it @ EmilyKDrewry # Forbes is starting to Sound more like # EntertainmentTonight ? 