In [None]:
!pip install datasets spacy nltk --quiet
!python -m spacy download en_core_web_sm


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m358.4/491.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import re
import spacy
import nltk
from nltk.corpus import stopwords
from string import punctuation
from datasets import load_dataset
from collections import Counter
import json


# Fetch the NLTK stop-word corpus used for filtering below.
nltk.download("stopwords")
# Only lemmas and token flags are read from the spaCy pipeline in this
# notebook, so the dependency parser and the named-entity recogniser are
# switched off: they do not influence lemmatisation and account for a large
# share of the per-document processing time.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# A set gives constant-time membership tests inside the token filter.
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocess_text(text):
    """Turn a raw sentence into a list of lemmatised, filtered tokens.

    The text is lower-cased, URLs are stripped, every character that is not
    a-z or whitespace is removed, and the result is run through the
    module-level spaCy pipeline ``nlp``. The lemma of a token is kept when
    the token is alphabetic and its surface form is neither in
    ``stop_words`` nor contained in ``string.punctuation``.

    Args:
        text: Raw input. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        A list of lemma strings; empty when the input is missing or nothing
        survives the filters.
    """
    # Missing values would otherwise be stringified to "none" / "nan" and
    # leak into the vocabulary as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()
    text = re.sub(r'https?://\S+', '', text)         # removing URLs
    text = re.sub(r'[^a-z\s]', '', text)             # keeping only a-z
    doc = nlp(text)
    # Iterate over `doc` (the parsed document bound above); the previous
    # code referenced an undefined name `docs`.
    return [
        token.lemma_ for token in doc
        if token.is_alpha and token.text not in stop_words and token.text not in punctuation
    ]


In [None]:

# Load the local CSV and tokenise its "Sentence" column with preprocess_text.
df = pd.read_csv("data.csv")
df["tokens"] = df["Sentence"].apply(preprocess_text)

print(f"✅ Your CSV: {len(df)} rows")
# A bare expression is only rendered when it is the last statement of a cell;
# in the middle of a cell it is silently discarded, so the preview is shown
# explicitly. `display` is provided by the IPython kernel.
display(df[["Sentence", "tokens"]].head())


# Persist the tokenised frame as a list of JSON records.
json_data = df.to_json(orient="records", indent=2)
with open("preprocessed_data.json", "w") as f:
    f.write(json_data)

print("✅ JSON file saved as 'preprocessed_data.json'")


✅ Your CSV: 5842 rows
✅ JSON file saved as 'preprocessed_data.json'


In [None]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

# Pull the stock-market tweets dataset from the Hugging Face Hub and work on
# its "train" split.
ds = load_dataset("StephanAkkerman/stock-market-tweets-data")
hf_data = ds["train"]

print(f"Loaded {len(hf_data)} rows from Hugging Face dataset")

# Tokenise every tweet lazily behind a progress bar and keep only the rows
# for which preprocessing produced at least one token.
progress = tqdm(hf_data, desc="Preprocessing FULL HuggingFace dataset")
tokenized_rows = (preprocess_text(row["text"]) for row in progress)
hf_tokens = [row_tokens for row_tokens in tokenized_rows if row_tokens]

# One record per surviving tweet, with a single "tokens" column.
hf_records = [{"tokens": row_tokens} for row_tokens in hf_tokens]
hf_df = pd.DataFrame(hf_records)
print(f"✅ Hugging Face dataset (preprocessed): {len(hf_df)} rows")
hf_df.head()




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

stock-market-tweets-data.csv:   0%|          | 0.00/175M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/923673 [00:00<?, ? examples/s]

Loaded 923673 rows from Hugging Face dataset


Preprocessing FULL HuggingFace dataset: 100%|██████████| 923673/923673 [2:32:54<00:00, 100.68it/s]


✅ Hugging Face dataset (preprocessed): 923664 rows


Unnamed: 0,tokens
0,"[kennydegu, little, volume, think, could, spx,..."
1,"[esf, achieve, target, closing, fibonacci, lev..."
2,"[rt, kimblecharte, silvergold, indicator, crea..."
3,"[issaquahfund, hedge, msft, position, close, s..."
4,"[rt, zipillinois, surprisingly, controversial,..."


In [None]:
# Stack the token columns of the CSV data and the Hugging Face data into one
# frame with a fresh 0..n-1 index.
token_frames = [df.loc[:, ["tokens"]], hf_df]
combined_df = pd.concat(token_frames, ignore_index=True)
print(f"🔗 Combined dataset: {len(combined_df)} sentences")


🔗 Combined dataset: 929506 sentences


In [None]:
# Flatten all tokenised sentences into a single token list and count them.
all_tokens = []
for sentence_tokens in combined_df["tokens"]:
    all_tokens.extend(sentence_tokens)
word_freq = Counter(all_tokens)

# Words seen fewer than MIN_FREQ times are dropped from the vocabulary.
MIN_FREQ = 5 # standard value
filtered_tokens = {w: count for w, count in word_freq.items() if count >= MIN_FREQ}

# Indices follow the alphabetical order of the surviving words.
vocab = sorted(filtered_tokens)
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(enumerate(vocab))
vocab_size = len(vocab)

print(f"🧠 Vocab size: {vocab_size}")

# NOTE: JSON object keys are always strings, so the integer keys of idx2word
# are written as "0", "1", ... and come back as strings when reloaded.
vocab_payload = {"word2idx": word2idx, "idx2word": idx2word}
with open("vocab.json", "w") as f:
    json.dump(vocab_payload, f, indent=2)

print("📁 Saved vocab.json")


🧠 Vocab size: 52280
📁 Saved vocab.json


In [None]:
import json

# Reload the saved vocabulary to confirm the file round-trips.
with open("vocab.json", "r") as f:
    vocab_data = json.load(f)

# Displaying the data: printing the whole object would dump every entry of
# both mappings, so only the size and a short preview of each are shown.
PREVIEW_SIZE = 10
for mapping_name, mapping in vocab_data.items():
    preview = dict(list(mapping.items())[:PREVIEW_SIZE])
    print(f"{mapping_name}: {len(mapping)} entries, first {len(preview)}: {preview}")



In [None]:
from google.colab import files

In [None]:
# Hand vocab.json to Colab's `files` helper so it is offered as a browser
# download; this only works inside a Google Colab session.
files.download('vocab.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>