<a href="https://colab.research.google.com/github/sanchitpawa/Customer-Review/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspellchecker
!pip install emoji --upgrade
from spellchecker import SpellChecker
import re
import spacy
import emoji # The emoji library is now available
import nltk
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from transformers import BertTokenizer, BertModel

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2


In [None]:
# Fetch every NLTK resource the preprocessing code relies on: tokenizer data
# (punkt / punkt_tab), the stop-word list, WordNet for lemmatization and the
# English word list used as a dictionary.
for resource in ('punkt', 'stopwords', 'wordnet', 'words', 'punkt_tab'):
    nltk.download(resource)

# spaCy English pipeline; check_typos() reads its stop-word, punctuation and
# named-entity annotations.
nlp = spacy.load("en_core_web_sm")

# Pre-trained BERT tokenizer and encoder used to produce the embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# WordNet lemmatizer shared by the cleaning pipeline.
lemmatizer = WordNetLemmatizer()

# Set of dictionary words for fast membership tests.
english_words = set(words.words())

# Chat abbreviations mapped to their spelled-out (lower-case) expansions.
# clean_text() looks tokens up here after lower-casing the input.
slang_dict = {
    # single-letter / pronoun shorthands
    "u": "you",
    "ur": "your",
    "r": "are",
    # common acronyms
    "lol": "laughing out loud",
    "btw": "by the way",
    "omg": "oh my god",
    "imo": "in my opinion",
    "brb": "be right back",
    "fn": "for now",
    "bfn": "bye for now",
    "ttyl": "talk to you later",
    "rn": "right now",
    "idk": "i dont know",
    "idc": "i dont care",
    "jk": "just kidding",
    "np": "no problem",
    "stfu": "shut the fuck up",
}

# Extra filler tokens dropped by clean_text() in addition to the NLTK
# stop-word list.
garbage_words = {
    "im", "you", "are", "the", "is",
    "and", "to", "of", "a", "in",
    "on", "at", "it", "he", "she",
    "but", "or", "this", "that",
}

def correct_repeated_chars(word, vocabulary=None):
    """Collapse elongated letter runs such as "coool" or "soooo".

    Runs of three or more identical letters are first shortened to two
    ("coool" -> "cool"). If that candidate is a known word it is returned.
    Otherwise every run of repeated letters in the original word is reduced
    to a single letter ("soooo" -> "so").

    Only letters are collapsed. Digits and underscores are left untouched so
    that numeric tokens are not rewritten (previously "1000" became "10").

    Args:
        word: Token to normalise.
        vocabulary: Optional container of known words used for the
            membership test. Defaults to the module-level ``english_words``.

    Returns:
        The normalised token.
    """
    known_words = english_words if vocabulary is None else vocabulary

    # [^\W\d_] means "a word character that is neither a digit nor an underscore",
    # i.e. a letter.
    squeezed = re.sub(r'([^\W\d_])\1{2,}', r'\1\1', word)
    if squeezed in known_words:
        return squeezed

    # The doubled form is not a dictionary word: fall back to single letters.
    return re.sub(r'([^\W\d_])\1+', r'\1', word)

def check_typos(text):
    """Return the tokens of ``text`` that look like misspellings.

    The text is run through the spaCy pipeline ``nlp``. A token is reported
    when it is not a stop word, not punctuation, not part of a named entity,
    and its lower-cased form is absent from ``english_words``.
    """
    def looks_misspelled(tok):
        # Stop words and punctuation are never reported.
        if tok.is_stop or tok.is_punct:
            return False
        # Dictionary words are fine.
        if tok.text.lower() in english_words:
            return False
        # Tokens inside a named entity are ignored.
        return not tok.ent_type_

    return [tok.text for tok in nlp(text) if looks_misspelled(tok)]

def clean_text(text):
    """Normalise a noisy, informal sentence into a space-joined token string.

    Pipeline:
      1. lower-case the text and replace emojis with their names;
      2. expand contractions;
      3. strip everything except ASCII letters, digits and whitespace;
      4. tokenize and drop NLTK stop words plus ``garbage_words``;
      5. per token: expand slang, otherwise lemmatize, collapse elongated
         letters, expand slang again (so "loool" is handled), and finally
         spell-correct tokens missing from ``english_words``.

    Args:
        text: Raw input string. Falsy input (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned text as a single string.
    """
    if not text:
        return ""

    text = text.lower()

    # Space delimiters keep an emoji name from fusing with a neighbouring
    # word once punctuation is stripped ("market<emoji>" -> "market shopping bags").
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = text.replace('_', ' ')

    # Typographic apostrophes would otherwise defeat the contraction lookup.
    text = text.replace("\u2019", "'")

    # Irregular contractions first, then the regular suffixes. Without the
    # suffix rules "i'll" degrades to the unrelated word "ill" below.
    contractions = {
        "don't": "do not",
        "can't": "cannot",
        "won't": "will not",
        "i'm": "i am",
    }
    for contraction, full_form in contractions.items():
        text = text.replace(contraction, full_form)
    contraction_suffixes = {
        "n't": " not",
        "'ll": " will",
        "'re": " are",
        "'ve": " have",
    }
    for suffix, full_form in contraction_suffixes.items():
        text = re.sub(re.escape(suffix) + r"\b", full_form, text)

    # Remove special characters and punctuation, then squeeze whitespace.
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Build the exclusion set once instead of re-reading the stop-word list
    # for every token.
    excluded = set(stopwords.words('english')) | set(garbage_words)
    tokens = [tok for tok in word_tokenize(text) if tok not in excluded]

    # Constructing SpellChecker loads its dictionary, so reuse one instance
    # across calls.
    spell = getattr(clean_text, "_spell_checker", None)
    if spell is None:
        spell = clean_text._spell_checker = SpellChecker()

    cleaned_tokens = []
    for tok in tokens:
        # Slang expansions are final: they must not be lemmatized,
        # de-elongated or spell-corrected as a multi-word "token".
        if tok in slang_dict:
            cleaned_tokens.append(slang_dict[tok])
            continue

        tok = correct_repeated_chars(lemmatizer.lemmatize(tok))

        # De-elongation may have produced a slang key ("loool" -> "lol").
        if tok in slang_dict:
            cleaned_tokens.append(slang_dict[tok])
            continue

        if tok not in english_words:
            # Keep the token when the spell checker offers no correction.
            tok = spell.correction(tok) or tok
        cleaned_tokens.append(tok)

    return " ".join(cleaned_tokens)

# Demo: clean a noisy sentence, encode it with BERT and print, for every
# word-piece, the first 10 dimensions of its contextual embedding.
sentence = "Jus got back from the market 🛍️, but I'm alrady feling so tired 😴, so I'll cach up with you laer, BTW!"
cleaned_text = clean_text(sentence)

tokens = tokenizer(cleaned_text, return_tensors='pt')
with torch.no_grad():  # inference only, no gradient tracking
    outputs = model(**tokens)

word_embeddings = outputs.last_hidden_state
tokens_list = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])

separator = "-" * 30
for token, vector in zip(tokens_list, word_embeddings[0]):
    # Show the first 10 dimensions of this token's embedding.
    print(f"Token: {token}", f"Embedding: {vector[:10]}", separator, sep="\n")
print(cleaned_text)

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Token: [CLS]
Embedding: tensor([ 0.2015,  0.1247,  0.1032, -0.3310, -0.4772, -0.4160,  0.6657,  0.6381,
         0.2915, -0.3120])
------------------------------
Token: just
Embedding: tensor([ 0.4838, -0.5985,  0.5340,  0.1202, -0.0345,  0.0592,  0.0201,  0.3896,
         0.4966, -0.7957])
------------------------------
Token: got
Embedding: tensor([ 0.1459, -1.1830,  1.0727,  0.2276, -0.4507, -0.6790,  0.7212,  1.0730,
         0.0672, -0.2899])
------------------------------
Token: back
Embedding: tensor([ 0.1043, -0.6520,  0.7663, -0.3329,  0.1703, -0.4904,  0.8115,  0.8242,
         0.1376, -0.1696])
------------------------------
Token: market
Embedding: tensor([ 0.0363,  0.2210,  0.0228,  0.0912, -0.1474, -0.5004,  0.2415, -0.3046,
        -0.0007, -0.5107])
------------------------------
Token: shopping
Embedding: tensor([ 1.0752, -0.2993,  0.8696, -0.3887,  0.5847, -0.1920,  0.6979,  0.2812,
        -0.4420, -0.0237])
------------------------------
Token: bag
Embedding: tensor