In [1]:
!pip install nltk




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Text Preprocessing

## Lowercasing

In [2]:
# Sample sentence used by the preprocessing cells below; it mixes upper/lower
# case, punctuation (",") and special characters ("@", "#").
sent = "Hello, i am AI sathi R@3#"

In [3]:
# Lowercase the whole sentence; the bare name on the last line makes the
# notebook display the result.
lower_sent = sent.lower()
lower_sent

'hello, i am ai sathi r@3#'

## Removal of punctuation and special characters

In [4]:
# Punctuation marks to strip. The removal loop tests one character at a time,
# so every entry must be exactly one character long; the former two-character
# '""' entry could never match and has been dropped.
common_punctuation = ['.', ',', ':', ';', '!', '?', '(', ')', '"']

In [5]:
# Keep every character of the lowercased sentence that is not listed in
# common_punctuation. The string is assembled with "".join from an empty
# start, so the result no longer begins with a stray space.
result = "".join(each for each in lower_sent if each not in common_punctuation)
result

'hello i am ai sathi r@3#'

In [6]:
import re

# Matches any single character that is neither a word character (letter,
# digit, underscore) nor whitespace.
_non_word_non_space = re.compile(r'[^\w\s]')

# Delete every such character from the lowercased sentence and show the result.
cleaned = _non_word_non_space.sub('', lower_sent)
cleaned

'hello i am ai sathi r3'

## Stop Word removal

In [7]:
import nltk
# Fetch the "stopwords" corpus; the value returned by download() is the cell's
# last expression and is therefore shown as the cell output.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prate\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords

In [9]:
# English stop-word list; used further down to filter the cleaned sentence.
stopwords_eng = stopwords.words("english")

In [10]:
# Preview only the first few stop words; dumping the complete list floods the
# notebook with output and hides the narrative.
stopwords_eng[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [11]:
# Drop stop words from the cleaned sentence.
# - split() with no argument splits on any run of whitespace, so repeated
#   spaces, tabs or newlines never produce empty "words".
# - The stop-word list is converted to a set once so each membership test is a
#   hash lookup instead of a scan over the whole list.
stopword_set = set(stopwords_eng)
filtered = [word for word in cleaned.split() if word not in stopword_set]

In [12]:
# Re-assemble the kept words into one space-separated string for the tokenizer.
# This cell rebinds `filtered` from a list to a str, so the guard keeps it
# idempotent: without it, running the cell a second time would iterate over the
# string and put a space between every character.
if not isinstance(filtered, str):
    filtered = " ".join(filtered)

## Tokenization

### 1 Word tokenization

In [13]:
import nltk 
# Fetch the "punkt" and "punkt_tab" packages before word_tokenize is called in
# the next cell (presumably the tokenizer models it relies on — confirm in the
# NLTK docs). The second download stays last so its return value is what the
# cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prate\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prate\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [14]:
# Split the stop-word-filtered string into word tokens and display them.
tokens = nltk.word_tokenize(filtered)
tokens

['hello', 'ai', 'sathi', 'r3']

### 2 Stemming and Lemmatization

#### Stemming

In [15]:
# Fetch the "wordnet" corpus.
# NOTE(review): presumably this is what WordNetLemmatizer in the Lemmatization
# section needs rather than the stemmer used in this section — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prate\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
from nltk import LancasterStemmer, PorterStemmer, SnowballStemmer 

In [17]:

# Stem a single word with the Lancaster stemmer; the recorded output 'lem'
# shows how much of the word this stemmer cuts away.
stemmer = LancasterStemmer()
stemmer.stem("lemmatization")

'lem'

#### Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
# Lemmatize a single word with the WordNet lemmatizer. Only the word is passed,
# and the recorded output is the unchanged 'tasty'.
# NOTE(review): presumably lemmatize() treats the word as a noun when no part
# of speech is given — confirm, and pass `pos` explicitly for other word classes.
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('tasty')

'tasty'

### 3 Word Embeddings

### Bag of Words (BoW)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Three short product reviews that differ by only a word or two, so the effect
# of each word on the count vectors is easy to see.
corpus = [
    "This product is good and is affordable.",
    "This product is not good and affordable.",
    "This product is good and cheap."
]

# Learn the vocabulary first, then turn the same documents into a
# document-term count matrix.
vectorizer = CountVectorizer().fit(corpus)
matrix = vectorizer.transform(corpus)

In [22]:
# Show the count matrix as a dense array: one row per document, one column per
# vocabulary term. The recorded counts are consistent with alphabetical column
# order (affordable, and, cheap, good, is, not, product, this).
matrix.toarray()

array([[1, 1, 0, 1, 2, 0, 1, 1],
       [1, 1, 0, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 0, 1, 1]], dtype=int64)

### TF-IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Same three reviews as in the Bag-of-Words example, so the TF-IDF weights can
# be compared with the raw counts.
corpus = [
    "This product is good and is affordable.",
    "This product is not good and affordable.",
    "This product is good and cheap."
]

# Learn vocabulary and IDF weights first, then turn the same documents into a
# TF-IDF matrix. Note that `vectorizer` and `matrix` are rebound here.
vectorizer = TfidfVectorizer().fit(corpus)
matrix = vectorizer.transform(corpus)

In [25]:
# Show the TF-IDF matrix as a dense array: one row per document, one column per
# vocabulary term.
matrix.toarray()

array([[0.41434513, 0.32177595, 0.        , 0.32177595, 0.64355191,
        0.        , 0.32177595, 0.32177595],
       [0.4172334 , 0.32401895, 0.        , 0.32401895, 0.32401895,
        0.54861178, 0.32401895, 0.32401895],
       [0.        , 0.35653519, 0.60366655, 0.35653519, 0.35653519,
        0.        , 0.35653519, 0.35653519]])

### Word2Vec

In [26]:
from gensim.models import Word2Vec

In [27]:
# Tiny toy corpus for the Word2Vec demo.
corpus = [
    "I love cats",
    "I adore felines",
    "Dogs are loyal",
    "Cats and dogs are pets",
    "The sun is shining"
]

# Lowercase each sentence, then split it on whitespace into a list of words.
tokenized = list(map(str.split, map(str.lower, corpus)))

In [28]:
# Display the tokenised corpus: one list of lowercase words per sentence.
tokenized

[['i', 'love', 'cats'],
 ['i', 'adore', 'felines'],
 ['dogs', 'are', 'loyal'],
 ['cats', 'and', 'dogs', 'are', 'pets'],
 ['the', 'sun', 'is', 'shining']]

In [29]:
# Train two Word2Vec models on the same toy corpus: sg=0 selects CBOW, sg=1
# selects skip-gram.
#
# Reproducibility: the seed is spelled out (1 is gensim's documented default,
# so the initial vectors are unchanged) and training is limited to a single
# worker thread, because multi-threaded training is not guaranteed to be
# deterministic. min_count=1 keeps every word, since each occurs only once or
# twice in this corpus.
W2V_SEED = 1
w2v_params = dict(
    sentences=tokenized,
    vector_size=100,
    min_count=1,
    seed=W2V_SEED,
    workers=1,
)
cbow_model = Word2Vec(sg=0, **w2v_params)
sg_model = Word2Vec(sg=1, **w2v_params)


In [30]:
# Nearest neighbours of "love" in the CBOW model, as (word, similarity) pairs.
# With a five-sentence corpus the recorded ranking is not semantically
# meaningful (e.g. "shining" ranks first and "adore" near the bottom).
cbow_model.wv.most_similar(['love'])

[('shining', 0.25290459394454956),
 ('dogs', 0.13725271821022034),
 ('is', 0.04410674050450325),
 ('sun', 0.01273240614682436),
 ('i', 0.006598459556698799),
 ('loyal', -0.0011978191323578358),
 ('the', -0.025461023673415184),
 ('are', -0.04125342145562172),
 ('adore', -0.07639002799987793),
 ('and', -0.10619832575321198)]

In [31]:
# Nearest neighbours of "love" in the skip-gram model, as (word, similarity)
# pairs. The recorded output is almost identical to the CBOW result, which
# suggests the vectors barely moved from their initial values on this corpus.
sg_model.wv.most_similar(['love'])

[('shining', 0.25290459394454956),
 ('dogs', 0.13725271821022034),
 ('is', 0.04410674050450325),
 ('sun', 0.012810827232897282),
 ('i', 0.006598459556698799),
 ('loyal', -0.0011978191323578358),
 ('the', -0.025461023673415184),
 ('are', -0.04125342145562172),
 ('adore', -0.07639002799987793),
 ('and', -0.10619832575321198)]

### BERT (Bidirectional Encoder Representations from Transformers)

In [1]:
from transformers import BertTokenizer, BertModel

In [2]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# their vocabularies match.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
# Encode one sentence as PyTorch tensors ('pt'). Note that `tokens` is rebound
# here; earlier in the notebook it held a plain list of word strings.
bert_sentence = "The quick brown fox jumps over the lazy dog."
tokens = tokenizer(text=bert_sentence, return_tensors='pt')


In [5]:
# Display the encoded inputs; the recorded output contains input_ids,
# token_type_ids and attention_mask, each with 12 entries for this sentence.
tokens

{'input_ids': tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [6]:
import torch

# Run the encoder and keep the final hidden state for every token.
# Only the embeddings are read, so the forward pass runs under torch.no_grad():
# no autograd graph is built, which saves memory and leaves `embeddings` as a
# plain tensor that does not require gradients.
with torch.no_grad():
    embeddings = model(**tokens).last_hidden_state

In [7]:
# Shape of the embeddings; the recorded output is (1, 12, 768), i.e. one
# sentence, the 12 input ids shown above, and 768 values per token.
embeddings.shape

torch.Size([1, 12, 768])