In [1]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.9.11-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ------------------- -------------------- 20.5/41.5 kB ? eta -:--:--
     ------------------- -------------------- 20.5/41.5 kB ? eta -:--:--
     ---------------------------- --------- 30.7/41.5 kB 262.6 kB/s eta 0:00:01
     -------------------------------------- 41.5/41.5 kB 250.5 kB/s eta 0:00:00
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    ------


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import nltk

# Fetch the NLTK data packages this notebook needs: tokenizer tables, stop-word lists, WordNet.
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)
# Left as the cell's final expression so its return value is still displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...


True

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

In [5]:
# Sample text used by the tokenisation and normalisation cells that follow.
sentence = "I ate an apple today! It was red and delicious."

In [6]:
# Naive whitespace tokenisation: punctuation stays attached to the neighbouring word
# (e.g. 'today!' and 'delicious.').
sentence.split()

['I', 'ate', 'an', 'apple', 'today!', 'It', 'was', 'red', 'and', 'delicious.']

In [8]:
# NLTK's word_tokenize emits punctuation as separate tokens ('today', '!'), unlike str.split().
tokens = word_tokenize(sentence)
tokens

['I',
 'ate',
 'an',
 'apple',
 'today',
 '!',
 'It',
 'was',
 'red',
 'and',
 'delicious',
 '.']

In [10]:
# Discard tokens that are not purely alphabetic (the punctuation marks here) and lower-case the rest.
tokens = list(map(str.lower, filter(str.isalpha, tokens)))
tokens

['i', 'ate', 'an', 'apple', 'today', 'it', 'was', 'red', 'and', 'delicious']

In [12]:
# Stop words are high-frequency function words (e.g. 'i', 'an', 'it', 'was', 'and');
# removing them leaves the content-bearing tokens.
stop_words = {*stopwords.words('english')}
tokens = [tok for tok in tokens if tok not in stop_words]
tokens

['ate', 'apple', 'today', 'red', 'delicious']

In [13]:
# Create both normalisers once so later cells can reuse them:
# a Porter stemmer and a WordNet-backed lemmatizer.
stemmer, lematizer = PorterStemmer(), WordNetLemmatizer()

In [15]:
# Stemming and lemmatization both map inflected variants of a word onto a shared base form,
# so the variants are grouped as one term; a plain bag-of-words keeps every surface form separate.
# The stemmer strips suffixes and may produce non-words; lemmatize() is called without a
# part-of-speech argument, so it uses its default (noun) and leaves verb forms such as 'ate' as is.
stemmed_tokens = list(map(stemmer.stem, tokens))
lematized_tokens = list(map(lematizer.lemmatize, tokens))

In [16]:
# Show the stemmed and the lemmatized tokens on consecutive lines for comparison.
print(stemmed_tokens, lematized_tokens, sep='\n')

['ate', 'appl', 'today', 'red', 'delici']
['ate', 'apple', 'today', 'red', 'delicious']


In [17]:
from collections import Counter

# Term-frequency table for the lemmatized tokens. Counter does the tallying; converting back
# to a plain dict keeps `dictionary` the same type (and printed form) as the manual loop
# produced, with keys in first-seen order.
dictionary = dict(Counter(lematized_tokens))

print(dictionary)

{'ate': 1, 'apple': 1, 'today': 1, 'red': 1, 'delicious': 1}


In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Four short product reviews: the toy corpus fed to the vectorizer examples.
list1 = [
    "I love the product. It's amazing and easy to use.",
    "The product did not work as expected. I'm disappointed.",
    "Fantastic product! I would recommend to everyone.",
    "Terrible product. Waste of money.",
]

In [22]:
# Bag-of-words model: each document becomes a row of raw term counts over the learned vocabulary.
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(list1)

# .toarray() converts the result to a plain array so the counts can be shown with the
# vocabulary terms as column labels.
df1 = pd.DataFrame(
    data=X_bow.toarray(),
    columns=vectorizer_bow.get_feature_names_out(),
)
df1

Unnamed: 0,amazing,and,as,did,disappointed,easy,everyone,expected,fantastic,it,...,of,product,recommend,terrible,the,to,use,waste,work,would
0,1,1,0,0,0,1,0,0,0,1,...,0,1,0,0,1,1,1,0,0,0
1,0,0,1,1,1,0,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,1,0,1,0,...,0,1,1,0,0,1,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,1,0,0


In [23]:
# n-gram model (bigram): ngram_range=(2, 2) makes every feature a pair of adjacent words.
vector_ngram = CountVectorizer(ngram_range=(2, 2))
X_ngram = vector_ngram.fit_transform(list1)

df2 = pd.DataFrame(
    data=X_ngram.toarray(),
    columns=vector_ngram.get_feature_names_out(),
)

In [24]:
# Column labels of df2 are the bigrams (adjacent word pairs) extracted by the vectorizer.
df2.columns

Index(['amazing and', 'and easy', 'as expected', 'did not', 'easy to',
       'expected disappointed', 'fantastic product', 'it amazing', 'love the',
       'not work', 'of money', 'product did', 'product it', 'product waste',
       'product would', 'recommend to', 'terrible product', 'the product',
       'to everyone', 'to use', 'waste of', 'work as', 'would recommend'],
      dtype='object')

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model: same kind of document-by-term table as the count model, but each cell is a
# weight rather than a raw count, so a term present in every document (e.g. 'product')
# scores lower than terms that appear in only one.
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(list1)

df3 = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
)
df3

Unnamed: 0,amazing,and,as,did,disappointed,easy,everyone,expected,fantastic,it,...,of,product,recommend,terrible,the,to,use,waste,work,would
0,0.364772,0.364772,0.0,0.0,0.0,0.364772,0.0,0.0,0.0,0.364772,...,0.0,0.190353,0.0,0.0,0.28759,0.28759,0.364772,0.0,0.0,0.0
1,0.0,0.0,0.380862,0.380862,0.380862,0.0,0.0,0.380862,0.0,0.0,...,0.0,0.198749,0.0,0.0,0.300276,0.0,0.0,0.0,0.380862,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.452035,0.0,0.452035,0.0,...,0.0,0.235891,0.452035,0.0,0.0,0.356389,0.0,0.0,0.0,0.452035
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.483803,0.252468,0.0,0.483803,0.0,0.0,0.0,0.483803,0.0,0.0


In [28]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/61.0 kB ? eta -:--:--
     ------------ ------------------------- 20.5/61.0 kB 217.9 kB/s eta 0:00:01
     ------------------------- ------------ 41.0/61.0 kB 279.3 kB/s eta 0:00:01
     -------------------------------------- 61.0/61.0 kB 360.3 kB/s eta 0:00:00
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     --------------------------------- ------ 51.2/60.6 kB 2.7 MB/s eta 0:00:01
     ---------------------------------------- 60.6/60.6 kB 1.1 MB/s eta 

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
import gensim
from gensim.models import Word2Vec

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Raw review sentences; each one is tokenised into a list of word/punctuation tokens,
# giving Word2Vec one token list per sentence.
review_texts = [
    "I love the product. It is amazing and easy to use.",
    "The product did not work as expected. Very disappointing.",
    "Fantastic product! I would recommend it to anyone.",
    "Terrible product, waste of money.",
]
reviews = [word_tokenize(text) for text in review_texts]

# min_count=1 keeps every token, which is needed for a corpus this small.
# workers=1 together with a fixed seed makes training repeatable: gensim documents that
# multi-threaded training introduces ordering jitter, so workers=4 can yield different
# vectors from run to run. (Across interpreter launches PYTHONHASHSEED must also be fixed.)
w2v_model = Word2Vec(
    reviews,
    vector_size=50,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)