<a href="https://colab.research.google.com/github/sahug/ds-nlp/blob/main/NLP%20-%20Session%2024%20-%20Vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP - Session 24 - Vectorization**

## **Bag of Words**

In [16]:
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

sample_data = ["I like English and I like Spanish",
               "I speak English, French and Thai.",
               "I don't often go swimming; I prefer to play tennis."               
               ]

def bow(text: str) -> pd.DataFrame:
  count_vectorizer = CountVectorizer()
  c_o_w = count_vectorizer.fit_transform(text)
  b_o_w = pd.DataFrame(c_o_w.toarray(), columns=count_vectorizer.get_feature_names())
  return b_o_w

bow(sample_data)



Unnamed: 0,and,don,english,french,go,like,often,play,prefer,spanish,speak,swimming,tennis,thai,to
0,1,0,1,0,0,2,0,0,0,1,0,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0
2,0,1,0,0,1,0,1,1,1,0,0,1,1,0,1


## **N - Grams**

In [23]:
from typing import List
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words("english")

sample = "I don't often go swimming; I prefer to play tennis."

def n_grams(text: str, ngram: int) -> List:
  words = [word for word in text.split(" ") if word not in stopwords]
  collection = zip(*[words[i:] for i in range(0, ngram)])
  n_g = [' '.join(ngram) for ngram in collection]
  return n_g

print("Unigram: ", n_grams(sample, 1))
print("Bigram:  ", n_grams(sample, 2))
print("Trigram: ", n_grams(sample, 3))

Unigram:  ['I', 'often', 'go', 'swimming;', 'I', 'prefer', 'play', 'tennis.']
Bigram:   ['I often', 'often go', 'go swimming;', 'swimming; I', 'I prefer', 'prefer play', 'play tennis.']
Trigram:  ['I often go', 'often go swimming;', 'go swimming; I', 'swimming; I prefer', 'I prefer play', 'prefer play tennis.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **TF - IDF**

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

sample_data = ["I like English and I like Spanish",
               "I speak English, French and Thai.",
               "I don't often go swimming; I prefer to play tennis."               
               ]

def tfidf(text: List) -> pd.DataFrame:
  tfidf_vectorizer  = TfidfVectorizer()
  tfidf_vector = tfidf_vectorizer.fit_transform(sample_data).toarray()
  words_set = tfidf_vectorizer.get_feature_names()
  return pd.DataFrame(tfidf_vector, columns = words_set)

tfidf(sample_data)



Unnamed: 0,and,don,english,french,go,like,often,play,prefer,spanish,speak,swimming,tennis,thai,to
0,0.306504,0.0,0.306504,0.0,0.0,0.806032,0.0,0.0,0.0,0.403016,0.0,0.0,0.0,0.0,0.0
1,0.373022,0.0,0.373022,0.490479,0.0,0.0,0.0,0.0,0.0,0.0,0.490479,0.0,0.0,0.490479,0.0
2,0.0,0.353553,0.0,0.0,0.353553,0.0,0.353553,0.353553,0.353553,0.0,0.0,0.353553,0.353553,0.0,0.353553


## **Word2Vec**

In [31]:
import nltk
import gensim
from nltk.corpus import abc
nltk.download('abc')
nltk.download('punkt')

model = gensim.models.Word2Vec(abc.sents())
X = list(model.wv.vocab)
data = model.most_similar('science')
print(data)

[nltk_data] Downloading package abc to /root/nltk_data...
[nltk_data]   Package abc is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('law', 0.9360693693161011), ('agriculture', 0.9288930892944336), ('policy', 0.9266971945762634), ('general', 0.9197111129760742), ('media', 0.9173187613487244), ('practice', 0.916257381439209), ('discussion', 0.9146009087562561), ('reservoir', 0.906642496585846), ('tight', 0.9056570529937744), ('exhibition', 0.9047144651412964)]


  data = model.most_similar('science')
