In [1]:
import nltk
nltk.download('punkt')


#Tokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

#Stop Words
from nltk.corpus import stopwords
nltk.download('stopwords')

#Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import numpy as np
import pandas as pnd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## BAG OF WORDS

In [2]:
sentence_1 = "The Catholic University of Lille is a very good school"
sentence_2 = "I am training in NLP at the university"
sentence_3 = "The NLP teacher is really great !"

In [3]:
def preprocessing(sentence):

  #To lower
  sentence = sentence.lower()

  #Remove punctuation
  tokenizer = RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(sentence)

  #Remove Stop Words
  stopWords = stopwords.words('english')
  tokens = [token for token in tokens if token not in stopWords]

  #Lemmatization
  lmtzr = WordNetLemmatizer()
  tokens = [lmtzr.lemmatize(word) for word in tokens]

  return tokens

In [4]:
t1 = preprocessing(sentence_1)
t2 = preprocessing(sentence_2)
t3 = preprocessing(sentence_3)
print(t1)
print(t2)
print(t3)
tokens = t1 + t2 + t3
tokens

['catholic', 'university', 'lille', 'good', 'school']
['training', 'nlp', 'university']
['nlp', 'teacher', 'really', 'great']


['catholic',
 'university',
 'lille',
 'good',
 'school',
 'training',
 'nlp',
 'university',
 'nlp',
 'teacher',
 'really',
 'great']

In [5]:
#Corpus (unique tokens)
words = []
for token in tokens:
  if token not in words:
      words.append(token)
words

['catholic',
 'university',
 'lille',
 'good',
 'school',
 'training',
 'nlp',
 'teacher',
 'really',
 'great']

In [7]:
def createVector(words, tokens):
  vector=[]
  for word in words:
    nb = 0
    for token in tokens:
      if token == word:
        nb+=1
    vector.append(nb)
  return vector


allVectors = []
v1 = createVector(words, t1)
allVectors.append(v1)
v2 = createVector(words, t2)
allVectors.append(v2)
v3 = createVector(words, t3)
allVectors.append(v3)

In [8]:
import pandas as pnd
df= pnd.DataFrame(data=np.array(allVectors),columns=words)
df.head()

Unnamed: 0,catholic,university,lille,good,school,training,nlp,teacher,really,great
0,1,1,1,1,1,0,0,0,0,0
1,0,1,0,0,0,1,1,0,0,0
2,0,0,0,0,0,0,1,1,1,1


## TF-IDF

In [9]:
sentence_1 = "cat dog hen"
sentence_2 = "cat"

In [12]:
#Tokens
t1 = word_tokenize(sentence_1)
t2 = word_tokenize(sentence_2)
tokens = t1+t2

#Unique words
words = []
for token in tokens:
  if token not in words:
      words.append(token)
print("Unique words:",words)

Unique words: ['cat', 'dog', 'hen']


In [11]:
def frequency(words, tokens):
  f=[]
  for word in words:
    nb = 0
    for token in tokens:
      if token == word:
        nb+=1
    f.append(nb)
  return f

FREQ_S1 = frequency(words, t1)
FREQ_S2 = frequency(words, t2)

print("Frequencies S1:",FREQ_S1)
print("Frequencies S2:",FREQ_S2)

Frequencies S1: [1, 1, 1]
Frequencies S2: [1, 0, 0]


##### Term Frequency

In [13]:
def computeNumbersOfWords(frequencies):
  nbWords = 0
  for i in range(0,len(frequencies)):
    nbWords += frequencies[i]
  return (nbWords)

nbw_s1 = computeNumbersOfWords(FREQ_S1)
nbw_s2 = computeNumbersOfWords(FREQ_S2)
nbTotalWords = nbw_s1+nbw_s2
print("Number of words: ",nbTotalWords)

Number of words:  4


In [14]:
def computeTF(frequencies,nbWords):
  TF=[]
  for i in range(0,len(frequencies)):
    TF.append(frequencies[i] / nbWords)

  return TF

In [15]:
TF_S1=computeTF(FREQ_S1,nbTotalWords)
print("TF Sentence 1 :",TF_S1)

TF_S2=computeTF(FREQ_S2,nbTotalWords)
print("TF Sentence 2 :",TF_S2)

TF Sentence 1 : [0.25, 0.25, 0.25]
TF Sentence 2 : [0.25, 0.0, 0.0]


##### Inverse Term Frequency

In [16]:
def computeDF(sentenceFrequencies):
  
  print("Nb of sentences",len(sentenceFrequencies))

  nbFreq = len(sentenceFrequencies[0])
  print("Nb freq per sentences :",len(sentenceFrequencies[0]))

  df = []
  for i in range(0,nbFreq):
    df_word = 0
    for j in range(0,len(sentenceFrequencies)):
      df_word += sentenceFrequencies[j][i]
    df.append(df_word)

  return df

In [17]:
freq = []
freq.append(FREQ_S1)
freq.append(FREQ_S2)

In [18]:
df = computeDF(freq)
print(df)

Nb of sentences 2
Nb freq per sentences : 3
[2, 1, 1]


In [19]:
import math

def computeIDF(frequencies,df,n):
  
  IDF = []
  for i in range(0,len(frequencies)):
    idfWord = math.log((n+1)/(df[i]+1))+1
    IDF.append(idfWord)

  return IDF 

In [20]:
nbSentences = 2
IDF_S1 = computeIDF(FREQ_S1,df,nbSentences)
print(IDF_S1)
IDF_S2 = computeIDF(FREQ_S2,df,nbSentences)
print(IDF_S2)

[1.0, 1.4054651081081644, 1.4054651081081644]
[1.0, 1.4054651081081644, 1.4054651081081644]


#### TF-IDF

In [21]:
def computeTFIDF(TF,IDF):
  TFIDF = []
  for i in range(0,len(TF)):
    TFIDF.append(TF[i]*IDF[i])
  return TFIDF

In [22]:
TFIDF_S1 = computeTFIDF(TF_S1,IDF_S1)
print("TF-IDF Sentence 1 :",TFIDF_S1)
TFIDF_S2 = computeTFIDF(TF_S2,IDF_S2)
print("TF-IDF Sentence 2 :",TFIDF_S2)

TF-IDF Sentence 1 : [0.25, 0.3513662770270411, 0.3513662770270411]
TF-IDF Sentence 2 : [0.25, 0.0, 0.0]


In [23]:
datas=[]
datas.append(TFIDF_S1)
datas.append(TFIDF_S2)

df  = pnd.DataFrame(data=np.array(datas), columns=words)
df.head()

Unnamed: 0,cat,dog,hen
0,0.25,0.351366,0.351366
1,0.25,0.0,0.0


## TF-IDF SKLEARN

In [24]:
sentence_1 = "cat dog hen"
sentence_2 = "cat"

corpus=[]
corpus.append(sentence_1)
corpus.append(sentence_2)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pnd
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(corpus)
df = pnd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df.head()

Unnamed: 0,cat,dog,hen
0,0.449436,0.631667,0.631667
1,1.0,0.0,0.0


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Norm l2 : Sum of squares of vector elements is 1
vectorizer = TfidfVectorizer(norm='l2')

X = vectorizer.fit_transform(corpus)
df = pnd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df.head()

Unnamed: 0,cat,dog,hen
0,0.449436,0.631667,0.631667
1,1.0,0.0,0.0


#### L2 Norm

In [27]:
#Whith out normalisation: The norm equals to 0.55
print(math.sqrt(0.25**2+0.351366**2+0.351366**2))

0.5562518601425077


In [28]:
#With normalisation (sklearn), the Euclidian norm of the vector = 1
print(math.sqrt(0.449436**2+0.631667**2+0.631667**2))

0.9999995579369023


#### Computing normalisation like SKLearn

In [29]:
#l2 norm : Sentence 1 
l2Norm = math.sqrt(0.25**2+0.351366**2+0.351366**2)
print(l2Norm)

0.5562518601425077


In [30]:
#Normalization : Sentence 1 
print(0.25/l2Norm)
print(0.351366/l2Norm)
print(0.351366/l2Norm)

0.44943669929652336
0.631667101140089
0.631667101140089


In [31]:
def computeL2Normalization(vector):
  
  sum=0
  for i in range(0,len(vector)):
    sum+= vector[i]**2
  l2Norm = math.sqrt(sum)

  normalization = []
  for i in range(0,len(vector)):
    norm = vector[i]/l2Norm
    normalization.append(norm)
  
  return normalization

In [32]:
computeL2Normalization(TFIDF_S1)

[0.4494364165239821, 0.6316672017376245, 0.6316672017376245]

In [33]:
computeL2Normalization(TFIDF_S2)

[1.0, 0.0, 0.0]

In [34]:
from sklearn.preprocessing import Normalizer
Normalizer(norm="l2").fit_transform(np.array(TFIDF_S1).reshape(-1,1))

array([[1.],
       [1.],
       [1.]])