In [48]:
import pandas as pd 
import numpy as np
from collections import defaultdict
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer
from textblob import TextBlob, Word
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Three short example notes used as the toy documents for the rest of the notebook.
# The mixed casing in note1 ("WORKER" / "Worker") and the trailing space in note3 are part of the data.
note1 = 'WORKER slipped while carrying groceries. Worker fractured his elbow'
note2 = 'worker developed carpal tunnel from repetitive typing'
note3 = 'worker got traumatized from NLP presentation '

In [3]:
# Glue the three notes into one space-separated string.
corpus = ' '.join((note1, note2, note3))

In [4]:
# Echo the concatenated text.
corpus

'WORKER slipped while carrying groceries. Worker fractured his elbow worker developed carpal tunnel from repetitive typing worker got traumatized from NLP presentation '

In [5]:
# Collect every distinct whitespace-separated token of the corpus as a key of `vocab`
# (each key maps to 1); casing and attached punctuation are kept as-is.
vocab = defaultdict(int)
vocab.update(dict.fromkeys(corpus.split(), 1))
unique_words = vocab.keys()

In [6]:
sorted(unique_words) # our vocabulary -- the sorted list of distinct words

['NLP',
 'WORKER',
 'Worker',
 'carpal',
 'carrying',
 'developed',
 'elbow',
 'fractured',
 'from',
 'got',
 'groceries.',
 'his',
 'presentation',
 'repetitive',
 'slipped',
 'traumatized',
 'tunnel',
 'typing',
 'while',
 'worker']

**Preprocessing**    
Lowering the case    
Stemming    
Lemmatization

**Lowering the case**

In [7]:
def lower(txt):
    '''Print the lower-cased version of ``txt`` and return it.'''
    lowered = txt.lower()
    print(lowered)
    return lowered

In [8]:
# lower() prints each lower-cased note; the notebook additionally echoes the
# return value of the last call.
lower(note1)
lower(note2)
lower(note3)

worker slipped while carrying groceries. worker fractured his elbow
worker developed carpal tunnel from repetitive typing
worker got traumatized from nlp presentation 


'worker got traumatized from nlp presentation '

**Stemming**

In [9]:
# The stemmer is given one word at a time, so the sentence is split first and each word is stemmed on its own
def stemmer(sentence):
    '''Lower-case and Porter-stem every whitespace-separated word of ``sentence``.

    The stemmed sentence is printed and also returned as a single
    space-joined string.
    '''
    porter = PorterStemmer()
    stemmed = ' '.join(porter.stem(token.lower()) for token in sentence.split())
    print(stemmed)
    return stemmed
    

In [10]:
# stemmer() prints each stemmed note; the notebook additionally echoes the
# return value of the last call.
stemmer(note1)
stemmer(note2)
stemmer(note3)

worker slip while carri groceries. worker fractur hi elbow
worker develop carpal tunnel from repetit type
worker got traumat from nlp present


'worker got traumat from nlp present'

**Lemmatization with WordNetLemmatizer**

In [11]:
# Download the WordNet corpus via NLTK (the log below reports it is already up to date).
# NOTE(review): this import sits mid-notebook; consider moving it into the first import cell.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pradnya.n/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
def lemmatizer(sentence):
    '''Lemmatize every whitespace-separated word of ``sentence`` with WordNet.

    Each word is lower-cased and passed to ``WordNetLemmatizer.lemmatize``
    without a part-of-speech argument. The result is printed and returned.

    Args:
        sentence: text to process; it is split on whitespace.

    Returns:
        The lemmatized words joined by single spaces.
    '''
    lemma = WordNetLemmatizer()
    lemmatized = ' '.join(lemma.lemmatize(word.lower()) for word in sentence.split())
    print(lemmatized)
    # Return the text as the docstring promises, consistent with stemmer();
    # previously the function only printed and implicitly returned None.
    return lemmatized

In [13]:
# Print the WordNet-lemmatized form of each note.
lemmatizer(note1)
lemmatizer(note2)
lemmatizer(note3)

worker slipped while carrying groceries. worker fractured his elbow
worker developed carpal tunnel from repetitive typing
worker got traumatized from nlp presentation


In [14]:
# Same lemmatization as above, written inline for note1.
lemma = WordNetLemmatizer()
# NOTE(review): lemmatize() is called here on the whole lower-cased sentence rather than on a
# single word; judging by the printed output the text comes back unchanged -- confirm this call is needed.
doc1 = lemma.lemmatize(note1.lower())
# Lemmatize token by token and re-join with single spaces.
print(" ".join([lemma.lemmatize(token) for token in doc1.split()]))

worker slipped while carrying groceries. worker fractured his elbow


In [15]:
# Inline lemmatization of note2; reuses the module-level `lemma` object created in the previous cell.
doc2 = lemma.lemmatize(note2.lower())
print(" ".join([lemma.lemmatize(token) for token in doc2.split()]))

worker developed carpal tunnel from repetitive typing


In [16]:
# Inline lemmatization of note3; reuses the module-level `lemma` object created two cells earlier.
doc3 = lemma.lemmatize(note3.lower())
print(" ".join([lemma.lemmatize(token) for token in doc3.split()]))

worker got traumatized from nlp presentation


In [17]:
def lemmatizer2(sentence):
    '''Lemmatize every whitespace-separated word of ``sentence`` with TextBlob.

    Each word is lower-cased, wrapped in a TextBlob ``Word`` and lemmatized
    with ``Word.lemmatize()`` (no part-of-speech argument). The result is
    printed and returned.

    Args:
        sentence: text to process; it is split on whitespace.

    Returns:
        The lemmatized words joined by single spaces.
    '''
    lemmatized = ' '.join(Word(word.lower()).lemmatize() for word in sentence.split())
    print(lemmatized)
    # Return the text as the docstring promises; without this, callers such as
    # tokenize() received None.
    return lemmatized

In [18]:
# Print the TextBlob-lemmatized form of each note.
lemmatizer2(note1)
lemmatizer2(note2)
lemmatizer2(note3)

worker slipped while carrying groceries. worker fractured his elbow
worker developed carpal tunnel from repetitive typing
worker got traumatized from nlp presentation


In [19]:
def tokenize(sentence):
    '''Chain the preprocessing helpers: lower, then stemmer, then lemmatizer2.

    Each helper prints its own intermediate result. The value returned is
    whatever ``lemmatizer2`` returns for the stemmed text.
    '''
    return lemmatizer2(stemmer(lower(sentence)))

In [20]:
# Run the full pipeline on note1; each stage prints one line (lower-cased, stemmed, lemmatized).
tokenize(note1)

worker slipped while carrying groceries. worker fractured his elbow
worker slip while carri groceries. worker fractur hi elbow
worker slip while carri groceries. worker fractur hi elbow


**spaCy Lemmatization**

In [21]:
# Load spaCy's small English pipeline with the parser and NER components disabled;
# they are not needed for the lemmas extracted below.
# The full package name is used because the 'en' shortcut is no longer accepted
# by spaCy v3, while 'en_core_web_sm' loads wherever that package is installed.
nlp_spacy = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [22]:
# Run the lower-cased note1 through the spaCy pipeline ...
doc1 = nlp_spacy(note1.lower())
# ... and show the lemma of every token as one space-joined string
" ".join(token.lemma_ for token in doc1)

'worker slip while carry grocery . worker fracture -PRON- elbow'

In [23]:
# Run the lower-cased note2 through the spaCy pipeline ...
doc2 = nlp_spacy(note2.lower())
# ... and show the lemma of every token as one space-joined string
" ".join(token.lemma_ for token in doc2)

'worker develop carpal tunnel from repetitive typing'

In [24]:
# Run the lower-cased note3 through the spaCy pipeline ...
doc3 = nlp_spacy(note3.lower())
# ... and show the lemma of every token as one space-joined string
" ".join(token.lemma_ for token in doc3)

'worker get traumatize from nlp presentation'

In [25]:
# Rebind `corpus`: from here on it is a list with one entry per note
# (earlier in the notebook it was a single concatenated string).
corpus = [note1, note2, note3]

**CountVectorizer**

In [26]:
# Bag-of-words: learn the vocabulary from the corpus and count each term per document.
# The fitted vectorizer keeps its own name so it is not lost when `countvec` is bound to the table.
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older releases.
get_names = getattr(count_vectorizer, 'get_feature_names_out', None) or count_vectorizer.get_feature_names
header = list(get_names())
idx = ['doc_1', 'doc_2', 'doc_3']

# Dense document-term table: one row per document, one column per vocabulary term.
countvec = pd.DataFrame(X.toarray(), columns=header, index=idx)
countvec

Unnamed: 0,carpal,carrying,developed,elbow,fractured,from,got,groceries,his,nlp,presentation,repetitive,slipped,traumatized,tunnel,typing,while,worker
doc_1,0,1,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1,2
doc_2,1,0,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1
doc_3,0,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1


In [27]:
# Write the count table to an Excel file (relative path).
countvec.to_excel('Count_Vectorizer.xlsx')

**TFIDF**

In [39]:
# TF-IDF with the vectorizer's default settings.
# The fitted vectorizer keeps its own name so it is not lost when `tfidf_vec` is bound to the table.
tfidf_vectorizer = TfidfVectorizer()
Y = tfidf_vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older releases.
get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
header = list(get_names())
idx = ['doc_1', 'doc_2', 'doc_3']

# Dense document-term table: one row per document, one column per vocabulary term.
tfidf_vec = pd.DataFrame(Y.toarray(), columns=header, index=idx)
tfidf_vec


Unnamed: 0,carpal,carrying,developed,elbow,fractured,from,got,groceries,his,nlp,presentation,repetitive,slipped,traumatized,tunnel,typing,while,worker
doc_1,0.0,0.345129,0.0,0.345129,0.345129,0.0,0.0,0.345129,0.345129,0.0,0.0,0.0,0.345129,0.0,0.0,0.0,0.345129,0.407678
doc_2,0.410747,0.0,0.410747,0.0,0.0,0.312384,0.0,0.0,0.0,0.0,0.0,0.410747,0.0,0.0,0.410747,0.410747,0.0,0.242594
doc_3,0.0,0.0,0.0,0.0,0.0,0.34262,0.450504,0.0,0.0,0.450504,0.450504,0.0,0.0,0.450504,0.0,0.0,0.0,0.266075


In [40]:
# Write the TF-IDF table to an Excel file (relative path).
tfidf_vec.to_excel('TFIDF_Vectorizer.xlsx')

Calculating the TF-IDF weight of the word "worker" in note1 (doc_1) by hand, starting from the un-normalised TF-IDF values (`norm=None`):

In [46]:
# TF-IDF again, this time with norm=None so the raw tf * idf values are visible.
# The fitted vectorizer keeps its own name so it is not lost when `tfidf_vec2` is bound to the table.
tfidf_vectorizer_raw = TfidfVectorizer(norm=None)
Y2 = tfidf_vectorizer_raw.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older releases.
get_names = getattr(tfidf_vectorizer_raw, 'get_feature_names_out', None) or tfidf_vectorizer_raw.get_feature_names
header = list(get_names())
idx = ['doc_1', 'doc_2', 'doc_3']

# Dense document-term table: one row per document, one column per vocabulary term.
tfidf_vec2 = pd.DataFrame(Y2.toarray(), columns=header, index=idx)
tfidf_vec2


Unnamed: 0,carpal,carrying,developed,elbow,fractured,from,got,groceries,his,nlp,presentation,repetitive,slipped,traumatized,tunnel,typing,while,worker
doc_1,0.0,1.693147,0.0,1.693147,1.693147,0.0,0.0,1.693147,1.693147,0.0,0.0,0.0,1.693147,0.0,0.0,0.0,1.693147,2.0
doc_2,1.693147,0.0,1.693147,0.0,0.0,1.287682,0.0,0.0,0.0,0.0,0.0,1.693147,0.0,0.0,1.693147,1.693147,0.0,1.0
doc_3,0.0,0.0,0.0,0.0,0.0,1.287682,1.693147,0.0,0.0,1.693147,1.693147,0.0,0.0,1.693147,0.0,0.0,0.0,1.0


In [47]:
# Reproduce by hand the normalised TF-IDF weight of "worker" in doc_1.
# tf: count of "worker" in doc_1 (see the CountVectorizer table)
tf = 2
# N: number of documents in the corpus
N = 3
# df: number of documents that contain "worker"
df = 3
# Smoothed inverse document frequency: ln((N + 1) / (1 + df)) + 1
idf = (np.log((N+1)/(1+df)))+1
# First row (doc_1) of the norm=None TF-IDF table
doc1 = tfidf_vec2.iloc[0,:]
# Euclidean length of that row
norm = np.linalg.norm(doc1)
# Dividing tf * idf by the row length gives the value shown for "worker"/doc_1 in the default TF-IDF table
print(tf*idf/norm)

0.4076776724611254


In [45]:
# NOTE(review): this cell's execution count (In [45]) is lower than that of the cell above (In [47]),
# and the values shown below match the normalised TF-IDF row for doc_1 rather than the norm=None row
# that the cell above assigns to doc1 -- re-run the notebook top to bottom to refresh this output.
doc1

carpal          0.000000
carrying        0.345129
developed       0.000000
elbow           0.345129
fractured       0.345129
from            0.000000
got             0.000000
groceries       0.345129
his             0.345129
nlp             0.000000
presentation    0.000000
repetitive      0.000000
slipped         0.345129
traumatized     0.000000
tunnel          0.000000
typing          0.000000
while           0.345129
worker          0.407678
Name: doc_1, dtype: float64

**Cosine similarity**

In [49]:
# Two example vectors, each stored as a 2-D array of shape (1, 3).
vec1 = np.array([2.3, 0, 1.5]).reshape(1, -1)
vec2 = np.array([[5.4, 2, 0]])

In [50]:
# Cosine similarity between the two row vectors; the result is displayed as a 1x1 array.
cosine_similarity(vec1, vec2)

array([[0.7854683]])

**LSA (Topic Modeling)**

In [72]:
Y.T.shape # shape of the transposed TF-IDF matrix, i.e. the term-document matrix (terms x documents)

(18, 3)

In [52]:
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100, random_state=42) ## n_components is the k (number of components kept)

In [73]:
# Fit the truncated SVD on the term-document matrix and print the singular values it kept.
svd_model.fit(Y.T)
print(svd_model.singular_values_)

[1.12058087 0.95723259]


In [74]:
# components_ of the fitted model: displayed below as a 2 x 3 array (one row per component, one column per document).
VT = svd_model.components_
VT

array([[ 0.49760883,  0.60774595,  0.61889443],
       [ 0.86520243, -0.3985414 , -0.30428525]])

In [76]:
# Call randomized_svd directly to obtain all three factors (U, Sigma, VT) of the truncated decomposition.
# This rebinds VT, which the previous cell set from svd_model.components_.
# NOTE(review): this import sits mid-notebook; consider moving it into the first import cell.
from sklearn.utils.extmath import randomized_svd
n = 2
U, Sigma, VT = randomized_svd(Y.T, 
                              n_components=n,
                              n_iter=100,
                              random_state=42) # sklearn's implementation only shows the k largest singular values

In [68]:
# First factor returned by randomized_svd: one row per term, one column per component.
U

array([[ 0.22276815, -0.17101342],
       [ 0.15325919,  0.31194778],
       [ 0.22276815, -0.17101342],
       [ 0.15325919,  0.31194778],
       [ 0.15325919,  0.31194778],
       [ 0.35864919, -0.2389722 ],
       [ 0.24881244, -0.14320631],
       [ 0.15325919,  0.31194778],
       [ 0.15325919,  0.31194778],
       [ 0.24881244, -0.14320631],
       [ 0.24881244, -0.14320631],
       [ 0.22276815, -0.17101342],
       [ 0.15325919,  0.31194778],
       [ 0.24881244, -0.14320631],
       [ 0.22276815, -0.17101342],
       [ 0.22276815, -0.17101342],
       [ 0.15325919,  0.31194778],
       [ 0.45955778,  0.18289954]])

In [69]:
# Singular values returned by randomized_svd; the output matches svd_model.singular_values_ printed earlier.
Sigma

array([1.12058087, 0.95723259])

In [70]:
# Third factor returned by randomized_svd; the output matches svd_model.components_ shown earlier.
VT

array([[ 0.49760883,  0.60774595,  0.61889443],
       [ 0.86520243, -0.3985414 , -0.30428525]])