In [175]:
#CRUD and preprocess
import nltk
import pandas as pd
import contractions
import re
import numpy as np

# nltk.download('wordnet')
# nltk.download('stopwords')
#nltk.download('punkt')

#Vectorize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import spacy

#general
import os
import sys
import time

In [None]:
print("Quick Installation : ")
# Run these once in a fresh environment (%pip installs into this kernel's environment):
# %pip install numpy
# %pip install pandas
# %pip install contractions
# %pip install nltk
# %pip install scikit-learn
# %pip install gensim

# Fetch the small English spaCy pipeline only when it is missing, so that
# "Restart & Run All" does not hit the network on every run.
# sys.executable makes the download land in the interpreter backing this kernel;
# a bare "python" resolves through PATH and may be a different environment.
if not spacy.util.is_package("en_core_web_sm"):
    os.system(f'"{sys.executable}" -m spacy download en_core_web_sm')



### Sample Data

In [120]:
# Six short, informal sentences used as the toy corpus for the
# preprocessing demo in the following cells.
sample_sentences = [
    "How are you guys doing today?",
    "Welcome to the Hands-on Session ?",
    "MS Teams is fun, isn't ? team building LOL",
    "I would rather come in person ! in person is much better",
    "I love swimming and playing badminton",
    "I miss Indian food and swim pool, but food the most",
]

# Single-column frame: one row per sentence, column name 'Text'.
data_dict = dict(Text=sample_sentences)
data_df = pd.DataFrame(data=data_dict)

# Bare last expression so the notebook renders the frame.
data_df

Unnamed: 0,Text
0,How are you guys doing today?
1,Welcome to the Hands-on Session ?
2,"MS Teams is fun, isn't ? team building LOL"
3,I would rather come in person ! in person is m...
4,I love swimming and playing badminton
5,"I miss Indian food and swim pool, but food the..."


In [121]:
def textPreprocessing(text):

    """
    Normalize one raw text string into a space-separated string of stemmed tokens.

    input : text (str)
    function : perform text pre-processing :
              1. Case Generalization
              2. Remove Contractions
              3. Punctuation Removal
              4. Tokenization
              5. Stopword Removal
              6. Stemming
    output : text (str)
    """

    #converting to lower case
    text = text.lower()

    #expanding contractions (e.g. "isn't" -> "is not")
    text = contractions.fix(text)

    #punctuation removal : drop every character that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    #tokenization
    token_list = nltk.WordPunctTokenizer().tokenize(text)

    #removing stopwords BEFORE stemming : the stopword list holds unstemmed words,
    #while the Porter stemmer turns e.g. "this" -> "thi", "was" -> "wa", "very" -> "veri",
    #so filtering after stemming would let those stopwords slip through.
    #A set gives constant-time membership checks instead of scanning the list per token.
    stopwords_set = set(nltk.corpus.stopwords.words('english'))
    content_token_list = [token for token in token_list if token not in stopwords_set]

    #stemming
    ps = nltk.stem.PorterStemmer()
    sanitized_token_list = [ps.stem(token) for token in content_token_list]

    return " ".join(sanitized_token_list)

In [122]:
# Clean every raw sentence with textPreprocessing and store the result
# next to the original text in a new 'Processed_Text' column.
data_df['Processed_Text'] = data_df['Text'].map(textPreprocessing)

# Bare last expression so the notebook renders the updated frame.
data_df

Unnamed: 0,Text,Processed_Text
0,How are you guys doing today?,guy today
1,Welcome to the Hands-on Session ?,welcom handson session
2,"MS Teams is fun, isn't ? team building LOL",ms team fun team build lol
3,I would rather come in person ! in person is m...,would rather come person person much better
4,I love swimming and playing badminton,love swim play badminton
5,"I miss Indian food and swim pool, but food the...",miss indian food swim pool food


### Vectorization

In [127]:
# vectorizer = CountVectorizer()

# X = vectorizer.fit_transform(data_df['Text'])

# count_matrix = X.toarray()

# feature_names = vectorizer.get_feature_names_out()

# count_df = pd.DataFrame(count_matrix, columns=feature_names)

# print("Count Vectorizer Count Matrix:")
# print(count_df)


In [128]:
# vectorizer = CountVectorizer(binary = True)

# X = vectorizer.fit_transform(data_df['Text'])

# count_matrix = X.toarray()

# feature_names = vectorizer.get_feature_names_out()

# count_df = pd.DataFrame(count_matrix, columns=feature_names)

# print("Binary Vectorizer Count Matrix:")
# print(count_df)


#### TF-IDF

In [123]:
def makeIdfDict(docs, set_of_words):
    """
    input : docs (sized iterable of str, each a single-space separated token string),
            set_of_words (iterable of str)
    function : for every word, divide the corpus size by the number of documents
               that contain the word as a whole token (raw ratio, no logarithm here)
    output : dict mapping word -> corpus_size / document_frequency

    Raises ZeroDivisionError when a word occurs in none of the documents.
    """
    total_docs = len(docs)
    ratio_by_word = {}

    for term in set_of_words:
        # Document frequency: how many documents list `term` among their tokens.
        containing_docs = sum(1 for document in docs if term in document.split(" "))
        ratio_by_word[term] = total_docs / containing_docs

    return ratio_by_word

In [124]:
def tfidfConversion(docs):

    """
    input : list of documents (pandas.Series /List)
    function : return tf-idf vectors
               tf(word, doc)  = occurrences of word in doc / number of tokens in doc
               idf(word)      = log2(corpus size / number of docs containing word)
               Every document is first cleaned with textPreprocessing.
               The input is NOT modified.
    output : numpy.ndarray of shape (number of docs, vocabulary size);
             columns follow the alphabetically sorted vocabulary
    """

    # Pre-process into a fresh list: writing back into `docs` would silently
    # overwrite the caller's data (e.g. the data_df['Text'] column).
    processed_docs = [textPreprocessing(doc) for doc in docs]

    # split() without arguments yields [] for an empty processed document,
    # so no "" pseudo-word ends up in the vocabulary.
    tokenized_docs = [doc.split() for doc in processed_docs]

    # Sorted vocabulary -> the column order is the same on every run
    # (plain set iteration order of strings is not stable across sessions).
    list_of_words = sorted({token for tokens in tokenized_docs for token in tokens})

    tfidf_mat = np.zeros(shape=(len(tokenized_docs), len(list_of_words)))

    idf_dict = makeIdfDict(processed_docs, set(list_of_words))

    for index, doc_list in enumerate(tokenized_docs):

        # A document with no tokens left keeps its all-zero row
        # (and avoids a division by zero in the term frequency).
        if not doc_list:
            continue

        for idx, word in enumerate(list_of_words):

            # Term frequency is normalised by the document length in tokens,
            # not by the number of characters in the word.
            tf = doc_list.count(word)/len(doc_list)
            idf = np.log2(idf_dict[word])
            tfidf_mat[index][idx] = tf * idf

    return tfidf_mat
    

In [125]:
# tfidfConversion(data_df['Text'])

In [126]:
# vectorizer = TfidfVectorizer()

# tfidf_matrix = vectorizer.fit_transform(data_df['Text'])
# tfidf_array = tfidf_matrix.toarray()

# feature_names = vectorizer.get_feature_names_out()
# tfidf_df = pd.DataFrame(tfidf_array, columns=feature_names)

# print("TF-IDF Matrix:")
# print(tfidf_df)

### Word to Vector

In [168]:
# Pre-trained alternative (kept disabled):
# import gensim.downloader as api

# model = api.load('word2vec-google-news-300')
# similar_words = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

# print(similar_words)


# Five tiny sentences used to train a toy Word2Vec model from scratch.
example_corpus = [
    "king is a man",
    "queen is a woman",
    "king rules the kingdom",
    "queen rules the kingdom",
    "king and queen live happily"
]

# Word2Vec is fed one list of whitespace-separated tokens per sentence.
example_tokens = [sentence.split() for sentence in example_corpus]
word2vec_params = dict(vector_size=50, window=10, min_count=1, sg=1)
model = Word2Vec(sentences=example_tokens, **word2vec_params)

# Analogy query: woman + king - man, five nearest words.
analogy_query = dict(positive=['woman', 'king'], negative=['man'], topn=5)
similar_words = model.wv.most_similar(**analogy_query)

print(similar_words)

[('is', 0.11321540921926498), ('live', 0.045143432915210724), ('rules', 0.04355676844716072), ('kingdom', -0.014120015315711498), ('the', -0.06369484961032867)]


In [169]:
def mean_pooling(text, model):
    """
    input : text (str), model (object exposing `vector_size` and a `wv` word lookup)
    function : lower-case and tokenize the text with nltk's word_tokenize, then
               average the vectors of the tokens that are present in model.wv
    output : numpy.ndarray of length model.vector_size
             (all zeros when none of the tokens is present in model.wv)
    """
    lowered_tokens = nltk.tokenize.word_tokenize(text.lower())
    pooled = np.zeros(model.vector_size)

    # Tokens missing from model.wv are skipped and do not count toward the mean.
    known_tokens = [token for token in lowered_tokens if token in model.wv]
    for token in known_tokens:
        pooled += model.wv[token]

    if known_tokens:
        pooled /= len(known_tokens)
    return pooled


In [152]:

# Word2Vec expects an iterable of token lists. Passing the raw strings makes it
# iterate each sentence character by character, so the vocabulary would consist
# of single characters. Tokenize exactly like mean_pooling does (lower-case +
# word_tokenize) so that its word lookups can hit this model's vocabulary.
tokenized_texts = [nltk.tokenize.word_tokenize(text.lower()) for text in data_df['Text']]

model = Word2Vec(sentences = tokenized_texts, vector_size=5 , window = 3, min_count = 1, sg = 0)

#print(model.wv)

In [170]:
# for text in data_df['Text']:
    
#     print(f"{text} : {mean_pooling(text, model)}")

### Doc2Vec

In [172]:

# Sample corpus
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Whitespace tokenization: punctuation stays attached to the word ("document.").
tokenized_corpus = [sentence.split() for sentence in corpus]

# Each document is tagged with its position in the corpus, as a string.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(tokenized_corpus)
]

# Create the model first, then build the vocabulary and train in explicit steps.
doc2vec_params = dict(vector_size=20, window=2, min_count=1, workers=4, epochs=100)
model = Doc2Vec(**doc2vec_params)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Infer a vector for a document that was not part of the training corpus.
new_doc = "This is a new document."
inferred_vector = model.infer_vector(new_doc.split())

print("Inferred Vector for the New Document:")
print(inferred_vector)


Inferred Vector for the New Document:
[-0.00956134 -0.00233659  0.01236907  0.00481267  0.02602485 -0.02370227
 -0.02823434  0.0021658   0.02366791  0.00902227  0.01244532 -0.01504472
 -0.03087573 -0.0311837  -0.00608703  0.02688278 -0.02435123 -0.01122472
 -0.02898115  0.01942457]


### Spacy

In [178]:
# Load the small English spaCy pipeline (the model the installation cell downloads).
nlp = spacy.load("en_core_web_sm")

text = "When is the session going to end again ?"

# Running the pipeline on the raw string returns a processed document object.
doc = nlp(text)

# One vector for the whole text, taken from the document object.
# NOTE(review): how spaCy derives doc.vector for this particular model is not
# visible here — check the spaCy docs before relying on it for similarity.
text_vector = doc.vector

print("Text Vector:")
print(text_vector)

Text Vector:
[ 0.22598982  0.01075336  0.3115277  -0.15644164 -0.38841882 -0.19919115
  0.38567078 -0.19412136  0.18805094  0.23093073 -0.09410948 -0.4862024
 -0.18364212  0.07392278 -0.13449486  0.02697322 -0.13064033 -0.37826908
  0.00733031  0.20476244  0.05846053  0.48386016  0.03605304 -0.2037969
  0.02228843  0.41838318  0.60664135  0.30401742  0.07945749  0.8597622
  0.18678263  0.0310543   0.13080072 -0.33917683  0.30802834 -0.17839202
  0.20650297 -0.45914268  0.01723362  0.56884426 -0.06824379  0.07195051
 -0.03867647 -0.07933543  0.7110081  -0.36233246  0.32163197 -0.01028265
  0.6106154  -0.15750289  0.04591224  0.5149987  -0.6774205  -0.22380434
  0.40804994 -0.7133755  -0.3249721   0.35796806  0.07726351 -0.14585829
  0.19366181 -0.2993465  -0.6528384  -0.00424204  0.2618007   0.25462076
  0.29615134 -0.37594736 -0.01486725 -0.18365957  0.320336   -0.40841898
 -0.5275203   0.4867237   0.01186347 -0.4025973  -0.03661235 -0.2070368
 -0.25535536  0.07064598 -0.24805583 -0.01