#### One-Hot Encoding



In [1]:
# Toy corpus used by every vectorisation example in this notebook.
documents = [
    'Human love dogs.',
    'Dogs are intelligent.',
    'Dog eats meat',
    'Human love meat',
]

# Normalise each document: drop full stops and lowercase the text.
processed_docs = []
for raw_doc in documents:
    processed_docs.append(raw_doc.replace('.', '').lower())
print(processed_docs)

['human love dogs', 'dogs are intelligent', 'dog eats meat', 'human love meat']


In [2]:
# Map every distinct word to a 1-based id, numbered in order of first appearance.
vocab = {}
count = 0

for sentence in processed_docs:
    for term in sentence.split():
        if term in vocab:
            continue  # already numbered
        count += 1
        vocab[term] = count

print(vocab)

{'human': 1, 'love': 2, 'dogs': 3, 'are': 4, 'intelligent': 5, 'dog': 6, 'eats': 7, 'meat': 8}


In [3]:
def hot_encode(input_string, vocabulary=None):
    """One-hot encode every whitespace-separated word of ``input_string``.

    Args:
        input_string: Text to encode; it is split on whitespace.
        vocabulary: Optional mapping from word to 1-based index. When omitted,
            the notebook-level ``vocab`` dictionary is used.

    Returns:
        A list with one vector (list of ints) per word. Each vector has
        ``len(vocabulary)`` entries; the entry at ``index - 1`` is 1 for a
        known word, and the whole vector is zeros for an unknown word.
    """
    if vocabulary is None:
        vocabulary = vocab  # fall back to the notebook-level vocabulary
    size = len(vocabulary)
    onehot_encode = []
    for word in input_string.split():
        row = [0] * size
        index = vocabulary.get(word)
        if index is not None:
            row[index - 1] = 1  # ids are 1-based, list positions are 0-based
        onehot_encode.append(row)

    return onehot_encode


In [4]:
# Show the second document, then its word-by-word one-hot encoding.
sample_doc = processed_docs[1]
print(sample_doc)
hot_encode(sample_doc)

dogs are intelligent


[[0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0]]

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Tokenise every raw document into a list of lowercase words.
data = []
for raw_doc in documents:
    data.append(raw_doc.lower().replace('.', '').split())
print(data)

# NOTE: OneHotEncoder treats each token *position* as its own categorical
# feature, so the matrix has one group of columns per position rather than
# one column per vocabulary word.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(data)
encoded_text = one_hot_encoder.transform(data).toarray()

print('Onehot encoded matrix', encoded_text)

[['human', 'love', 'dogs'], ['dogs', 'are', 'intelligent'], ['dog', 'eats', 'meat'], ['human', 'love', 'meat']]
Onehot encoded matrix [[0. 0. 1. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 1. 0. 0. 1.]]


#### Bag of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus, then build its document-term count matrix.
count_vect = CountVectorizer()
count_vect.fit(processed_docs)
bow = count_vect.transform(processed_docs)

In [7]:
# Display the fitted mapping from each term to its feature index.
count_vect.vocabulary_

{'human': 4,
 'love': 6,
 'dogs': 2,
 'are': 0,
 'intelligent': 5,
 'dog': 1,
 'eats': 3,
 'meat': 7}

In [8]:
# Vectorise an unseen sentence with the already-fitted vocabulary.
unseen_text = ["dogs are loyal to human"]
temp = count_vect.transform(unseen_text)
temp.toarray()

array([[1, 0, 1, 0, 1, 0, 0, 0]])

In [9]:
#BoW with binary vectors
# `fit` returns the vectorizer itself, so construction and fitting can be chained.
count_vect = CountVectorizer(binary=True).fit(processed_docs)
count_vect

CountVectorizer(binary=True)

#### Bag of N-Grams

In [10]:
# Vocabulary made of unigrams, bigrams and trigrams.
count_vect = CountVectorizer(ngram_range=(1, 3))
count_vect.fit(processed_docs)
bow = count_vect.transform(processed_docs)

print(' vocab = ', count_vect.vocabulary_)

 vocab =  {'human': 10, 'love': 15, 'dogs': 5, 'human love': 11, 'love dogs': 16, 'human love dogs': 12, 'are': 0, 'intelligent': 14, 'dogs are': 6, 'are intelligent': 1, 'dogs are intelligent': 7, 'dog': 2, 'eats': 8, 'meat': 18, 'dog eats': 3, 'eats meat': 9, 'dog eats meat': 4, 'love meat': 17, 'human love meat': 13}


In [11]:
# Vectorise an unseen sentence with the fitted n-gram vocabulary.
unseen_text = ["dogs are loyal to human"]
temp = count_vect.transform(unseen_text)
temp.toarray()

array([[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

Note that the number of features (and hence the size of the feature vector) increased a lot for the same data, compared to the other single-word-based representations!

#### TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the corpus and keep the weighted document-term matrix.
tfidf = TfidfVectorizer()
bow_tfid = tfidf.fit_transform(processed_docs)

# Learned vocabulary and the inverse-document-frequency weight of each term.
feature_names = tfidf.get_feature_names_out()
idf_weights = tfidf.idf_
print('Words in vocabulary ', feature_names)
print('IDF for all words in vocab ', idf_weights)

corpus_matrix = bow_tfid.toarray()
print('TFIDF representation for all documents in our corpus', corpus_matrix)

# Vectorise an unseen sentence with the fitted model.
temp = tfidf.transform(["dogs are loyal to human"])
print('temp representation ', temp.toarray())


Words in vocabulary  ['are' 'dog' 'dogs' 'eats' 'human' 'intelligent' 'love' 'meat']
IDF for all words in vocab  [1.91629073 1.91629073 1.51082562 1.91629073 1.51082562 1.91629073
 1.51082562 1.51082562]
TFIDF representation for all documents in our corpus [[0.         0.         0.57735027 0.         0.57735027 0.
  0.57735027 0.        ]
 [0.61761437 0.         0.48693426 0.         0.         0.61761437
  0.         0.        ]
 [0.         0.61761437 0.         0.61761437 0.         0.
  0.         0.48693426]
 [0.         0.         0.         0.         0.57735027 0.
  0.57735027 0.57735027]]
temp representation  [[0.66767854 0.         0.52640543 0.         0.52640543 0.
  0.         0.        ]]


### Word2Vec

Using a pre-trained Word2vec model for doing feature extraction and performing text classification.

The Sentiment Labelled Sentences dataset from the UCI repository (http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences) is used.

The dataset consists of 1500 positive, and 1500 negative sentiment sentences from Amazon, Yelp, IMDB. 

The pre-trained embedding model is the Google News vectors (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM).

In [13]:
!pip install wget

#basic imports
import warnings
warnings.filterwarnings('ignore')
import os
import wget
import gzip
import shutil
from time import time

#pre-processing imports
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

#imports related to modeling
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=942c56ade8e1f7cf79f78e04389824185d857504bd27c9c1347a53ad1a698674
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [14]:
#Load the pre-trained word2vec model and the dataset
    
from google.colab import files
data_path= "DATAPATH"
!wget -P DATAPATH https://text-segmentation-wordembedding.s3.amazonaws.com/GoogleNews-vectors-negative300.bin.gz
!gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz      
path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'
training_data_path = "sentiment_sentences.txt"
      
#Load W2V model. This will take some time. 
%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
print('done loading Word2Vec')

#Read text data, cats.
#Each line of the file holds a sentence and its category, separated by a tab.
texts = []
cats = []
# The context manager closes the file even if a line fails to parse.
with open(training_data_path) as fh:
    for line in fh:
        line = line.rstrip("\r\n")  # keep the line break out of the label
        if not line:
            continue  # tolerate blank lines, e.g. at the end of the file
        # Split on the last tab so a tab inside the sentence cannot break unpacking.
        text, sentiment = line.rsplit("\t", 1)
        texts.append(text)
        cats.append(sentiment)

--2022-10-10 14:23:37--  https://text-segmentation-wordembedding.s3.amazonaws.com/GoogleNews-vectors-negative300.bin.gz
Resolving text-segmentation-wordembedding.s3.amazonaws.com (text-segmentation-wordembedding.s3.amazonaws.com)... 54.231.172.81
Connecting to text-segmentation-wordembedding.s3.amazonaws.com (text-segmentation-wordembedding.s3.amazonaws.com)|54.231.172.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’


2022-10-10 14:24:13 (43.8 MB/s) - ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]

CPU times: user 39.3 s, sys: 5.18 s, total: 44.5 s
Wall time: 44.8 s
done loading Word2Vec


In [15]:
#Inspect the model
# gensim >= 4.0 exposes the vocabulary as `key_to_index`; earlier releases
# used `vocab`. Support both so the cell runs on either version.
if hasattr(w2v_model, "key_to_index"):
    word2vec_vocab = w2v_model.key_to_index.keys()
else:
    word2vec_vocab = w2v_model.vocab.keys()
word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]
print(len(word2vec_vocab))

3000000


In [16]:
#Inspect the dataset
# Label and sentence counts, followed by the second sentence and its label.
n_labels, n_texts = len(cats), len(texts)
print(n_labels, n_texts)
print(texts[1], cats[1], sep="\n")

3000 3000
Good case, Excellent value.
1



In [17]:
#preprocess the text.
def preprocess_corpus(texts):
    """Tokenise each text with ``word_tokenize`` and clean its tokens.

    A token is kept, in lowercase form, only when its lowercase form is not an
    English stopword, the token is not made up entirely of digits, and the
    token does not occur in ``string.punctuation``.

    Returns:
        A list holding one list of cleaned tokens per input text.
    """
    english_stops = set(stopwords.words("english"))

    def remove_stops_digits(tokens):
        # Lowercase the tokens and drop stopwords, digit-only tokens and punctuation.
        kept = []
        for tok in tokens:
            lowered = tok.lower()
            if lowered in english_stops:
                continue
            if tok.isdigit():
                continue
            # `punctuation` is a str, so this is a substring test on the raw token.
            if tok in punctuation:
                continue
            kept.append(lowered)
        return kept

    cleaned = []
    for text in texts:
        cleaned.append(remove_stops_digits(word_tokenize(text)))
    return cleaned

# Clean the whole corpus, then spot-check the counts and the second example.
texts_processed = preprocess_corpus(texts)
n_labels, n_processed = len(cats), len(texts_processed)
print(n_labels, n_processed)
print(texts_processed[1], cats[1], sep="\n")

3000 3000
['good', 'case', 'excellent', 'value']
1



In [18]:
# Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists, model=None, dimension=300):
    """Turn each token list into the mean of its tokens' embedding vectors.

    Args:
        list_of_lists: Iterable of token lists, one per sentence.
        model: Optional embedding lookup supporting ``token in model`` and
            ``model[token]``. When omitted, the notebook-level ``w2v_model``
            is used.
        dimension: Length of the embedding vectors (300 by default).

    Returns:
        A list with one ``numpy`` array of length ``dimension`` per token list.
        Tokens missing from the model are ignored. A sentence with no known
        token gets an all-zero vector.
    """
    if model is None:
        model = w2v_model  # fall back to the notebook-level embeddings
    feats = []
    for tokens in list_of_lists:
        total = np.zeros(dimension)
        hits = 0
        for token in tokens:
            if token in model:
                total += model[token]
                hits += 1
        # Divide by the exact number of known tokens. With no known token,
        # `total` is still all zeros and is used as the feature vector.
        feats.append(total / hits if hits else total)
    return feats


# Build one averaged embedding vector per pre-processed sentence.
train_vectors = embedding_feats(texts_processed)
n_vectors = len(train_vectors)
print(n_vectors)

3000


In [19]:
#Training logisticRegression
# Seed the train/test split as well as the classifier so that the reported
# metrics are reproducible from one run to the next.
classifier = LogisticRegression(random_state=1234)
train_data, test_data, train_cats, test_cats = train_test_split(
    train_vectors, cats, random_state=1234
)
classifier.fit(train_data, train_cats)
print("Accuracy: ", classifier.score(test_data, test_cats))
preds = classifier.predict(test_data)
print(classification_report(test_cats, preds))

Accuracy:  0.8106666666666666
              precision    recall  f1-score   support

          0
       0.82      0.80      0.81       374
          1
       0.81      0.82      0.81       376

    accuracy                           0.81       750
   macro avg       0.81      0.81      0.81       750
weighted avg       0.81      0.81      0.81       750

