# Word2Vec for Text Classification

In this short notebook, we will see an example of how to use a pre-trained Word2Vec model to extract features and perform text classification.

We will use the Sentiment Labelled Sentences dataset from the UCI Machine Learning Repository:
http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

The dataset consists of 1500 positive and 1500 negative sentiment sentences from Amazon, Yelp, and IMDB. Let us first combine the three separate data files into one using the following Unix command:

```cat amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt > sentiment_sentences.txt```

For a pre-trained embedding model, we will use the Google News vectors.
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM

Let us get started!

In [2]:
import pkg_resources

def get_library_versions(library_list):
    """Return pinned ``name==version`` strings for the installed packages.

    Args:
        library_list: iterable of distribution names (e.g. ``"scikit-learn"``).

    Returns:
        A list with one ``"<name>==<version>"`` entry per package that is
        installed, in the order given. Packages that cannot be found are
        reported on stdout and left out of the result.
    """
    # importlib.metadata is the standard-library replacement for the
    # deprecated pkg_resources API; imported locally so this function does
    # not depend on setuptools being installed.
    from importlib import metadata

    frozen_list = []

    for library in library_list:
        try:
            version = metadata.version(library)
        except metadata.PackageNotFoundError:
            print(f"Error: {library} not found or could not retrieve version.")
        else:
            frozen_list.append(f"{library}=={version}")

    return frozen_list

# Packages this notebook relies on
libraries = [
    "numpy",
    "pandas",
    "gensim",
    "nltk",
    "scikit-learn",
    "gdown",
]

# Resolve each package to a pinned "name==version" string
frozen_versions = get_library_versions(libraries)

# Show one pinned requirement per line
for item in frozen_versions:
    print(item)

numpy==1.23.5
pandas==1.5.3
gensim==4.3.1
nltk==3.8.1
scikit-learn==1.2.2
gdown==4.6.6


In [3]:
# Installs only the pinned requirements of this notebook.
# NOTE: the install lines below are active, so running this cell installs the
# packages; comment them out if your environment already satisfies these pins.

# ===========================

!pip install numpy==1.23.5
!pip install pandas==1.5.3
!pip install gensim==4.3.1
!pip install nltk==3.8.1
!pip install scikit-learn==1.2.2
!pip install gdown==4.6.6

# ===========================



In [4]:
# To install the requirements for the entire chapter, uncomment the lines below and run this cell

# ===========================

# try:
#     import google.colab
#     !curl  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install
# except ModuleNotFoundError:
#     !pip install -r "ch4-requirements.txt"

# ===========================

In [5]:
#basic imports
import warnings
warnings.filterwarnings('ignore')
import os
import gzip
import shutil
from time import time

#pre-processing imports
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

#imports related to modeling
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#google-drive download imports
import gdown

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
try:
    from google.colab import files

    # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt' present in "sentiment labelled sentences" folder
    uploaded = files.upload()

    !mkdir DATAPATH
    !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt
    !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt

except ModuleNotFoundError:

    fil = 'sentiment_sentences.txt'

    if not os.path.exists("Data/sentiment_sentences.txt"):
        file = open(os.path.join(path, fil), 'w')
        file.close()

        # combined the three files to make sentiment_sentences.txt
        filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']

        with open('Data/sentiment_sentences.txt', 'w') as outfile:
            for fname in filenames:
                with open('Data/sentiment labelled sentences/' + fname) as infile:
                    outfile.write(infile.read())
        print("File created")
    else:
        print("File already exists")

In [20]:
#Load the pre-trained word2vec model and the dataset

def check_if_file_exists(filename: str, locations: list) -> str :
    """Find the first directory in ``locations`` that contains ``filename``.

    Args:
        filename: name of the file to look for.
        locations: directories to probe, in priority order.

    Returns:
        The matching entry of ``locations`` (exactly as passed in), or None
        when no listed directory contains the file.
    """
    hits = (
        folder
        for folder in locations
        if os.path.exists(os.path.join(folder, filename))
    )
    # Generators are lazy, so probing stops at the first hit.
    return next(hits, None)

def extract_data(location: str, output_dir: str = './Data') -> None:
    """Decompress the Google News word2vec archive found in ``location``.

    Reads ``GoogleNews-vectors-negative300.bin.gz`` from ``location`` and
    writes ``GoogleNews-vectors-negative300.bin`` into ``output_dir``.

    Args:
        location: directory containing the ``.bin.gz`` archive.
        output_dir: directory receiving the extracted ``.bin`` file; created
            if it does not exist. Defaults to ``./Data``.

    Raises:
        FileNotFoundError: if the archive is not present in ``location``.
    """
    compressed_path = os.path.join(location, 'GoogleNews-vectors-negative300.bin.gz')
    extracted_path = os.path.join(output_dir, 'GoogleNews-vectors-negative300.bin')
    partial_path = extracted_path + '.part'

    os.makedirs(output_dir, exist_ok=True)

    # Extract to a temporary name and rename at the end, so an interrupted
    # run never leaves a truncated .bin that looks like a finished extract.
    try:
        with gzip.open(compressed_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, extracted_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)

try:
    from google.colab import files
    data_path= "DATAPATH"
    !gdown -O DATAPATH/ https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download
    !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz
    path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'
    training_data_path = "DATAPATH/sentiment_sentences.txt"

except ModuleNotFoundError:

    data_path = './Data/'
    compressed_file_name = 'GoogleNews-vectors-negative300.bin.gz'
    extracted_file_name = 'GoogleNews-vectors-negative300.bin'

    # Check if Extracted File exists
    location_of_extracted_file = check_if_file_exists(extracted_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])

    if location_of_extracted_file:
        # Extracted File exists
        path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)

    else:
        location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])

        if location_of_compressed_file:
            # Compressed File exists
            extract_data(os.path.join(location_of_compressed_file))
            path_to_model = os.path.join(data_path, extracted_file_name)

        else:
            # Download File
            output_path = './Data/'
            gdown.download("https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download", output=output_path)

            # Extract File
            extract_data(output_path)

            path_to_model = os.path.join(data_path, extracted_file_name)

    print(f"Data Present at location : {path_to_model}")
    training_data_path = os.path.join(data_path, "sentiment_sentences.txt")


#Load W2V model. This will take some time.
%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
print('done loading Word2Vec')

#Read text data, cats.
#the file path consists of tab separated sentences and cats.
texts = []
cats = []
fh = open(training_data_path)
for line in fh:
    text, sentiment = line.split("\t")
    texts.append(text)
    cats.append(sentiment)

Downloading...
From: https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
To: /content/DATAPATH/GoogleNews-vectors-negative300.bin.gz
100% 1.65G/1.65G [00:15<00:00, 103MB/s]
CPU times: user 28.1 s, sys: 3.92 s, total: 32.1 s
Wall time: 34.4 s
done loading Word2Vec


In [21]:
# Inspect the model: collect its vocabulary and report how many entries it has
word2vec_vocab = w2v_model.key_to_index.keys()
word2vec_vocab_lower = list(map(str.lower, word2vec_vocab))
print(len(word2vec_vocab))

3000000


In [22]:
# Inspect the dataset: sizes of both lists, then one sample sentence and its label
print(len(cats), len(texts))
print(texts[1], cats[1], sep="\n")

3000 3000
Good case, Excellent value.
1



In [23]:
#preprocess the text.
def preprocess_corpus(texts):
    """Tokenize every text with ``word_tokenize`` and filter the tokens.

    A token is dropped when its lowercase form is an English stopword, when
    it consists only of digits, or when it occurs inside
    ``string.punctuation``. Surviving tokens are lowercased.

    Args:
        texts: iterable of raw sentences.

    Returns:
        A list with one list of cleaned tokens per input text.
    """
    mystopwords = set(stopwords.words("english"))

    def remove_stops_digits(tokens):
        # Lowercase the tokens and discard stopwords, digits and punctuation.
        kept = []
        for token in tokens:
            lowered = token.lower()
            if lowered in mystopwords:
                continue
            if token.isdigit():
                continue
            # `punctuation` is a string, so this is a substring test
            if token in punctuation:
                continue
            kept.append(lowered)
        return kept

    cleaned_texts = []
    for text in texts:
        cleaned_texts.append(remove_stops_digits(word_tokenize(text)))
    return cleaned_texts

# Pre-process the whole corpus, then sanity-check sizes and one example
texts_processed = preprocess_corpus(texts)
print(len(cats), len(texts_processed))
print(texts_processed[1], cats[1], sep="\n")

3000 3000
['good', 'case', 'excellent', 'value']
1



In [24]:
# Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists, model=None, dimension=None):
    """Average the word embeddings of each token list into one feature vector.

    Args:
        list_of_lists: iterable of token lists, one list per sentence.
        model: embedding lookup supporting ``token in model`` and
            ``model[token]``. Defaults to the notebook-level ``w2v_model``.
        dimension: length of the embedding vectors. Defaults to
            ``model.vector_size`` when the model exposes it, otherwise 300.

    Returns:
        A list with one 1-D numpy array per input token list: the mean of the
        embeddings of the tokens found in ``model``, or an all-zeros vector
        when none of the tokens is known (or the list is empty).
    """
    if model is None:
        model = w2v_model
    if dimension is None:
        dimension = getattr(model, "vector_size", 300)

    feats = []
    for tokens in list_of_lists:
        feat_for_this = np.zeros(dimension)
        # Integer count: the earlier 1e-5 offset biased every average and
        # made the zero-count check unreachable.
        count_for_this = 0
        for token in tokens:
            if token in model:
                feat_for_this += model[token]
                count_for_this += 1
        if count_for_this:
            feats.append(feat_for_this / count_for_this)
        else:
            # still all zeros, and a fresh array per sentence (no aliasing)
            feats.append(feat_for_this)
    return feats


# Build one averaged-embedding feature vector per pre-processed sentence
train_vectors = embedding_feats(texts_processed)
print(len(train_vectors))

3000


In [25]:
#Take any classifier (LogisticRegression here), and train/test it like before.
classifier = LogisticRegression(random_state=1234)
# Seed the split as well; otherwise every run draws a different train/test
# partition and the reported metrics cannot be reproduced.
train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats, random_state=1234)
classifier.fit(train_data, train_cats)
print("Accuracy: ", classifier.score(test_data, test_cats))
preds = classifier.predict(test_data)
print(classification_report(test_cats, preds))

Accuracy:  0.8013333333333333
              precision    recall  f1-score   support

          0
       0.77      0.83      0.80       353
          1
       0.84      0.78      0.81       397

    accuracy                           0.80       750
   macro avg       0.80      0.80      0.80       750
weighted avg       0.80      0.80      0.80       750



Not bad. With little effort we got about 80% accuracy. That's a great starting model to have!