In [1]:
import spacy
import pandas as pd
import numpy as np
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim
import gensim.downloader

!python3 -m spacy download en_core_web_md

[nltk_data] Downloading package stopwords to /opt/conda/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Slow version of gensim.models.doc2vec is being used
Slow version of Fasttext is being used


In [2]:
# Print the names of the pretrained embedding models that gensim's downloader offers
# (not only word2vec: the list also contains fastText, ConceptNet Numberbatch and GloVe models).
models = list(gensim.downloader.info()['models'])

# The loop variable is deliberately NOT called `model`: that name holds the loaded
# embedding in a later cell, and re-running this cell would overwrite it with a string.
for model_name in models:
    print(model_name)

fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis


In [3]:
# Embedding used in this run: glove-twitter-200.
# (The author's note also mentions glove-wiki-gigaword-300, but only the twitter
# model is loaded in this cell.) Many of the other downloadable models are too
# large to load.
model = gensim.downloader.load('glove-twitter-200')

# Input sentences, and the spaCy pipeline used to tokenize them.
df1 = pd.read_csv('WikiLarge_Test.csv')
nlp = spacy.load("en_core_web_md")

In [4]:
# Tokenize every sentence first.
#
# Only the token texts are kept, so each text goes through the tokenizer alone
# (nlp.make_doc) rather than the full pipeline call nlp(text), whose additional
# components produce annotations that are never read here.
# Iterating over the column values (instead of df1['original_text'][i]) avoids a
# label-based lookup that only works when df1 has the default 0..n-1 index.
lst1 = [
    [token.text for token in nlp.make_doc(text)]
    for text in df1['original_text']
]

# Assign the list directly (row order) instead of wrapping it in pd.Series, which
# would be aligned on index labels and mismatch on a non-default index.
df1['tokens'] = lst1

In [5]:
def generate_dense_features(tokenized_texts, vec_model):
    """Build one averaged embedding vector per tokenized sentence.

    Every token is lower-cased and looked up in ``vec_model``; the vectors of
    the tokens that are present are averaged. A sentence in which no token is
    known to the model is represented by an all-zeros vector.

    Parameters
    ----------
    tokenized_texts : iterable of iterables of str
        One sequence of tokens per sentence.
    vec_model : object
        Embedding lookup exposing ``key_to_index`` (used for membership tests),
        ``vector_size`` and indexing with a list of words
        (``vec_model[list_of_words]``). Presumably a gensim ``KeyedVectors``,
        which is what this notebook passes in.

    Returns
    -------
    numpy.ndarray
        One row per sentence, ``vec_model.vector_size`` columns. An empty
        ``tokenized_texts`` yields an array of shape ``(0, vector_size)``.
    """
    vocab = vec_model.key_to_index
    dim = vec_model.vector_size
    sentence_vectors = []

    for tokens in tokenized_texts:
        # Lower-case each token once and keep only the words the model knows.
        lowered = (token.lower() for token in tokens)
        known_words = [word for word in lowered if word in vocab]

        if known_words:
            # Average the vectors of the known words.
            sentence_vectors.append(np.mean(vec_model[known_words], axis=0))
        else:
            # No word of this sentence is in the model: fall back to zeros.
            sentence_vectors.append(np.zeros(dim))

    if not sentence_vectors:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D so
        # callers can always rely on the (n_sentences, vector_size) layout.
        return np.empty((0, dim))

    return np.array(sentence_vectors)

In [6]:
# X_new is a NumPy array: each row is the average dense vector of one sentence
# in df1, so the shape displayed below is (number of sentences, embedding size).
X_new = generate_dense_features(df1['tokens'], model)
X_new.shape

(416768, 200)


In [7]:
# Save the dense feature matrix as CSV, one row per sentence, without the index column.
df = pd.DataFrame(data=X_new)
df.to_csv('twitter_vec_dense.csv', index=False)