# Step2. Extract Features

This Jupyter notebook demonstrates how to convert the processed data into a TF-IDF matrix and how to train a word2vec model on the same processed data.

In [1]:
import numpy as np
import re
import ast
import pickle
from os.path import exists
import nltk
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [2]:
# --- Word2Vec hyper-parameters (passed to gensim's Word2Vec below) ---
VECTOR_SIZE = 100  # dimensionality of the learned word vectors
WINDOW_SIZE = 5    # context window size
MIN_COUNT = 20     # words occurring fewer times than this are ignored
SG = 1             # training algorithm: 1 = skip-gram, otherwise CBOW
NEGATIVE = 20      # number of "noise" words drawn for negative sampling

# --- TF-IDF hyper-parameters (passed to sklearn's TfidfVectorizer below) ---
MIN_DF = 20            # integer: drop terms found in fewer than this many documents
MAX_DF = 0.8           # float: drop terms found in more than this share of documents
NORM_FUNCTION = 'l1'   # normalisation applied to each document's TF-IDF row

In [3]:
# Locations are relative to the notebook's working directory. Each directory
# keeps its trailing slash because file names are appended by plain string
# concatenation further down.

# Where the preprocessed articles are read from.
INPUT_PATH = '../data/processed_data/'
INPUT_FILE = 'processed_data.tsv'

# Where the TF-IDF pickles and the word2vec model files are written.
OUTPUT_PATH = '../data/extracted_features/'

## Read processed data

In [4]:
# Load the preprocessed articles from the tab-separated input file.
# The rest of the notebook reads the documents from the 'text' column.
input_df = pd.read_csv(
    INPUT_PATH + INPUT_FILE,
    sep='\t',
    encoding='utf-8',
)

## Convert article data to TF-IDF matrix

In [5]:
def write_object(obj, output_fname):
    """Pickle ``obj`` to the file ``output_fname``.

    Parameters
    ----------
    obj : object
        Any picklable Python object.
    output_fname : str
        Destination path; an existing file is overwritten.

    Raises
    ------
    OSError
        If the file cannot be opened for writing.
    pickle.PicklingError
        If ``obj`` cannot be pickled.
    """
    # The context manager closes the handle even when pickle.dump raises, so
    # a failed dump never leaks an open file descriptor.
    with open(output_fname, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [6]:
def data_vectorization(PATH, training_docs, MIN_DF, MAX_DF, NORM_FUNCTION):
    """Fit a TF-IDF vectorizer on ``training_docs``, or reload a cached fit.

    The fitted vectorizer and the resulting document-term matrix are cached
    as ``covid_tfidf_v.pkl`` and ``covid_tfidf_d.pkl`` under ``PATH``.

    Parameters
    ----------
    PATH : str
        Directory prefix for the cache files. The file names are appended by
        string concatenation, so it must end with a path separator.
    training_docs : iterable of str
        Documents to vectorize; only used when no complete cache exists.
    MIN_DF, MAX_DF, NORM_FUNCTION
        Forwarded to ``TfidfVectorizer`` as ``min_df``, ``max_df`` and
        ``norm``.

    Returns
    -------
    tuple
        ``(V, D)``: the fitted vectorizer and the matrix returned by
        ``V.fit_transform(training_docs)``.

    Notes
    -----
    The cache is keyed by file name only. If both pickles exist they are
    returned as-is, even if the documents or hyper-parameters have changed;
    delete the files to force a re-fit.
    """
    V_fname = PATH + "covid_tfidf_v.pkl"
    D_fname = PATH + "covid_tfidf_d.pkl"

    # Both pickles are loaded below, so both must exist. Checking only the
    # vectorizer file would crash on a half-written cache (e.g. a run that
    # was interrupted between the two writes).
    if exists(V_fname) and exists(D_fname):
        print("File {} already exist".format(V_fname))
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # cache files produced by this notebook.
        with open(V_fname, "rb") as v_file:
            V = pickle.load(v_file)
        with open(D_fname, "rb") as d_file:
            D = pickle.load(d_file)
    else:
        print('TfidfVectorizer is proceed')
        # Term frequency times inverse document frequency.
        V = TfidfVectorizer(analyzer='word',
                            min_df=MIN_DF,
                            max_df=MAX_DF,
                            norm=NORM_FUNCTION,
                            encoding='utf-8')

        D = V.fit_transform(training_docs)

        # Persist both objects so later runs can skip the fit.
        write_object(V, V_fname)
        write_object(D, D_fname)

    return V, D

In [7]:
# One raw document string per row of the 'text' column.
training_docs = list(input_df['text'])

# Fit the TF-IDF model, or reload it from the pickles under OUTPUT_PATH.
V, D = data_vectorization(
    PATH=OUTPUT_PATH,
    training_docs=training_docs,
    MIN_DF=MIN_DF,
    MAX_DF=MAX_DF,
    NORM_FUNCTION=NORM_FUNCTION,
)
print("Matrix shape:", D.shape)

TfidfVectorizer is proceed
Matrix shape: (557956, 10989)


## Train Word2vec model

In [8]:
def read_corpus(df):
    """Split every document in ``df['text']`` into a list of tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column of strings.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``object`` with one entry per row of ``df``, in
        row order. Each entry is the list produced by ``text.split(' ')``.

    Raises
    ------
    KeyError
        If ``df`` has no ``'text'`` column.
    AttributeError
        If a ``'text'`` value is not a string (e.g. NaN from an empty cell).
    """
    texts = df['text']

    # Allocate the object array up front and fill it element by element.
    # np.asarray on a list of unequal-length lists raises ValueError on
    # NumPy >= 1.24, and on equal-length lists it silently returns a 2-D
    # string array instead, so the result shape depended on the data.
    training_docs = np.empty(len(texts), dtype=object)

    # Iterating the column directly avoids the per-row Series that
    # DataFrame.iterrows builds.
    for position, text in enumerate(texts):
        training_docs[position] = text.split(' ')

    return training_docs

In [9]:
# Output file names embed the vector dimensionality (VECTOR_SIZE).
text_filename = "covid_{}d.txt".format(VECTOR_SIZE)
model_filename = "covid_{}d.model".format(VECTOR_SIZE)

# Tokenise every document; each entry of `corpus` is one document's tokens.
corpus = read_corpus(input_df)

# Train the embedding with the hyper-parameters from the configuration cell.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` -- confirm which gensim version this notebook is pinned to.
model = Word2Vec(
    sentences=corpus,
    size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    min_count=MIN_COUNT,
    sg=SG,
    negative=NEGATIVE,
)

# Save the result twice: the word vectors alone in the plain-text word2vec
# format (binary=False), and the full model via gensim's own save().
model.wv.save_word2vec_format(OUTPUT_PATH + text_filename, binary=False)
model.save(OUTPUT_PATH + model_filename)