In [1]:
import glob
import numpy as np
import pandas as pd
import string
from Config import *

In [2]:
def get_file_names():
    files = [f for f in glob.glob(CSV_DATA_PATH+"*.csv")]
    file_list = []
    for file in files:
        file_list.append(file)
    return file_list

In [3]:
def process_raw_to_hd():
    filenames = get_file_names()
    main_df = pd.DataFrame()
    for i in filenames:
        df = pd.read_csv(i)
        df['text'] = df['text'].apply(str) 
        main_df = pd.concat([main_df , df], ignore_index=True)
    if not os.path.exists(PROCESSED_DATA_PATH):
        os.mkdir(PROCESSED_DATA_PATH)
    main_df.to_hdf(PROCESSED_DATA_PATH + RAW_DATA, key='raw', append=True, format='t', min_itemsize={'text': 4096})
    return main_df

In [4]:
def clean_text(line):
    # Converting to lower
    line = line.lower()

    # Removing alphanumerics
    tokens = [word for word in line.split() if word.isalpha()]

    # Removing Punctuations
    translator = str.maketrans("", "", string.punctuation)
    tokens = [word.translate(translator) for word in tokens]

    # Removing stop_words
    # stop_words = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stop_words]

    # Removing short_words
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [5]:
process_raw_to_hd()
df = pd.read_hdf(PROCESSED_DATA_PATH + RAW_DATA)
df = df.dropna()
df['text'] = df['text'].apply(clean_text)
df = df.dropna()
df.to_hdf(PROCESSED_DATA_PATH + CLEAN_DATA, key='clean')
df = pd.read_hdf(PROCESSED_DATA_PATH + CLEAN_DATA,key='clean')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys
be ready to see PyTables asking for *lots* of memory and possibly slow
I/O.  You may want to reduce the rowsize by trimming the value of
dimensions that are orthogonal (and preferably close) to the *main*
dimension of this leave.  Alternatively, in case you have specified a
very small/large chunksize, you may want to increase/decrease it.
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['file', 'label', 'text']]

  pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [6]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

  from pandas import Panel


In [13]:
def labelize_sentences_ug(sent,label):
    result = []
    prefix = label
    for i, t in zip(sent.index, sent):
        result.append(TaggedDocument(t, [prefix + '_%s' % i]))
    return result

In [14]:
all_x = df.text
all_x_w2v = labelize_sentences_ug(all_x, 'all')

In [16]:
cores = multiprocessing.cpu_count()
model_ug_cbow = Word2Vec(sg=0, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])




100%|██████████| 25952/25952 [00:00<00:00, 2527705.39it/s]


In [17]:
%%time
for epoch in range(30):
    model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_cbow.alpha -= 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha




100%|██████████| 25952/25952 [00:00<00:00, 2489435.73it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2613710.26it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2633694.11it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2619560.02it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2585892.94it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2622715.89it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2682965.11it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2640402.12it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2498521.26it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2619244.85it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2674329.94it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2547284.88it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2596750.26it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2660732.76it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2553858.98it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2613773.02it/s]



100%|

CPU times: user 18.2 s, sys: 213 ms, total: 18.5 s
Wall time: 5.63 s


In [18]:
model_ug_sg = Word2Vec(sg=1, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])




100%|██████████| 25952/25952 [00:00<00:00, 2437699.09it/s]


In [19]:
%%time
for epoch in range(30):
    model_ug_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_sg.alpha -= 0.002
    model_ug_sg.min_alpha = model_ug_sg.alpha




100%|██████████| 25952/25952 [00:00<00:00, 2481547.00it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2593162.22it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2616411.73it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2179539.81it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2681048.70it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2511260.29it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2623284.75it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2546808.08it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2591063.49it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2605140.31it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2600161.89it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2547582.97it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2608261.51it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2565717.79it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2660862.85it/s]



100%|██████████| 25952/25952 [00:00<00:00, 2679860.59it/s]



100%|

CPU times: user 43.5 s, sys: 309 ms, total: 43.8 s
Wall time: 7.11 s


In [20]:
model_ug_cbow.save('w2v_model_ug_cbow.word2vec')
model_ug_sg.save('w2v_model_ug_sg.word2vec')

In [21]:
from gensim.models import KeyedVectors
model_ug_cbow = KeyedVectors.load('w2v_model_ug_cbow.word2vec')
model_ug_sg = KeyedVectors.load('w2v_model_ug_sg.word2vec')

In [22]:
len(model_ug_cbow.wv.vocab.keys())

6704