In [25]:
# ! pip install --upgrade numpy
# ! pip install gensim

In [24]:
# Load pre-trained Word2Vec model.
# w2v_model = gensim.models.Word2Vec.load("./results/model.w2v")


# def vector(word, w2v_model):
#     return w2v_model.wv.key_to_index[word]


import gensim
import numpy as np  
import pandas as pd

# WORD2VEC
# Default hyper-parameters picked up by the Word2Vec helpers defined below.
W2V_SIZE = 300  # word-vector length: default `vector_size` / `size` of the helpers
W2V_WINDOW = 7  # default `window` of train_w2v_model
W2V_EPOCH = 32  # default `epochs` of train_w2v_model
W2V_MIN_COUNT = 10  # default `min_count` of train_w2v_model
TRAIN_SIZE = 0.8  # not referenced in this section; presumably the train-split fraction - TODO confirm

# Train a Word2Vec model and build averaged document vectors.


def train_w2v_model(docs, vector_size=W2V_SIZE, window=W2V_WINDOW, min_count=W2V_MIN_COUNT, epochs=W2V_EPOCH):
    """Train a gensim Word2Vec model on whitespace-tokenised documents.

    Args:
        docs: Iterable of strings; each string is split on whitespace to
            obtain its tokens.
        vector_size: Length of the learned word vectors.
        window: Context window size handed to ``Word2Vec``.
        min_count: Minimum frequency threshold handed to ``Word2Vec``.
        epochs: Number of training passes over the corpus.

    Returns:
        The trained ``Word2Vec`` model. The vocabulary size is printed as a
        side effect before training starts.
    """
    sentences = [text.split() for text in docs]

    model = gensim.models.word2vec.Word2Vec(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=10,
    )

    # The vocabulary has to exist before train() can be called.
    model.build_vocab(sentences)
    print("Vocab size", len(model.wv))

    model.train(sentences, total_examples=len(sentences), epochs=epochs)
    return model


def create_document_vector(tokens, size, model_w2v):
    """Average the word vectors of the in-vocabulary tokens of one document.

    Args:
        tokens: Iterable of tokens to look up in ``model_w2v.wv``.
        size: Length of each word vector.
        model_w2v: Object exposing a ``wv`` mapping from token to vector.

    Returns:
        A ``(1, size)`` array holding the mean of the vectors that were found,
        or all zeros when none of the tokens was found.
    """
    total = np.zeros((1, size))
    hits = 0
    for token in tokens:
        try:
            word_vec = model_w2v.wv[token].reshape((1, size))
        except KeyError:
            # Token is not in the vocabulary: it contributes nothing.
            continue
        total += word_vec
        hits += 1.
    if hits:
        total /= hits
    return total


def create_documents_vectors(documents, w2v_model, size=W2V_SIZE):
    """Build one averaged Word2Vec vector per document.

    Args:
        documents: Sequence of documents (pandas Series, NumPy array or list).
            Each document is either an iterable of tokens or a raw string;
            raw strings are split on whitespace, matching how
            ``train_w2v_model`` tokenises its input.
        w2v_model: Trained model handed through to ``create_document_vector``.
        size: Length of each word vector.

    Returns:
        A ``pandas.DataFrame`` of shape ``(len(documents), size)`` with a
        default integer index; row ``i`` is the vector of the ``i``-th document.
    """
    total_docs = len(documents)
    wordvec_arrays = np.zeros((total_docs, size))

    # Read documents strictly by position. Plain ``documents[i]`` on a pandas
    # Series is a label lookup, which raises KeyError (or silently pairs rows
    # with the wrong text) once the index is no longer 0..n-1, e.g. after
    # filtering or shuffling.
    by_position = documents.iloc if hasattr(documents, "iloc") else documents

    for i in range(total_docs):
        tokens = by_position[i]
        if isinstance(tokens, str):
            # Iterating a raw string would look up single characters.
            tokens = tokens.split()
        wordvec_arrays[i, :] = create_document_vector(tokens, size, w2v_model)

    return pd.DataFrame(wordvec_arrays)


In [68]:
from gensim.models.doc2vec import TaggedDocument


def add_label(documents):
    """Wrap every document in a ``TaggedDocument`` tagged with its index label.

    Args:
        documents: Object with an ``index`` attribute whose iteration yields
            strings (e.g. a pandas Series of text). Each string is split on
            whitespace to obtain the words.

    Returns:
        A list of ``TaggedDocument`` objects, one per document; the single tag
        of each is ``"tweet_"`` followed by the document's index label.
    """
    token_lists = [text.split() for text in documents]
    return [
        TaggedDocument(words, ["tweet_" + str(label)])
        for label, words in zip(list(documents.index), token_lists)
    ]

def train_doc_model(documents, vector_size=200, epochs=25, workers=32):
    """Train a gensim Doc2Vec model (``dm=0``, i.e. PV-DBOW) on the documents.

    Every document is tagged by ``add_label`` before training, so the learned
    document vectors are keyed by ``"tweet_<index label>"``.

    Args:
        documents: Collection of raw text documents accepted by ``add_label``.
        vector_size: Number of features of each document vector.
        epochs: Number of training passes over the corpus.
        workers: Number of worker threads used for training.

    Returns:
        The trained ``Doc2Vec`` model.

    Note:
        A fixed ``seed`` alone does not make training reproducible: the gensim
        documentation states that a deterministic run also needs ``workers=1``
        (and a fixed ``PYTHONHASHSEED``). Pass ``workers=1`` when repeatable
        vectors matter more than speed.
    """
    total_docs = len(documents)
    # NOTE(review): looks like leftover debug output; kept so stdout is unchanged.
    print('TOTAL Docs v1')
    model_d2v = gensim.models.Doc2Vec(
        dm=0,  # 0 selects PV-DBOW; 1 would select the 'distributed memory' model
        dm_mean=1,  # mean of the context word vectors; per gensim docs only used when dm=1
        vector_size=vector_size,  # no. of desired features
        window=5,  # width of the context window
        negative=6,  # if > 0 then negative sampling will be used
        min_count=3,  # ignores all words with total frequency lower than 3
        workers=workers,  # no. of worker threads
        alpha=0.1,  # initial learning rate
        seed=23,  # fixes the RNG; see the reproducibility note in the docstring
    )
    labeled_tweets = add_label(documents)  # label all the tweets
    model_d2v.build_vocab(labeled_tweets)
    model_d2v.train(labeled_tweets, total_examples=total_docs, epochs=epochs)
    return model_d2v

def create_documents_vectors_from_Doc2Vec(documents, model_d2v, vector_size=200):
    """Collect the trained Doc2Vec vector of every document into a DataFrame.

    Args:
        documents: The documents whose vectors are wanted. When this is a
            pandas Series/DataFrame and every ``"tweet_<index label>"`` tag
            (the scheme used by ``add_label``) is known to the model, vectors
            are fetched by tag, so subsets and re-ordered inputs get the right
            rows. Otherwise vectors are fetched by position ``0..n-1``.
        model_d2v: Trained Doc2Vec model exposing its document vectors as
            ``dv`` (gensim 4) or ``docvecs`` (older releases).
        vector_size: Length of each document vector.

    Returns:
        A ``pandas.DataFrame`` of shape ``(len(documents), vector_size)`` with
        a default integer index.
    """
    # gensim 4 renamed ``docvecs`` to ``dv``; prefer the new name and fall
    # back to the old one for models created with older releases.
    doc_vectors = getattr(model_d2v, "dv", None)
    if doc_vectors is None:
        doc_vectors = model_d2v.docvecs

    total_docs = len(documents)
    keys = list(range(total_docs))  # positional lookup (previous behaviour)
    if isinstance(documents, (pd.Series, pd.DataFrame)):
        tags = ["tweet_" + str(label) for label in documents.index]
        # Only switch to tags when all of them resolve, so a call never mixes
        # tag-based and positional rows.
        if all(tag in doc_vectors for tag in tags):
            keys = tags

    docvec_arrays = np.zeros((total_docs, vector_size))
    for row, key in enumerate(keys):
        docvec_arrays[row, :] = doc_vectors[key].reshape(vector_size)
    return pd.DataFrame(docvec_arrays)

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer


def to_tf_idf_vector(df):
    """Turn the ``'text'`` entry of ``df`` into a dense TF-IDF matrix.

    Args:
        df: Object indexable by ``'text'`` (e.g. a DataFrame) whose ``'text'``
            entry holds the documents to vectorise.

    Returns:
        The dense array produced by fitting a ``TfidfVectorizer`` on the text
        and converting the resulting matrix with ``toarray()``.
    """
    vectorizer = TfidfVectorizer(
        lowercase=False,
        max_df=0.8,
        min_df=0.01,  # most of the terms are eliminated by min_df
        ngram_range=(1, 3),
    )
    sparse_matrix = vectorizer.fit_transform(df['text'])
    return sparse_matrix.toarray()

In [22]:
# import os
# import pandas as pd
# %run ./preprocess.ipynb

# ds_root = '/home/gaurav.gupta/projects/PoCs/brandMention/brand_datasets/'
# r_path = os.path.join(ds_root, 'ds_complaints',
#                       'panasonic_random_sample_predicted.csv')
# r_df = read_file(r_path)

# m_path = os.path.join(ds_root, 'ds_complaints', 'panasonic_v1_g.csv')
# m_df = read_file(m_path)

# raw_df = pd.concat([r_df, m_df])
# raw_df.reset_index(drop=True, inplace=True)
# df = process_data(raw_df)


In [23]:
# w2v_model = train_w2v_model(df)
# create_document_vector()