In [2]:
import numpy as np
from numpy.random import RandomState
import os.path
import re
import itertools

from gensim.corpora.dictionary import Dictionary

In [3]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/faridbala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:


def load_data(name, seed=1234):
    """
    Load one of MR, CR, SUBJ or MPQA
    """
    z = {}
    if name == 'rt-polarity':
        pos, neg = load_rt()
    elif name == 'Subjectivity_datasets':
        pos, neg = load_subj()
    elif name == 'review_polarity':
        pos, neg = load_polarity()
#     elif name == 'MPQA':
#         pos, neg = load_mpqa(loc=loc)

    labels = compute_labels(pos, neg)
    text, labels = un_shuffle_data(pos+neg, labels, seed=seed)
    z['text'] = text
    z['labels'] = labels
#     print("Type of text is {}".format(text))
#     features = encoder.encode(text)

    return z#, features


def load_rt(loc = './dataset/Sentiment_polarity_datasets/rt_polaritydata/rt-polaritydata/'):
    """
    Load the rt-polarity dataset
    """
    pos, neg = [], []
    with open(os.path.join(loc, 'rt-polarity.pos'), 'rb') as f:
        for line in f:
            pos.append(line.decode('latin-1').strip())
    with open(os.path.join(loc, 'rt-polarity.neg'), 'rb') as f:
        for line in f:
            neg.append(line.decode('latin-1').strip())
    return pos, neg


def load_subj(loc='./dataset/Subjectivity_datasets/rotten_imdb/'):
    """
    Load the Subjectivity_datasets dataset
    """
    pos, neg = [], []
    with open(os.path.join(loc, 'plot.tok.gt9.5000'), 'rb') as f:
        for line in f:
            pos.append(line.decode('latin-1').strip())
    with open(os.path.join(loc, 'quote.tok.gt9.5000'), 'rb') as f:
        for line in f:
            neg.append(line.decode('latin-1').strip())
    return pos, neg


def load_polarity(loc='./dataset/Sentiment_polarity_datasets/review_polarity/txt_sentoken/'):
    """
    Load the review_polarity dataset
    """
    pos, neg = [], []
    temp_path = os.path.join(loc, 'pos')
    for file in os.listdir(temp_path):
        with open(os.path.join(temp_path, file), 'rb') as f:
            for line in f:
                text = line.decode("utf-8").strip()
                if len(text) > 0:
                    pos.append(text)
    temp_path = os.path.join(loc, 'neg')
    for file in os.listdir(temp_path):
        with open(os.path.join(temp_path, file), 'rb') as f:
            for line in f:
                text = line.decode("utf-8").strip()
                if len(text) > 0:
                    neg.append(text)
    return pos, neg


# def load_mpqa(loc='./data/'):
#     """
#     Load the MPQA dataset
#     """
#     pos, neg = [], []
#     with open(os.path.join(loc, 'mpqa.pos'), 'rb') as f:
#         for line in f:
#             text = line.strip()
#             if len(text) > 0:
#                 pos.append(text)
#     with open(os.path.join(loc, 'mpqa.neg'), 'rb') as f:
#         for line in f:
#             text = line.strip()
#             if len(text) > 0:
#                 neg.append(text)
#     return pos, neg


def compute_labels(pos, neg):
    """
    Construct list of labels
    """
    labels = np.zeros(len(pos) + len(neg))
    labels[:len(pos)] = 1.0
    labels[len(pos):] = 0.0
    return labels


def un_shuffle_data(X, L, seed=1234):
    """
    Shuffle the data
    """
#     prng = RandomState(seed)
    inds = np.arange(len(X))
#     prng.shuffle(inds)
    X = [X[i] for i in inds]
    L = L[inds]
    return (X, L) 

In [5]:
text_review_polarity = load_data('review_polarity')['text']
label_review_polarity = load_data('review_polarity')['labels']

text_rt_polarity = load_data('rt-polarity')['text']
label_rt_polarity = load_data('rt-polarity')['labels']

text_Subjectivity_datasets = load_data('Subjectivity_datasets')['text']
label_Subjectivity_datasets = load_data('Subjectivity_datasets')['labels']

In [6]:
text_review_polarity[0]

'the dream team is a thoroughly entertaining comedy featuring four loveable characters who just happen to be slightly insane .'

In [7]:
import gensim
import torch
import torch.nn as nn


model = gensim.models.KeyedVectors.load_word2vec_format('./gensim-develop/gensim/test/test_data/word2vec_pre_kv_c',
                                                        binary=True)    
# './word2vec_pre_kv_c'  from :   https://github.com/RaRe-Technologies/gensim/tree/develop/gensim/test/test_data
weights = torch.FloatTensor(model.vectors) # formerly syn0, which is soon deprecated
embedding = nn.Embedding.from_pretrained(weights)

# Get embeddings for index 1
input = torch.LongTensor([1])
embedding(input)

tensor([[4.4150e-05, 1.6970e-07, 1.0140e-08, 1.6782e-07, 6.2983e-10, 4.4157e-05,
         1.4927e-19, 1.6223e-07, 4.1724e-08, 2.5193e-09]])

In [8]:
def sent_to_tok(txt):

    tokens = word_tokenize(txt)
    
# @@@@@@@@ if wanna remove remaining tokens that are not alphabetic uncomment line belove
    
    tokens = [word for word in tokens if word.isalpha()]
    return tokens

In [9]:
dummy_tokens = sent_to_tok(text_review_polarity[42124])
len(dummy_tokens)

152

# Define a Vocabulary

In [10]:
def build_dictionary(name):
    words = []
    text = load_data(name)['text']
    for txt in text:
        tokens = sent_to_tok(txt)
        for tok in tokens:
            words.append(tok)
    return words
    

In [11]:
# build_dictionary('rt_polarity')

In [12]:
review_polarity_dic = build_dictionary('review_polarity')
rt_polarity_dic = build_dictionary('rt-polarity')
Subjectivity_datasets_dic = build_dictionary('Subjectivity_datasets')

In [13]:
dict_ = review_polarity_dic + rt_polarity_dic + Subjectivity_datasets_dic

dictionary = list(set(dict_))     #  44432 unique words out of 1675838 words in whole dataset

In [14]:
len(dictionary)

44432

In [15]:
def encoder(txt):
    dec = []
    for w in txt:
        code = dictionary.index(w)
        dec.append(code)
    return np.asarray(dec)

In [16]:
dummy_array = encoder(dummy_tokens)
dummy_array

array([11461, 36070, 40415, 25768, 15897,  2254, 22615, 32514, 39627,
       14144, 31202, 17625,   777, 44014, 27136, 14283, 35855, 32244,
       16037,  3455, 22811, 32514, 36227, 17730, 24293, 32514, 15615,
       34511, 22615,  3238, 14283, 32514,  1980, 40471, 24137, 20624,
       28913, 40070, 27622,    59, 32553, 22495,  1794,  8326,  7946,
        7467, 19267, 16083, 21511, 17823, 11475, 22615,  6946, 24505,
       24763, 14109, 17827, 40892, 27136, 32514, 38606, 17270,  3455,
       20445,  7834, 20655, 24763, 21175, 11480, 32479, 29246, 18191,
       29657,  6298, 42247, 40892,  6188, 34143,  3455,  5835, 14283,
       15859, 37359, 30113, 17823,  2769, 24763, 14109, 39156, 36070,
       16539,  1980, 36234, 32244,  3023,  7794, 36234,  7946, 12447,
       19303, 32699, 28937, 22615, 10837, 17823, 38624, 23089, 35101,
       24763,  7131, 29657, 43271,  6188, 32514, 21161,  5835, 14506,
       27136, 15859,  3023, 17823,  6203, 27622, 17730, 24234, 16040,
       27622, 22615,

In [17]:
# len_ = []
# jj = 0
# for i, j in enumerate(text_Subjectivity_datasets):
#     if len(j)> jj:
#         jj = len(j)
#         print(j)
#         print(i)

In [18]:
text_review_polarity[42124]

'since that plot is incredibly lame and a track record of what goes on with it wouldn\'t be able to carry a commercial let alone a feature film , and because it\'s a party , there are some more main characters , such as : william ( charlie korsmo , finally surfacing after " dick tracy " ) , the nerd ( and his dominions ) who has come up with a ridiculous plan to publically sabotage mike , who\'s humiliated him for years , but gets too caught up in drinking to do it ; kenny ( seth green ) , the wigger , who has decided that this party will be where he will finally get laid ( uh huh ) ; and denise , the only exceptional character , who unfortunately gets stuck in a bathroom ( don\'t ask ) with kenny where the two characters let down their characters and are allowed to follow the laws of plot cliches from point a to point b with nary a bit of characterization involved after awhile .'

In [19]:
len(text_review_polarity[42124])     # max len in whole dataset without prepross

887

In [20]:
largest_sentence = len(sent_to_tok(text_review_polarity[42124]))   # max len in whole dataset without prepross
largest_sentence

152