# Imports

In [11]:
from pprint import pprint
import pandas as pd 
import numpy as np

import collections

# Data

## Read Data

In [4]:
DATASET_FOLDER = "/mnt/Study_Storage/Documents/Datasets/Text/amazon-fine-food-reviews"

data = pd.read_csv(DATASET_FOLDER + "/Reviews.csv")

text = np.array(data['Text'])
del data

## Explore dataset

In [3]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [33]:
data.describe()  

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


In [34]:
print("shape of matrix ", data.shape)

shape of matrix  (568454, 10)


## Insights into dataset

since the dataset is seemingly big. matrix of around 6lakhs and 10 attributes.

and our target is to extract only the text. 

>
> **it is better that we perform the trails on a subset of data and then scale that to whole data**

## Methods on big data

**Issue 1**:
 - data is big, so there will be issue of memory allocation - will need to use a runtime generator

In [83]:
def getCorpora(data):
    temp_corp = "".join(t.lower()
                        for t in data if t not in list("$#!@^&*+:;<>=."))
    return temp_corp.split()


# def getVocublary(corpora):
#     vocub = []

#     for value in corpora:
#         if value not in vocub:
#             vocub.append(value)
#     return vocub


# def getDictionary(corpora):
#     dictionary = {}

#     label = 0
#     for d in corpora:
#         if d not in dictionary:
#             dictionary[d] = label
#             label += 1

#     return dictionary


# def textToSeq(data, dictionary):
#     seq = [dictionary[word] for word in data if word in dictionary]
#     return seq

def oneHot(data:list, length_vocub:int)->'array':
    if type(data) is int:
        data = [data]
    matrix = np.zeros([len(data), length_vocub])

    for i, value in enumerate(data):
        matrix[i][value] = 1

    return matrix


# we restrict our vocabulary size to 50000
vocabulary_size = 50000


def build_dataset(words):
    count = [['UNK', -1]]
    # Gets only the vocabulary_size most common words as the vocabulary
    # All the other words will be replaced with UNK token

    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()

    # Create an ID for each word by giving the current length of the dictionary
    # And adding that item to the dictionary

    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0

    # Traverse through all the text we have and produce a list
    # where each element corresponds to the ID of the word found at that index

    for word in words:
        # If word is in the dictionary use the word ID,
        # else use the ID of the special token "UNK"

        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)

    # update the count variable with the number of UNK occurences
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    # Make sure the dictionary is of size of the vocabulary
    assert len(dictionary) == vocabulary_size

    return data, count, dictionary, reverse_dictionary

In [9]:
corpora = getCorpora(text)

In [12]:

data, count, dictionary, reverse_dictionary = build_dataset(corpora)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])

del corpora  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 1452956], ('the', 1805955), ('and', 1259604), ('i', 1255528), ('a', 1187364)]
Sample data [3, 16, 115, 269, 6, 1, 7663, 467, 99, 68]


In [14]:
print("no of data points are ", len(data))
print("size of dictionary is ", len(dictionary))

print("reverse value of 1 ", reverse_dictionary[1])
print("code for UNK keyword ", dictionary['UNK'])

no of data points are  45057952
size of dictionary is  50000
reverse value of 1  the
code for UNK keyword  0


In [24]:
temp = oneHot(data[0], vocabulary_size)
print(temp)

## Generate batches of data for skip gram

In [45]:
window_size = 4
batch_size = 128
batch_index_completed = window_size 

def batch_generator(batch_size = 256, window_size = 4):
    global batch_index_completed
    
    start = batch_index_completed
    end = start+batch_size 
    
    if (abs(len(data) - end) < batch_size) or ((len(data) - start) < window_size):
        batch_index_completed = window_size
        start = batch_index_completed
        end = start + batch_size

    batch = []
    for i in range(start,end):
        target = data[i]
        contextL= data[i-window_size:i]
        contextR = data[i+1:i+window_size+1] 
        contextL.extend(contextR)
        
        temp = [[target, c] for c in contextL]
        batch.extend(temp)
    
    batch_index_completed += batch_size 
    
    return np.array(batch)

In [87]:
batch = batch_generator(128,4)

In [88]:
X = batch[:,0]
Y = batch[:,1]

print(X.shape, Y.shape)

(1024,) (1024,)


In [89]:
X_one_hot = oneHot(X, length_vocub= vocabulary_size)
print(X_one_hot.shape)

(1024, 50000)


In [90]:
Y_one_hot = oneHot(Y, length_vocub= vocabulary_size)
print(Y_one_hot.shape)

(1024, 50000)
