In [1]:
import pandas as pd 
import numpy as np
import regex as re 
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read './yelp_train.csv' (path relative to the working directory) into a DataFrame.
data = pd.read_csv('./yelp_train.csv')
# Bare last expression: the frame is rendered as the cell output.
data

Unnamed: 0,class_index,review_text
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...
...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...
559996,2,Professional \nFriendly\nOn time AND affordabl...
559997,1,Phone calls always go to voicemail and message...
559998,1,Looks like all of the good reviews have gone t...


In [3]:
# Copy the numeric class labels into a separate 'rating' column so that
# 'class_index' itself stays untouched by later relabelling.
data['rating'] = data['class_index']
# Bare last expression: the frame is rendered as the cell output.
data

Unnamed: 0,class_index,review_text,rating
0,1,"Unfortunately, the frustration of being Dr. Go...",1
1,2,Been going to Dr. Goldberg for over 10 years. ...,2
2,1,I don't know what Dr. Goldberg was like before...,1
3,1,I'm writing this review to give you a heads up...,1
4,2,All the food is great here. But the best thing...,2
...,...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...,2
559996,2,Professional \nFriendly\nOn time AND affordabl...,2
559997,1,Phone calls always go to voicemail and message...,1
559998,1,Looks like all of the good reviews have gone t...,1


In [4]:
# Map the numeric class labels to readable sentiment names.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `data.rating`: an in-place update of a
# Series obtained from the frame is a chained assignment, which does not
# modify `data` under pandas Copy-on-Write (and only warns before that,
# a warning this notebook suppresses globally).
# Re-running the cell is harmless: values that are already strings match
# neither key and are left as they are.
data['rating'] = data['rating'].replace({1: "negative", 2: "positive"})
# Bare last expression: the frame is rendered as the cell output.
data

Unnamed: 0,class_index,review_text,rating
0,1,"Unfortunately, the frustration of being Dr. Go...",negative
1,2,Been going to Dr. Goldberg for over 10 years. ...,positive
2,1,I don't know what Dr. Goldberg was like before...,negative
3,1,I'm writing this review to give you a heads up...,negative
4,2,All the food is great here. But the best thing...,positive
...,...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...,positive
559996,2,Professional \nFriendly\nOn time AND affordabl...,positive
559997,1,Phone calls always go to voicemail and message...,negative
559998,1,Looks like all of the good reviews have gone t...,negative


In [5]:
# Module-level vocabulary state shared by the helper functions below.
# It starts empty; initializeVocabulary() fills in the keys
# 't_2_i', 'i_2_t', 'unkToken' and 'unkTokenIdx'.
vocab = {}

In [6]:
def initializeVocabulary():
    """Re-initialise the lookup tables held in the module-level ``vocab``.

    Installs two fresh, empty dicts (``t_2_i``: token -> index and
    ``i_2_t``: index -> token), records the unknown-word marker under
    ``unkToken``, registers that marker through ``addToken`` and stores the
    index it received under ``unkTokenIdx``.
    """
    unknown = '<UNK>'
    # The tables must exist before addToken() is called, so they are
    # installed first and the marker is registered afterwards.
    vocab.update({'t_2_i': {}, 'i_2_t': {}, 'unkToken': unknown})
    vocab['unkTokenIdx'] = addToken(unknown)

In [7]:
def addToken(token):
    """Return the index of ``token``, registering it first if it is new.

    A new token receives the next free index (the current size of the
    token -> index table) and is written to both ``vocab['t_2_i']`` and
    ``vocab['i_2_t']``. A token that is already known keeps its index.
    """
    tokenToIndex = vocab['t_2_i']
    if token not in tokenToIndex:
        freshIdx = len(tokenToIndex)
        tokenToIndex[token] = freshIdx
        vocab['i_2_t'][freshIdx] = token
    return tokenToIndex[token]

In [8]:
def addManyTokens(tokens):
    """Register every token in ``tokens`` and return their indices as a list.

    The returned list has one entry per input token, in input order;
    each entry is whatever ``addToken`` returned for that token.
    """
    return list(map(addToken, tokens))

In [9]:
def lookUpToken(token):
    """Return the index stored for ``token`` in ``vocab['t_2_i']``.

    When the recorded unknown-token index is non-negative, a token that is
    not in the table maps to that index. Otherwise the table is indexed
    directly and a missing token raises ``KeyError``.
    """
    unkIdx = vocab['unkTokenIdx']
    tokenToIndex = vocab['t_2_i']
    if unkIdx >= 0:
        return tokenToIndex.get(token, unkIdx)
    return tokenToIndex[token]

In [10]:
def lookUpIndex(idx):
    """Return the token stored for ``idx`` in ``vocab['i_2_t']``.

    Raises:
        KeyError: if ``idx`` is not a key of the index -> token table.
    """
    indexToToken = vocab['i_2_t']
    if idx in indexToToken:
        return indexToToken[idx]
    raise KeyError('the index (%d) is not there' % idx)

In [11]:
def vocabularyFromDataFrame(data, cutoff = 25):
    """Rebuild the module-level vocabulary from ``data.review_text``.

    Every entry of ``data.review_text`` is split on runs of non-word
    characters and the resulting pieces are counted across all entries.
    Pieces that occur more than ``cutoff`` times are registered with
    ``addToken`` in first-seen order, after ``initializeVocabulary`` has
    re-initialised the tables.

    Args:
        data: object exposing an iterable ``review_text`` attribute of
            strings (a DataFrame with that column in this notebook).
        cutoff: a piece is kept only if its count is strictly greater
            than this value.
    """
    initializeVocabulary()
    wordCounts = Counter()
    for review in data.review_text:
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        # NOTE: splitting can yield empty strings when the text starts or
        # ends with a non-word character; they are counted like any other
        # piece.
        wordCounts.update(re.split(r'\W+', review))
    for word, count in wordCounts.items():
        if count > cutoff:
            addToken(word)

#### Loading the dataset again to build the vocabulary

In [12]:
# Re-read the CSV and rebind `data`; the freshly loaded frame does not carry
# the 'rating' column that was added to the earlier frame.
data = pd.read_csv('./yelp_train.csv')

In [13]:
# Build the vocabulary from the review_text column using the default cutoff (25).
vocabularyFromDataFrame(data)

In [14]:
# Spot check: index assigned to the word 'the'.
lookUpToken('the')

2

In [15]:
# Reverse spot check: index 2 should map back to a token.
lookUpIndex(2)

'the'

In [16]:
# Vocabulary size: number of entries in the token -> index table,
# which includes the '<UNK>' marker added by initializeVocabulary().
len(vocab['t_2_i'])

36915

#### Changing the `cutoff` value can change the vocabulary size shown above, because only words that occur more than `cutoff` times are kept.

### Building Vocabulary from corpus:

#### Writing a Generic function

In [17]:
def vocabularyFromCorpus(data, cutoff = 25):
    """Rebuild the module-level vocabulary from an iterable of documents.

    Each document in ``data`` is split on runs of non-word characters and
    the resulting pieces are counted across all documents. Pieces that
    occur more than ``cutoff`` times are registered with ``addToken`` in
    first-seen order, after ``initializeVocabulary`` has re-initialised the
    tables.

    Args:
        data: iterable of document strings (list, NumPy array, Series, ...).
        cutoff: a piece is kept only if its count is strictly greater
            than this value.
    """
    initializeVocabulary()
    wordCounts = Counter()
    # Iterate the argument itself rather than a global named `Corpus`, so
    # the function works for whatever corpus the caller passes in.
    for doc in data:
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        # NOTE: splitting can yield empty strings when the text starts or
        # ends with a non-word character; they are counted like any other
        # piece.
        wordCounts.update(re.split(r'\W+', doc))
    for word, count in wordCounts.items():
        if count > cutoff:
            addToken(word)

In [18]:
# Convert the review_text column to a NumPy array of documents ...
Corpus = np.asarray(data.review_text)
# ... and rebuild the vocabulary from it with the default cutoff (25).
vocabularyFromCorpus(Corpus)

#### One Hot Encoding

In [19]:
def oneHotVector(token, N):
    """Return an ``(N, 1)`` array of zeros with a 1 in the row for ``token``.

    The row is the index returned by ``lookUpToken(token)``.

    Args:
        token: the token to encode.
        N: number of rows of the returned column vector.
    """
    encoded = np.zeros((N, 1))
    hotRow = lookUpToken(token)
    encoded[hotRow] = 1
    return encoded

In [20]:
# The vector length equals the current vocabulary size.
N = len(vocab['t_2_i'])
token = 'the'
# Column vector of shape (N, 1) with a single 1 at the token's index.
oneHot = oneHotVector(token, N)

In [21]:
# Display the encoded vector.
oneHot

array([[0.],
       [0.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [22]:
# Row 2 is the index that lookUpToken('the') returned earlier, so this row holds the 1.
oneHot[2]

array([1.])