In [1]:
import pandas as pd
import numpy as np
import gensim
import tensorflow as tf
import re

In [46]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence, hashing_trick

In [3]:
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize

In [6]:
pd.set_option('display.max_colwidth', -1)
pd.options.display.max_rows = 999

##### Setting Google News Word2vec model path

In [4]:
google_model_path = '../../../LearnSpace/GoogleNews-vectors-negative300.bin.gz'

##### Loading Google News W2V model

In [43]:
google_w2v = gensim.models.KeyedVectors.load_word2vec_format(google_model_path, binary=True)

##### Regex way to remove stopwords (faster)

In [5]:
pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('English')) + r')\b\s*')

#### Loading sentiment analysis dataframes

In [7]:
train_df = pd.read_csv('../data/sentiment-analysis-on-movie-reviews/train.tsv', sep='\t')

In [8]:
test_df = pd.read_csv('../data/sentiment-analysis-on-movie-reviews/test.tsv', sep='\t')

In [44]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [10]:
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine effort .
1,156062,8545,An intermittently pleasing but mostly routine effort
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


### Checking number of labels

In [None]:
label_set = np.unique(train_sentiment)
labels = len(label_set)

In [None]:
labels

#### Getting the values from the dataframe

In [11]:
train_phrases = train_df['Phrase'].values
train_sentiment = train_df['Sentiment'].values

### Checking word tokenization with NLTK

#### Removing stop words

In [14]:
count = 0
for phrase in train_phrases:
    if count > 10:
        break
    count += 1
    phrase = pattern.sub('', phrase)
    print(word_tokenize(phrase))


['A', 'series', 'escapades', 'demonstrating', 'adage', 'good', 'goose', 'also', 'good', 'gander', ',', 'occasionally', 'amuses', 'none', 'amounts', 'much', 'story', '.']
['A', 'series', 'escapades', 'demonstrating', 'adage', 'good', 'goose']
['A', 'series']
['A']
['series']
['escapades', 'demonstrating', 'adage', 'good', 'goose']
[]
['escapades', 'demonstrating', 'adage', 'good', 'goose']
['escapades']
['demonstrating', 'adage', 'good', 'goose']
['demonstrating', 'adage']


### Checking Tokenization with Keras

##### Setting parameters

In [13]:
max_words = 10000

In [None]:
text_to_word_sequence("this is a test { }", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)

In [33]:
text_to_word_sequence("The cat is in the hat!!!")

['the', 'cat', 'is', 'in', 'the', 'hat']

In [41]:
x_train = []
sentence = []
count = 0 
for phrase in train_phrases:
    if count > 10:
        break
    count += 1
    phrase = pattern.sub('', phrase.lower())
    words = text_to_word_sequence(phrase, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    for word in words:
        
        


### Preprocessing part 2

##### Using the hashing trick

In [78]:
x_train = []
sentence = []
count = 0 
for phrase in train_phrases:
    if count > 10:
        break
    count += 1
    phrase = pattern.sub('', phrase.lower())
    words = text_to_word_sequence(phrase, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    print(words)
    words = list(map(lambda x: hashing_trick(x, n=1000000), words))
    arr = np.array([int(word) for lists in words for word in lists])
    print(arr)

['series', 'escapades', 'demonstrating', 'adage', 'good', 'goose', 'also', 'good', 'gander', 'occasionally', 'amuses', 'none', 'amounts', 'much', 'story']
[470148 269585 450681 710900 368581 746917 849880 368581 691934 276451
 465837  15142 374718 801422 378434]
['series', 'escapades', 'demonstrating', 'adage', 'good', 'goose']
[470148 269585 450681 710900 368581 746917]
['series']
[470148]
[]
[]
['series']
[470148]
['escapades', 'demonstrating', 'adage', 'good', 'goose']
[269585 450681 710900 368581 746917]
[]
[]
['escapades', 'demonstrating', 'adage', 'good', 'goose']
[269585 450681 710900 368581 746917]
['escapades']
[269585]
['demonstrating', 'adage', 'good', 'goose']
[450681 710900 368581 746917]
['demonstrating', 'adage']
[450681 710900]


In [77]:
arr.shape

(2,)

#### Input a sentence

In [None]:
test_np = np.vstack((google_w2v['hello'], google_w2v['how'], google_w2v['are'],google_w2v['you']))

In [None]:
# keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', \
#                       embeddings_regularizer=None, activity_regularizer=None, \
#                       embeddings_constraint=None, mask_zero=False, input_length=None)

In [None]:
model = Sequential()
model.add(Embedding(1000, 64, input_length=300))

#### Freezing the embedding layer

In [None]:
model.layers[0].trainable = False

In [None]:
input_array = test_np

In [None]:
model.compile('rmsprop', 'mse')

In [None]:
output_array = model.predict(input_array)

In [None]:
model.output_shape

In [None]:
output_array

In [None]:
test_np = np.vstack((google_w2v['hello'], google_w2v['hi'], google_w2v['fail'],google_w2v['what']))

In [None]:
test_np.ndim

In [None]:
test_np.shape

In [None]:
np.array([1,2,3]).shape

In [None]:
np.random.randint(1000, size=(5, 10, 2))

In [57]:
hashing_trick("hello this is a these", n=10000, hash_function='md5')

[2969, 3956, 6204, 1181, 4001]