In [1]:
import pandas as pd
import numpy as np
import gensim
import tensorflow as tf
import re

In [35]:
from itertools import starmap

In [2]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense, Activation
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence, hashing_trick
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [17]:
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer

In [27]:
# Fetch every NLTK resource this notebook relies on, not only WordNet:
#   wordnet   -> WordNetLemmatizer
#   stopwords -> stopwords.words(...)
#   punkt     -> word_tokenize
# Packages that are already installed and up to date are not downloaded again.
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab';
# add it to the tuple if word_tokenize raises LookupError.
for resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rahul_padmanabhan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
# Show complete phrase text instead of truncating wide cells. `None` is the
# supported "no limit" value; the old `-1` sentinel is deprecated since
# pandas 1.0 and rejected by pandas 2.x.
# NOTE(review): pandas < 1.0 does not accept None here; confirm the pinned version.
pd.set_option('display.max_colwidth', None)
# Allow long frames to be printed without row elision.
pd.set_option('display.max_rows', 999)

##### Setting Google News Word2vec model path

In [5]:
# Location of the gzipped binary Word2Vec file; it is a relative path that
# climbs three directories up from the working directory.
google_model_path = '../../../LearnSpace/GoogleNews-vectors-negative300.bin.gz'

##### Loading Google News W2V model

In [6]:
# Read the vectors from `google_model_path` in word2vec binary format
# (binary=True) into a gensim KeyedVectors object used for word lookups below.
google_w2v = gensim.models.KeyedVectors.load_word2vec_format(google_model_path, binary=True)

##### Regex way to remove stopwords (faster)

In [7]:
# Compiled regex matching any English stopword as a whole word, plus the
# whitespace that follows it, so `pattern.sub('', text)` strips stopwords.
#  * The corpus file is named 'english'; the capitalised spelling only
#    resolves on case-insensitive filesystems.
#  * Longest alternatives go first so contractions such as "don't" are
#    matched whole instead of stopping at the shorter stopword "don".
#  * re.escape keeps any regex metacharacter in a stopword literal.
_stopword_alternatives = sorted(stopwords.words('english'), key=len, reverse=True)
pattern = re.compile(r'\b(' + r'|'.join(map(re.escape, _stopword_alternatives)) + r')\b\s*')

##### Stemmer & lemmatizer initialization

In [18]:
# Word normalisers used by get_w2v as fallbacks when a lower-cased token is
# not found in the Word2Vec vocabulary (lemma is tried first, then stem).
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

#### Loading sentiment analysis dataframes

In [9]:
# Training split, read as a tab-separated file.
train_df = pd.read_csv('../data/sentiment-analysis-on-movie-reviews/train.tsv', sep='\t')

In [10]:
# Test split, read as a tab-separated file.
test_df = pd.read_csv('../data/sentiment-analysis-on-movie-reviews/test.tsv', sep='\t')

In [11]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [12]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine effort .
1,156062,8545,An intermittently pleasing but mostly routine effort
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


#### Getting the training phrases and sentiment labels from the dataframe

In [13]:
# Pull the raw phrase texts and their sentiment labels out of the frame as
# NumPy arrays.
train_phrases = train_df['Phrase'].values
train_sentiment = train_df['Sentiment'].values

#### Getting the test phrases

In [14]:
# Raw phrase texts of the test split as a NumPy array.
test_phrases = test_df['Phrase'].values

### Checking number of labels

In [15]:
# Distinct sentiment classes found in the training labels, and their count.
label_set = np.unique(train_sentiment)
labels = label_set.shape[0]

In [15]:
# Display the number of distinct sentiment classes.
labels

5

In [64]:
def get_w2v(sentence):
    """Convert a sentence into a list of Word2Vec vectors, one per token.

    The sentence is split with ``word_tokenize`` and every token is
    lower-cased. For each token the lookup in ``google_w2v`` is attempted
    with, in order, the token itself, its WordNet lemma and its Snowball
    stem; the first form present in the vocabulary supplies the vector.
    A token for which all three lookups raise ``KeyError`` is represented
    by an all-zero vector, so the result always has one vector per token.

    Parameters
    ----------
    sentence : str
        Raw text to embed.

    Returns
    -------
    list of numpy.ndarray
        One 1-D vector per token, in token order; an empty list when the
        sentence yields no tokens.
    """
    # Fallback chain, cheapest first; the lemmatizer and stemmer only run
    # when the previous form was not in the vocabulary.
    normalisers = (lambda token: token, lemmatizer.lemmatize, stemmer.stem)
    # Take the dimensionality from the loaded model instead of hard-coding it.
    dim = google_w2v.vector_size

    word_vector = []
    for word in word_tokenize(sentence):
        word = word.lower()
        for normalise in normalisers:
            try:
                w2v = google_w2v[normalise(word)]
                break
            except KeyError:
                continue
        else:
            # Out-of-vocabulary in every form: use a zero placeholder.
            # float32 is what load_word2vec_format produces by default, so
            # stacking the phrase later is not silently upcast to float64.
            w2v = np.zeros(dim, dtype=np.float32)
        word_vector.append(w2v)

    return word_vector

In [68]:
# Embed every training phrase: each cell of the new column holds the list of
# word vectors returned by get_w2v for that phrase.
train_df['phrase_w2v'] = train_df['Phrase'].map(get_w2v)

In [153]:
# Sanity check: inspect the container type returned by .head() on the new column.
type(train_df.phrase_w2v.head())

pandas.core.series.Series

#### Maximum number of word vectors in a single phrase

In [121]:
# Largest number of word vectors in any phrase. One length per phrase is
# enough; the previous nested comprehension repeated each length once per
# token without changing the maximum.
train_df['phrase_w2v'].map(len).max()

53

In [134]:
# Boolean flag per row: True where the `phrase_w2v` cell is missing.
train_df['is_null'] = pd.isna(train_df['phrase_w2v'])

In [154]:
# Number of word vectors (tokens) per phrase. `len` on the stored list gives
# the same value as the first array dimension without building a temporary
# array for every row.
train_df['length'] = train_df['phrase_w2v'].map(len)

#### Max length of sequence

In [164]:
# Longest phrase in the training data, measured in word vectors; used as the
# target sequence length for padding.
max_len = train_df['length'].max()

In [160]:
# Turn each phrase's list of word vectors into a single NumPy array.
train_df['phrase_w2v_np'] = train_df['phrase_w2v'].map(np.array)

In [None]:
# Pad every phrase to `max_len` time steps so all phrases share one shape.
# pad_sequences must receive the whole batch (a list of per-phrase arrays):
# called on a single phrase matrix it would treat each word vector as a
# sequence and pad along the embedding axis instead of the time axis.
# dtype must be a float type; the default 'int32' would truncate the
# embedding values. Phrases with no tokens come out as all-zero matrices.
# NOTE(review): the padded batch is dense (rows x max_len x embedding dim);
# check available memory before running on the full training set.
phrase_w2v_padded = pad_sequences(
    train_df['phrase_w2v_np'].tolist(),
    maxlen=int(max_len),
    dtype='float32',
)
# Store one padded 2-D array per row, alongside the unpadded column.
train_df['phrase_w2v_np_pad'] = list(phrase_w2v_padded)