In [1]:
# Imports 
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
import re
from collections import Counter

In [9]:
def read_reviews():
    reviews = pd.read_csv("../Dataset/Reviews.csv")
    reviews = reviews.dropna()
    reviews = reviews.drop(["Id","ProductId","UserId","ProfileName","HelpfulnessNumerator","HelpfulnessDenominator","Score","Time"]
                 ,axis=1)
    return reviews

reviews = read_reviews()
reviews.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [10]:
reviews[reviews.isnull().any(axis=1)] # All cells have values

Unnamed: 0,Summary,Text


In [27]:
# Cleaning and Normalizing the text and summaries
# Some contraction to expansion
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}
def normalization(review,remove_stopwords=False):
    text = review.lower()
    clean_text = []
    for word in text.split():
        if word in contractions:
            clean_text.append(contractions[word])
        else:
            clean_text.append(word)
    text = " ".join(clean_text)
    
    # Format words and remove unwanted characters
#     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'https', ' ', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br', ' ', text)
    text = re.sub(r'/>', ' ', text)
    text = re.sub(r'>', ' ', text)
    text = re.sub(r'<', ' ', text)
    text = re.sub(r'`', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text


In [29]:
normalization(reviews.Text[713])

'   http   www amazon com gp product b000gwlugu  plocky s tortilla chips  red beans  n rice  7 ounce bag  pack of 12   a  i first tasted these chips while visiting relatives in ky  they are not available where i live  so i ordered them from amazon  wow  my friends and family are all addicted to them  the spicy flavor grabs you at the first bite  once a bag is open  it is gone '

In [31]:
def clean_reviews(texts):
    return [normalization(text) for text in texts]

In [36]:
summary = clean_reviews(reviews.Summary)
text = clean_reviews(reviews.Text)

In [38]:
print("None count in Summary ",sum(x is None for x in summary))
print("None count in Text ",sum(x is None for x in text))
print(len(summary),len(text))

None count in Summary  0
None count in Text  0
568412 568412


In [45]:
def get_vocab(text,summary):
    '''
    Param: Text, Summary
    Return: Vocab,vocab_to_int,int_to_vocab
    '''
    tokens = set()
    tokens.extend([sent.split() for sent in text])
    tokens.extend([sent.split() for sent in summary])
    
    vocab = set(tokens)
    
    vocab_to_int = {}
    int_to_vocab = {}
    
    codes = ["<UNK>","<PAD>","<EOS>","<GO>"]
    for i,code in enumerate(codes):
        vocab_to_int[code] = i

    for i,word in enumerate(vocab,4):
        vocab_to_int[word] = i
        
    int_to_vocab = {i:word for word,i in vocab_to_int.items()}
    return vocab,vocab_to_int,int_to_vocab

In [46]:
vocab,vocab_to_int,int_to_vocab = get_vocab(text,summary)

TypeError: unhashable type: 'list'