In [None]:
# Learning Pytorch
# By writing a simple sentiment analyzer
# Uses 1 hot encoding/ BoW

In [1]:
import os
import sys, csv
import nltk
from nltk.corpus import stopwords
import torch
import torchtext
import pandas as pd
from collections import defaultdict
from torchtext.datasets import text_classification
from torchtext.utils import download_from_url, extract_archive
from sklearn.preprocessing import OneHotEncoder

In [2]:
# constants
NGRAMS = 3
Train_Data_Size = 10000
Test_Data_Size = 1000

In [3]:
# hard code or else, OverFlowError in csv.field_size_limit
# sys.maxsize = 2**16 - 1

In [4]:
# Source : https://pytorch.org/text/_modules/torchtext/datasets/text_classification.html
URLS = {
    'AmazonReviewPolarity':
        'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbaW12WVVZS2drcnM'
}

In [25]:
# File sizes are too big, will use a chunk of it, train : 10000 rows, test 1000 rows
tar_data = download_from_url(URLS['AmazonReviewPolarity'])
extracted_files = extract_archive(tar_data)

In [28]:
extracted_files

['.data\\amazon_review_polarity_csv/test.csv',
 '.data\\amazon_review_polarity_csv/train.csv',
 '.data\\amazon_review_polarity_csv/readme.txt']

In [3]:
train_data = pd.read_csv('.data\\amazon_review_polarity_csv/train.csv', chunksize=1000).get_chunk(Train_Data_Size)
test_data = pd.read_csv('.data\\amazon_review_polarity_csv/test.csv', chunksize=1000).get_chunk(Test_Data_Size)

In [4]:
# Rename Columns
# 1 = -ve, 2 = +ve
train_data.set_axis(['Sentiment', 'Title', 'Review'], axis=1, inplace=True)
test_data.set_axis(['Sentiment', 'Title', 'Review'], axis=1, inplace=True)

In [5]:
# seems positive/ negative split is equally balanced
train_data[train_data["Sentiment"]==2].shape

(4902, 3)

In [6]:
train_data.head()

Unnamed: 0,Sentiment,Title,Review
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ram\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# convert to dict, for easy deltion in Review
stop_word = defaultdict(lambda : 0)
stop_word.update(dict(zip(stopwords.words('english'), [1]*len(stopwords.words('english')))))

In [9]:
# delete stopwords function 
# also converts to lowercase
del_stopW = lambda row : ' '.join([x.strip(",").strip(".").strip("'").strip('"').strip("!") \
                                   if not stop_word[x] else '' for x in row.lower().split()])

In [10]:
train_data["Review"] = train_data["Review"].apply(del_stopW)

In [11]:
train_data.head()

Unnamed: 0,Sentiment,Title,Review
0,2,The best soundtrack ever to anything.,i'm reading lot reviews saying best game...
1,2,Amazing!,soundtrack favorite music time hands down...
2,2,Excellent Soundtrack,truly like soundtrack enjoy video game mus...
3,2,"Remember, Pull Your Jaw Off The Floor After He...",played game know divine music is every s...
4,2,an absolute masterpiece,quite sure actually taking time read ...


In [36]:
# create 1-hot vector 
# first find max length of review
max_len = -1
for review in train_data["Review"].tolist():
    max_len = max(len(review.split()), max_len)

In [46]:
# 
max_len

140

In [40]:
# sparse=False, to get array
# EOS = End of string
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
train_X = enc.fit_transform([_.split()+['EOS']*(max_len - len(_.split())) for _ in train_data["Review"].tolist()])