# Amazon Reviews Dataset

This dataset contains several million reviews of Amazon products, each labelled as either positive or negative. The two classes are evenly balanced here.

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

%matplotlib inline

ModuleNotFoundError: No module named 'tensorflow'

In [29]:
# List the data files that are available to this notebook in "./input".
# NOTE(review): this cell lists "./input", while the loading cell further down
# reads from "../input/" - confirm which directory actually holds the data.

import os
print(os.listdir("./input"))

['train.csv']


# Reading the text

The text is held in a compressed format. Luckily, we can still read it line by line. The first word gives the label, so we have to convert that into a number and then take the rest to be the comment.

In [30]:
def get_labels_and_texts(file):
    """Read labels and review texts from a bz2-compressed text file.

    Every line is decoded as UTF-8. The character at index 9 is parsed as
    the label digit and everything from index 10 onward is the review text
    (presumably lines look like ``__label__<digit> <text>`` - verify against
    the data files).

    Args:
        file: Path of the bz2-compressed file, or an open binary file object.

    Returns:
        A tuple ``(labels, texts)``. ``labels`` is a NumPy array holding the
        label digit minus one for every line; ``texts`` is a list with the
        whitespace-stripped review text of every line, in file order.

    Raises:
        ValueError: If the character at index 9 of a line is not a digit.
        IndexError: If a line has fewer than 10 characters.
    """
    labels = []
    texts = []
    # The context manager guarantees the compressed stream is closed, even
    # when a malformed line raises half-way through the file.
    with bz2.BZ2File(file) as compressed:
        for raw_line in compressed:
            line = raw_line.decode("utf-8")
            labels.append(int(line[9]) - 1)
            texts.append(line[10:].strip())
    return np.array(labels), texts
# Load the training and test splits from their compressed files.
train_labels, train_texts = get_labels_and_texts(
    '../input/train.ft.txt.bz2'
)
test_labels, test_texts = get_labels_and_texts(
    '../input/test.ft.txt.bz2'
)

NameError: name 'bz2' is not defined

# Text Preprocessing

The first thing I'm going to do to process the text is lowercase everything and then replace non-word characters with spaces, since most of them are going to be punctuation. Then I'm going to remove any other remaining characters (like letters with accents). It could be better to replace some of these with their regular ASCII equivalents, but I'm just going to ignore that here. Looking at the counts of the different characters also shows that there are very few unusual characters in this corpus.

In [31]:
import re
# Matches every character that is not a word character (letter, digit or
# underscore); these are replaced by a space.
NON_ALPHANUM = re.compile(r'[\W]')
# Matches everything that is still not a lowercase ASCII letter, a digit or
# whitespace; these are deleted. The digit range is 0-9: the previous 0-1
# range silently dropped the digits 2 through 9 from every review.
NON_ASCII = re.compile(r'[^a-z0-9\s]')


def normalize_texts(texts):
    """Lowercase each text and reduce it to ASCII letters, digits and spaces.

    Processing order per text:
      1. lowercase the text;
      2. replace every non-word character with a single space;
      3. delete every remaining character that is not ``a-z``, ``0-9`` or
         whitespace (for example accented letters and underscores).

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one normalized string per input text, in input order.
    """
    return [
        NON_ASCII.sub(r'', NON_ALPHANUM.sub(r' ', text.lower()))
        for text in texts
    ]
        
# Apply the same normalization to the training and test texts. Each cleaned
# list replaces the raw one under the same name, one split at a time.
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

NameError: name 'train_texts' is not defined

# Train/Validation Split

Now I'm going to set aside 20% of the training set for validation.

In [32]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training examples for validation; the fixed seed keeps
# the split reproducible between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=57643892,
)

NameError: name 'train_texts' is not defined

Keras provides some tools for converting text to formats that are useful in learning models. I've already done some processing, so now I will just run a Tokenizer using the top 12000 words as features.

In [33]:
# Vocabulary size limit handed to the tokenizer as num_words.
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# The vocabulary is fitted on the training split only; the validation and
# test splits are encoded with that same fitted tokenizer.
tokenizer.fit_on_texts(train_texts)
# Replace each list of strings with its sequence encoding, one split at a
# time, reusing the same variable names.
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

NameError: name 'Tokenizer' is not defined

# Padding Sequences

In order to use batches effectively, I'm going to need to take my sequences and turn them into sequences of the same length. I'm just going to make everything here the length of the longest sentence in the training set. I'm not dealing with this here, but it may be advantageous to have variable lengths so that each batch contains sentences of similar lengths. This might help mitigate issues that arise from having too many padded elements in a sequence. There are also different padding modes that might be useful for different models.

In [34]:
# Length of the longest encoded training example; all three splits are padded
# to this length.
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
# NOTE(review): pad_sequences runs with its default padding/truncating modes;
# validation or test sequences longer than MAX_LENGTH are presumably
# truncated - confirm against the Keras documentation.
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
val_texts = pad_sequences(val_texts, maxlen=MAX_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

NameError: name 'train_texts' is not defined

# K Nearest Neighbours Model

In [None]:
# Python3 program to find groups of unknown 
# Points using K nearest neighbour algorithm. 

import math 

def classifyAPoint(points, p, k=3):
    """Classify ``p`` into group 0 or group 1 with a k-nearest-neighbour vote.

    Parameters:
        points: Dictionary of training points with the two keys 0 and 1.
            Each key maps to the list of training points that belong to
            that group. Only the first two coordinates of a point are used.
        p: Test data point of the form (x, y).
        k: Number of nearest neighbours that take part in the vote;
            default is 3.

    Returns:
        0 if strictly more of the k nearest training points belong to
        group 0 than to group 1, otherwise 1. A tie (including the case of
        no training points at all) therefore yields 1.
    """
    distance = []
    for group in points:
        for feature in points[group]:
            # Euclidean distance of p from this training point.
            euclidean_distance = math.sqrt(
                (feature[0] - p[0]) ** 2 + (feature[1] - p[1]) ** 2
            )
            # Remember the distance together with the owning group.
            distance.append((euclidean_distance, group))

    # Rank only after every group has been scanned, then keep the k closest
    # (distance, group) pairs.
    distance = sorted(distance)[:k]

    freq1 = sum(1 for _, group in distance if group == 0)  # votes for group 0
    freq2 = sum(1 for _, group in distance if group == 1)  # votes for group 1

    return 0 if freq1 > freq2 else 1

# driver function 
def main():
    """Classify one sample point against a small fixed training set.

    Builds the two-group training dictionary, runs ``classifyAPoint`` on the
    test point (2.5, 7) with k = 3 and prints the resulting group.
    """
    # Training points: key 0 holds the points of class 0, key 1 the points
    # of class 1.
    points = {
        0: [(1, 12), (2, 5), (3, 6), (3, 10), (3.5, 8), (2, 11), (2, 9), (1, 7)],
        1: [(5, 3), (3, 2), (1.5, 9), (7, 2), (6, 1), (3.8, 1), (5.6, 4), (4, 2), (2, 5)],
    }

    # Testing point p(x, y).
    p = (2.5, 7)

    # Number of neighbours.
    k = 3

    # The call is continued inside the parentheses; the former backslash
    # continuation was followed by whitespace, which is a syntax error.
    print("The value classified to unknown point is: {}".format(
        classifyAPoint(points, p, k)))

# Run the demo only when this code is executed directly; the call must be
# indented so that it belongs to the guard.
if __name__ == '__main__':
    main()

