In [None]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from IPython.display import display
from nltk.corpus import stopwords
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import Counter
import numpy
import string

In [None]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read the tab-separated review file at `path` into the global lists.

    The first row is treated as a header and skipped. Every remaining row is
    parsed with parseReview; the parsed tuple is appended to `rawData`, and
    (id, preProcess(...), label) is appended to `preprocessedData`.

    Parameters:
        path: location of the tab-separated dataset file (read as UTF-8).
        Text: unused; kept only so existing calls keep working.

    Returns:
        None. The results are the side effects on `rawData` and
        `preprocessedData`.
    """
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for file objects passed to csv.reader.
    with open(path, encoding="utf8", newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header row (no error on an empty file)
        for line in reader:
            (Id, reviewText, Rating, Verified, ProductID, Label) = parseReview(line)
            rawData.append((Id, reviewText, Rating, Verified, ProductID, Label))
            preprocessedData.append((Id, preProcess(reviewText, Rating, Verified, ProductID), Label))

def splitData(percentage):
    """Fill the global `trainData` and `testData` lists from `rawData`.

    `rawData` is treated as two halves. The first
    int(percentage * len(rawData) / 2) reviews of each half become training
    samples and the remainder of each half become test samples.
    NOTE(review): taking equal counts from each half presumably relies on the
    file being ordered by label -- confirm against the dataset.

    Each sample is stored as (toFeatureVector(preProcess(...)), label).

    Parameters:
        percentage: fraction of the data to use for training (e.g. 0.8).
    """
    total = len(rawData)
    midpoint = int(total / 2)
    perHalf = int((percentage * total) / 2)

    # (destination list, reviews that belong in it)
    partitions = (
        (trainData, rawData[:perHalf] + rawData[midpoint:midpoint + perHalf]),
        (testData, rawData[perHalf:midpoint] + rawData[midpoint + perHalf:]),
    )
    for destination, reviews in partitions:
        for (_, text, rating, verified, productId, label) in reviews:
            features = toFeatureVector(preProcess(text, rating, verified, productId))
            destination.append((features, label))

# Question 1

In [None]:
# Convert a line from the input file into an
# (id, text, rating, verified, product id, label) tuple
def parseReview(reviewLine):
  """Pick the fields used by the classifier out of one parsed dataset row.

  Parameters:
      reviewLine: sequence of column values for one review (one csv row).

  Returns:
      A 6-tuple (id, text, rating, verified, product_id, label) where
      id and rating are converted to int, verified is True exactly when the
      verified-purchase column equals 'Y', and label is `fakeLabel` when the
      label column equals '__label1__' and `realLabel` otherwise.

  Raises:
      IndexError if the row has fewer columns than expected;
      ValueError if the id or rating column is not an integer.
  """
  # Column positions within a row of the dataset file.
  ID = 0
  LABEL = 1
  RATING = 2
  VERIFIED_PURCHASE = 3
  PRODUCT_ID = 5
  TEXT = 8

  return (
      int(reviewLine[ID]),
      reviewLine[TEXT],
      int(reviewLine[RATING]),
      reviewLine[VERIFIED_PURCHASE] == 'Y',
      reviewLine[PRODUCT_ID],
      fakeLabel if reviewLine[LABEL] == '__label1__' else realLabel,
  )

In [None]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: the text of one review plus its rating, verified flag and product id
def preProcess(text, rating, verified, product_id):
    """Normalise a review's text into tokens and bundle it with its metadata.

    Steps, in order: lowercase the text, strip URLs, tokenize with nltk's
    word_tokenize, drop English stopwords, drop punctuation-only tokens, then
    lemmatise (WordNet) and stem (Porter) each remaining token.

    Parameters:
        text: the review text.
        rating, verified, product_id: passed through unchanged.

    Returns:
        A tuple (tokens, rating, verified, product_id), where tokens is a
        list of strings.
    """
    # Strip URLs from the raw text *before* tokenizing. Doing it per token
    # afterwards only works if the tokenizer keeps a URL in one piece, and it
    # leaves empty strings behind where a whole token matched.
    text = re.sub(r'http\S+', '', text.lower())

    # Tokenizing using nltk's word_tokenize
    tokens = word_tokenize(text)

    # Removing stopwords
    filtered_tokens = [w for w in tokens if w not in stop_words]

    # Removing tokens made up solely of punctuation characters. A plain
    # `tok not in string.punctuation` is a substring test, so multi-character
    # tokens such as "..." would slip through.
    filtered_tokens = [
        tok for tok in filtered_tokens
        if not all(ch in string.punctuation for ch in tok)
    ]

    # Lemmatisation followed by stemming
    filtered_tokens = [porter_stemmer.stem(wordnet_lemmatizer.lemmatize(tok)) for tok in filtered_tokens]

    return (filtered_tokens, rating, verified, product_id)

# Question 2

In [None]:
featureDict = {} # A global dictionary of features: feature name -> number of samples containing it

def toFeatureVector(tokens):
  """Turn one preprocessed review into a feature dictionary.

  Parameters:
      tokens: the tuple returned by preProcess,
          (token list, rating, verified, product_id).

  Returns:
      A dict mapping each token to its count in the review, plus the keys
      'rating' (the rating as given) and 'verified' (the flag as 0/1).
      The product id (tokens[3]) is not used as a feature.

  Side effects:
      Every feature name in the returned vector is recorded in the global
      `featureDict`, whose value counts how many vectors contained that
      feature. Previously `featureDict` was never written to, so the
      "Features" count printed by the main cell was always 0.
  """
  featurevect = dict(Counter(tokens[0]))
  featurevect['rating'] = tokens[1]
  featurevect['verified'] = int(tokens[2])

  for feature in featurevect:
    featureDict[feature] = featureDict.get(feature, 0) + 1

  return featurevect

In [None]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Fit a LinearSVC, wrapped in an NLTK SklearnClassifier, on `trainData`.

    Parameters:
        trainData: the labelled training samples handed to
            SklearnClassifier.train.

    Returns:
        Whatever SklearnClassifier.train returns (the trained classifier).
    """
    print("Training Classifier...")
    steps = [('svc', LinearSVC())]
    wrapped = SklearnClassifier(Pipeline(steps))
    return wrapped.train(trainData)

# Question 3

In [None]:
def crossValidate(dataset, folds):
    """Run k-fold cross-validation and return one score tuple per fold.

    A shuffled copy of `dataset` is cut into `folds` contiguous folds. Each
    fold is used once as the test fold while the classifier is trained on
    all the *other* folds. When len(dataset) is not divisible by `folds`,
    the last fold absorbs the leftover samples.

    Parameters:
        dataset: list of (feature vector, label) samples. It is not modified.
        folds: number of folds; must satisfy 2 <= folds <= len(dataset).

    Returns:
        A list with `folds` tuples of
        (precision, recall, f-score, accuracy as a percentage), where the
        first three are the weighted averages from
        precision_recall_fscore_support.

    Raises:
        ValueError if `folds` is outside the range above.
    """
    if folds < 2 or folds > len(dataset):
        raise ValueError(
            "folds must be between 2 and len(dataset) (%d), got %r" % (len(dataset), folds)
        )

    # Shuffle a copy so the caller's list keeps its order.
    samples = list(dataset)
    shuffle(samples)

    foldSize = len(samples) // folds
    cv_results = []
    for k in range(folds):
        start = k * foldSize
        # The last fold runs to the end so no sample is left out.
        end = start + foldSize if k < folds - 1 else len(samples)

        testFold = samples[start:end]
        # Everything outside [start, end). The original used
        # dataset[foldSize:], which left the test fold inside the training
        # data for every fold after the first and inflated the scores.
        trainingData = samples[:start] + samples[end:]

        classifier = trainClassifier(trainingData)
        y_pred = predictLabels(testFold, classifier)
        y_true = [sample[1] for sample in testFold]

        # The fourth element (support) is replaced by the accuracy.
        precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        accuracy = accuracy_score(y_true, y_pred) * 100
        cv_results.append((precision, recall, fscore, accuracy))

    return cv_results

In [None]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    """Classify many samples at once.

    Parameters:
        reviewSamples: iterable of samples whose first element is the
            feature vector (e.g. (feature vector, label) pairs).
        classifier: object providing classify_many.

    Returns:
        The result of classifier.classify_many on the feature vectors.
    """
    featureVectors = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(featureVectors)

def predictLabel(reviewSample, classifier):
    """Classify a single, not yet preprocessed review.

    Parameters:
        reviewSample: a (text, rating, verified, product_id) sequence, i.e.
            the four arguments preProcess takes.
        classifier: object providing classify.

    Returns:
        The label returned by classifier.classify.

    Raises:
        TypeError if `reviewSample` is a bare string; preProcess needs the
        rating, verified flag and product id as well as the text.
    """
    if isinstance(reviewSample, str):
        raise TypeError(
            "reviewSample must be a (text, rating, verified, product_id) sequence, not a bare string"
        )
    # preProcess takes four arguments; passing the sample as a single
    # argument (as before) always failed with a TypeError.
    text, rating, verified, product_id = reviewSample
    return classifier.classify(toFeatureVector(preProcess(text, rating, verified, product_id)))

In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize

# Run this once: fetch every NLTK resource first, then build the objects
# that preProcess relies on (each resource is downloaded exactly once).
nltk.download('stopwords')   # stopword lists read by stopwords.words below
nltk.download('punkt')       # tokenizer models for word_tokenize
nltk.download('wordnet')     # lexical database for WordNetLemmatizer

porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # a set gives fast membership tests

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from sklearn import preprocessing
# MAIN

# Global lists that loadData and splitData append to.
rawData = []          # parsed reviews from the dataset file (should be 21000 samples)
preprocessedData = [] # (id, preprocessed review, label) entries added by loadData
trainData = []        # featurised training samples (currently 80%, or 16800 samples)
testData = []         # featurised test samples (currently 20%, or 4200 samples)

# the output classes
fakeLabel, realLabel = 'fake', 'real'

# location of the dataset file
reviewPath = 'amazon_reviews.txt'


def datasetSizes():
    """Return the current (rawData, trainData, testData) lengths."""
    return (len(rawData), len(trainData), len(testData))


# Step 1: parse the dataset file into rawData (sizes are printed first).
print("Now %d rawData, %d trainData, %d testData" % datasetSizes(),
      "Preparing the dataset...", sep='\n')
loadData(reviewPath)

# Step 2: split the raw data 80/20 into training and test samples.
print("Now %d rawData, %d trainData, %d testData" % datasetSizes(),
      "Preparing training and test data...", sep='\n')
splitData(0.8)

# Step 3: report the sizes, the number of training samples and the number of
# features after the split.
splitReport = [
    "After split, %d rawData, %d trainData, %d testData" % datasetSizes(),
    "Training Samples: ", len(trainData),
    "Features: ", len(featureDict),
]
print(*splitReport, sep='\n')

# QUESTION 3: cross-validate on the training portion only (10 folds).
cvResults = crossValidate(trainData, 10)
display(cvResults)

print("Done!")


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
0
Training Classifier...




Training Classifier...




Training Classifier...




Training Classifier...




Training Classifier...




Training Classifier...




Training Classifier...




Training Classifier...




Training Classifier...




Training Classifier...




[(0.7721106152024811,
  0.7720238095238096,
  0.7720538064867261,
  77.20238095238095),
 (0.9846188421103192,
  0.9845238095238096,
  0.9845210441900337,
  98.45238095238096),
 (0.9738323661777428,
  0.9738095238095238,
  0.9738085215078749,
  97.38095238095238),
 (0.977982942734907,
  0.9779761904761904,
  0.9779764479935069,
  97.79761904761905),
 (0.9792030456277213,
  0.9791666666666666,
  0.9791678552988425,
  97.91666666666666),
 (0.9774173913043478,
  0.9773809523809524,
  0.977376971957069,
  97.73809523809524),
 (0.9815640292261484, 0.981547619047619, 0.9815471286710225, 98.1547619047619),
 (0.9792020093793713,
  0.9791666666666666,
  0.979167338416546,
  97.91666666666666),
 (0.9774288751645237,
  0.9773809523809524,
  0.9773827480422567,
  97.73809523809524),
 (0.9714533530857719,
  0.9714285714285714,
  0.9714285714285714,
  97.14285714285714)]

Done!


# Evaluate on test set

In [None]:
# Final check: train on all of the training data and score the classifier on
# the held-out test set.
# This only works once every function above is complete.
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    # peek at the first test instance
    print(testData[0])

    # fit on the full training set
    classifier = trainClassifier(trainData)

    # ground-truth labels versus predicted labels for the test set
    testTrue = [sample[1] for sample in testData]
    testPred = predictLabels(testData, classifier)

    # weighted precision / recall / f-score over the test set
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted')
    print("Done training!")
    precision, recall, fscore = finalScores[:3]
    print("Precision: %f\nRecall: %f\nF Score:%f" % (precision, recall, fscore))

({'assort': 1, 'realli': 1, 'hershey': 1, "'s": 1, 'best': 1, 'littl': 1, 'one': 1, 'alway': 1, 'excit': 1, 'whenev': 1, 'holiday': 1, 'come': 1, 'rating': 5, 'verified': 0}, 'fake')
Training Classifier...
Done training!
Precision: 0.740428
Recall: 0.740238
F Score:0.740187




# Question 4

* Code Duplication: In the preProcess function, we are filtering stopwords twice. This redundancy can be eliminated.

* Tokenization: Instead of simply using split(" ") for tokenization, we use a more sophisticated tokenizer, nltk.word_tokenize(), which handles punctuation and other delimiters better.

* Handling Punctuation: The code attempts to remove punctuation using string.punctuation but doesn't use the str.translate method properly, so we change it to a regex-based approach.

* Feature Extraction: When converting to a feature vector, the product ID is used directly. We now prefix it with a specific string (e.g., "PRODUCT_") to distinguish it from other features.

* Numerical Feature Normalization: Features like 'rating' might benefit from normalization so that they are on a similar scale to the other features. We now scale the ratings to lie between 0 and 1.

* Extend Stopword Removal: Reviews might contain common words specific to product reviews that don't contribute much to the classification. Words/phrases like "product", "amazon", "purchase" might be common and not particularly informative.

* Preservation and Removal:
   1. Preserve emoticons: Reviews might contain emoticons, which can be quite informative about the sentiment.
   2. Remove URLs: They are probably not informative for classification.
   3. Handle numbers: We should either remove numbers or handle them differently from ordinary words.

