In [1]:
import re
import pandas as df
import numpy as np
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import pickle

import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/john/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Code to Help With Processing Data

In [2]:
def processWord(word):
    """Return ``word`` with every run of regex non-word characters removed."""
    return re.sub(r'\W+', '', word)

def tokenize(x, y, onehot_dict):
    """Turn review text into vocabulary indices and labels into 0/1 integers.

    Parameters
    ----------
    x : DataFrame
        Must contain a 'Review' column; each value is converted with ``str()``,
        lower-cased and split on whitespace.
    y : DataFrame
        Must contain a 'Generated' column.
    onehot_dict : dict
        Maps a cleaned, lower-case word to its integer index. Words that are
        not keys of this dictionary are silently dropped.

    Returns
    -------
    (tokenized_x, encoded_y)
        ``tokenized_x`` is a 1-D object array holding one list of indices per
        review (lists may differ in length or be empty); ``encoded_y`` is an
        array that is 1 where 'Generated' equals 1 and 0 otherwise.
    """
    tokenized_x = []
    for sentence in x['Review'].values:
        # Clean every word once, then reuse it for both the membership test
        # and the lookup.
        cleaned_words = [processWord(word) for word in str(sentence).lower().split()]
        tokenized_x.append([onehot_dict[word] for word in cleaned_words if word in onehot_dict])

    # Reviews have different token counts. np.array() on such a ragged list
    # raises ValueError on NumPy >= 1.24, so fill a 1-D object array by hand;
    # this also keeps the result 1-D when all reviews happen to be equally long.
    ragged = np.empty(len(tokenized_x), dtype=object)
    for row, tokens in enumerate(tokenized_x):
        ragged[row] = tokens

    encoded_y = [1 if generation_status == 1 else 0 for generation_status in y['Generated'].values]
    return ragged, np.array(encoded_y)

### Import OneHot Dicts

In [3]:
def _read_onehot(pickle_path):
    """Deserialize one word-to-index dictionary from ``pickle_path``."""
    # NOTE(review): pickle.load can execute arbitrary code; only open files
    # that were produced by this project.
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)

# One vocabulary per trained model variant.
osf_onehot_dict = _read_onehot("onehot_dicts/osf_onehot.pkl")
imbalanced_onehot_dict = _read_onehot("onehot_dicts/imbalanced_onehot.pkl")
oversampled_onehot_dict = _read_onehot("onehot_dicts/oversampled_onehot.pkl")
undersampled_onehot_dict = _read_onehot("onehot_dicts/undersampled_onehot.pkl")

### Import Small Amount of Data

In [4]:
# Each partition is split into a feature frame (label column removed) and a
# label frame (review text removed).
OSFTest = df.read_csv('partitions/smallOSFTest.csv')
OSFXtest, OSFytest = (OSFTest.drop(columns=removed) for removed in ("Generated", "Review"))

combinedTest = df.read_csv('partitions/smallCombinedTest.csv')
combinedXtest, combinedYtest = (combinedTest.drop(columns=removed) for removed in ("Generated", "Review"))

In [5]:
# Every model has its own vocabulary, so each test set is tokenized once per
# model with that model's one-hot dictionary.

# Input for OSF model to test on Small Combined Imbalanced Test Dataset
osf_combinedXtest_input, osf_combinedYtest_input = tokenize(combinedXtest, combinedYtest, osf_onehot_dict)
# Input for Imbalanced model to test on Small Combined Imbalanced Test Dataset
imbalanced_combinedXtest_input, imbalanced_combinedYtest_input = tokenize(combinedXtest, combinedYtest, imbalanced_onehot_dict)
# Input for Oversampled model to test on Small Combined Imbalanced Test Dataset
oversampled_combinedXtest_input, oversampled_combinedYtest_input = tokenize(combinedXtest, combinedYtest, oversampled_onehot_dict)
# Input for Undersampled model to test on Small Combined Imbalanced Test Dataset
undersampled_combinedXtest_input, undersampled_combinedYtest_input = tokenize(combinedXtest, combinedYtest, undersampled_onehot_dict)

# Input for OSF model to test on Small OSF Test dataset
osf_OSFXtest, osf_OSFYtest = tokenize(OSFXtest, OSFytest, osf_onehot_dict)
# Input for Imbalanced model to test on Small OSF Test dataset
imbalanced_OSFXtest, imbalanced_OSFYtest = tokenize(OSFXtest, OSFytest, imbalanced_onehot_dict)
# Input for Oversampled model to test on Small OSF Test dataset
oversampled_OSFXtest, oversampled_OSFYtest = tokenize(OSFXtest, OSFytest, oversampled_onehot_dict)
# Input for Undersampled model to test on Small OSF Test dataset
undersampled_OSFXtest, undersampled_OSFYtest = tokenize(OSFXtest, OSFytest, undersampled_onehot_dict)

## Pad Input Vectors

In [6]:
def padVectors(sentences, length):
    """Build a ``(len(sentences), length)`` integer matrix from index sequences.

    Sequences shorter than ``length`` are right-aligned and filled with zeros
    on the left; longer sequences keep only their first ``length`` entries.
    Empty sequences produce an all-zero row.
    """
    padded = np.zeros((len(sentences), length), dtype=int)
    for row, tokens in enumerate(sentences):
        if len(tokens) == 0:
            continue  # nothing to copy; the row stays all zeros
        keep = min(length, len(tokens))
        # Write the kept prefix into the right-most ``keep`` columns.
        padded[row, length - keep:] = np.array(tokens)[:keep]
    return padded

In [7]:
# Every tokenized input is padded or truncated to the same fixed length.
PADDED_LENGTH = 250

osf_combinedXtest_padded = padVectors(osf_combinedXtest_input, PADDED_LENGTH)
imbalanced_combinedXtest_padded = padVectors(imbalanced_combinedXtest_input, PADDED_LENGTH)
oversampled_combinedXtest_padded = padVectors(oversampled_combinedXtest_input, PADDED_LENGTH)
undersampled_combinedXtest_padded = padVectors(undersampled_combinedXtest_input, PADDED_LENGTH)

osf_OSFXtest_padded = padVectors(osf_OSFXtest, PADDED_LENGTH)
imbalanced_OSFXtest_padded = padVectors(imbalanced_OSFXtest, PADDED_LENGTH)
oversampled_OSFXtest_padded = padVectors(oversampled_OSFXtest, PADDED_LENGTH)
undersampled_OSFXtest_padded = padVectors(undersampled_OSFXtest, PADDED_LENGTH)

### Create DataLoaders

In [8]:
def _to_dataset(padded_x, encoded_y):
    """Pair a padded index matrix with its label vector in a TensorDataset."""
    return torch.utils.data.TensorDataset(torch.from_numpy(padded_x), torch.from_numpy(encoded_y))

osf_combined = _to_dataset(osf_combinedXtest_padded, osf_combinedYtest_input)
imbalanced_combined = _to_dataset(imbalanced_combinedXtest_padded, imbalanced_combinedYtest_input)
oversampled_combined = _to_dataset(oversampled_combinedXtest_padded, oversampled_combinedYtest_input)
undersampled_combined = _to_dataset(undersampled_combinedXtest_padded, undersampled_combinedYtest_input)

osf_OSF = _to_dataset(osf_OSFXtest_padded, osf_OSFYtest)
imbalanced_OSF = _to_dataset(imbalanced_OSFXtest_padded, imbalanced_OSFYtest)
oversampled_OSF = _to_dataset(oversampled_OSFXtest_padded, oversampled_OSFYtest)
undersampled_OSF = _to_dataset(undersampled_OSFXtest_padded, undersampled_OSFYtest)

# batch size for dataloaders
BATCH_SIZE = 64

def _to_loader(dataset):
    """Create an unshuffled DataLoader that yields only full BATCH_SIZE batches."""
    # drop_last=True discards a trailing partial batch, so every batch matches
    # the hidden-state size that do_test_stats allocates with BATCH_SIZE.
    return torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, drop_last=True)

osf_combined_loader = _to_loader(osf_combined)
imbalanced_combined_loader = _to_loader(imbalanced_combined)
oversampled_combined_loader = _to_loader(oversampled_combined)
undersampled_combined_loader = _to_loader(undersampled_combined)

osf_OSF_loader = _to_loader(osf_OSF)
imbalanced_OSF_loader = _to_loader(imbalanced_OSF)
oversampled_OSF_loader = _to_loader(oversampled_OSF)
undersampled_OSF_loader = _to_loader(undersampled_OSF)

### Model Class

In [9]:
is_cuda = torch.cuda.is_available()

# Use the GPU when one is available, otherwise fall back to the CPU. The
# resulting `device` is reused by later cells.
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU not available, CPU used


In [10]:
class ReviewClassifier(nn.Module):
    """Binary sequence classifier: Embedding -> single-layer LSTM -> Linear -> Sigmoid.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    embedding_dim : int
        Size of each embedding vector (the LSTM input size).
    hidden_dim : int
        Size of the LSTM hidden state.
    """
    def __init__(self,vocab_size,embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            batch_first=True)
        self.fc = nn.Linear(self.hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x,hidden):
        """Run one batch through the network.

        Parameters
        ----------
        x : tensor of token indices whose first dimension is the batch
            (assumed shape ``(batch, seq_len)`` - TODO confirm against callers).
        hidden : tuple ``(h, c)`` as produced by ``init_hidden`` or returned
            by a previous call.

        Returns
        -------
        (sig_out, hidden)
            ``sig_out`` holds one sigmoid output per sequence, taken at the
            final time step; ``hidden`` is the updated LSTM state.
        """
        batch_size = x.size(0)
        # embedding layer
        embeds = self.embedding(x)
        # LSTM layer
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Flatten batch and time so the linear layer scores every time step.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # Feed Forward Network
        out = self.fc(lstm_out)
        # sigmoid function
        sig_out = self.sigmoid(out)
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        # Keep only the output of the last time step of each sequence.
        sig_out = sig_out[:, -1]
        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` tensors of shape ``(1, batch_size, hidden_dim)``.

        The tensors are created on the device that currently holds the model's
        weights, so the method does not rely on any module-level ``device``.
        """
        weights_device = self.fc.weight.device
        h0 = torch.zeros((1, batch_size, self.hidden_dim), device=weights_device)
        c0 = torch.zeros((1, batch_size, self.hidden_dim), device=weights_device)
        return (h0,c0)

In [11]:
# Binary cross-entropy computed on probabilities; ReviewClassifier.forward
# already applies a sigmoid, so its outputs can be passed in directly.
criterion = nn.BCELoss()

### Code for Calculating Accuracy, False Negatives and False Positives

In [12]:
def acc(pred, label):
    """Count the predictions that equal their label after rounding.

    Both tensors are squeezed first. The result is a plain Python number
    (a count, not a ratio).
    """
    rounded = pred.squeeze().round()
    matches = rounded == label.squeeze()
    return matches.sum().item()

def return_misses(inputs, output, labels, vocab):
    """Collect the wrongly classified samples of one batch.

    Parameters
    ----------
    inputs : tensor of token indices, one row per sample.
    output : tensor of model outputs; rounded to obtain the predicted class.
    labels : tensor of true classes.
    vocab : dict mapping a token index back to its word.

    Returns
    -------
    (misses, miss_index, false_negative, false_positive)
        ``misses`` holds the space-joined words of each wrong sample (index 0
        is skipped), ``miss_index`` their positions inside the batch.
        A wrong prediction of 0 is counted as ``false_positive`` and a wrong
        prediction of 1 as ``false_negative``.
        NOTE(review): this naming treats class 0 as the "positive" class -
        confirm that is intended.
    """
    predicted = torch.round(output.squeeze())
    actual = labels.squeeze()
    misses, miss_index = [], []
    false_negative = false_positive = 0
    for position in range(len(predicted)):
        guess = predicted[position]
        if not bool(guess != actual[position]):
            continue  # correctly classified
        if guess == 0:
            false_positive += 1
        else:
            false_negative += 1
        # Rebuild the text from the indices, leaving out index 0.
        words = [vocab[token] for token in inputs[position].tolist() if token != 0]
        misses.append(' '.join(words))
        miss_index.append(position)
    return misses, miss_index, false_negative, false_positive

def do_test_stats(model, vocabulary, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print loss, accuracy and error rates.

    Relies on the module-level ``BATCH_SIZE``, ``device``, ``criterion``,
    ``return_misses`` and ``acc``.

    Parameters
    ----------
    model : object providing ``init_hidden(batch_size)``, ``eval()`` and a
        call signature ``model(inputs, hidden) -> (output, hidden)``.
    vocabulary : dict mapping word -> index; inverted here to turn indices
        back into words for the misclassified samples.
    test_loader : iterable of ``(inputs, labels)`` batches.

    Returns
    -------
    (tot_outputs, tot_labels, tot_misses, miss_indices)
        Concatenated model outputs and labels of all batches, the text of every
        misclassified sample, and the position of each of those samples counted
        over all samples yielded by the loader.

    Raises
    ------
    ValueError
        If ``test_loader`` yields no batch at all.
    """
    test_h = model.init_hidden(BATCH_SIZE)
    model.eval()
    vocab_r = {v: k for k, v in vocabulary.items()}

    test_losses = []
    correct = 0
    false_positives = 0
    false_negatives = 0
    tot_misses = []
    miss_indices = []
    batch_outputs = []
    batch_labels = []
    seen = 0  # samples processed so far; also the offset of the current batch
    with torch.no_grad():
        for inputs, labels in test_loader:
            # The hidden state returned by the previous batch is reused as the
            # starting state of this one, detached from its history.
            test_h = tuple(each.detach() for each in test_h)

            inputs, labels = inputs.to(device), labels.to(device)

            output, test_h = model(inputs, test_h)
            test_loss = criterion(output.squeeze(), labels.float())
            test_losses.append(test_loss.item())

            misses, miss_index, fneg, fpos = return_misses(inputs, output, labels, vocab_r)
            tot_misses.extend(misses)
            # Offset by the real number of samples seen instead of a
            # hard-coded batch size of 64.
            miss_indices.extend(seen + i for i in miss_index)
            false_positives += fpos
            false_negatives += fneg
            correct += acc(output, labels)

            batch_outputs.append(output)
            batch_labels.append(labels)
            seen += inputs.size(0)

    if seen == 0:
        # With drop_last=True this happens when the dataset is smaller than
        # one batch; the statistics below would otherwise divide by zero.
        raise ValueError("test_loader yielded no batches; cannot compute test statistics")

    test_loss = np.mean(test_losses)
    test_acc = correct / seen
    print(f'test_loss : {test_loss}')
    print(f'test_accuracy : {test_acc*100}')
    print(f'false_positive : {false_positives / seen} ')
    print(f'false_negatives : {false_negatives / seen} ')
    return torch.cat(batch_outputs), torch.cat(batch_labels), tot_misses, miss_indices

## Method To Run Model on Test Sets

In [13]:
def test_model(one_hot_dict, data_loader, model_state_dict_name, testX, testY):
    """Load a saved ReviewClassifier, evaluate it and print up to four misclassified reviews.

    Relies on the module-level ``device``, ``ReviewClassifier`` and
    ``do_test_stats``.

    Parameters
    ----------
    one_hot_dict : dict mapping word -> index, the vocabulary the checkpoint
        was trained with; its size determines the embedding table size.
    data_loader : batches to evaluate, passed through to ``do_test_stats``.
    model_state_dict_name : path of the saved ``state_dict``.
    testX, testY : DataFrames whose first column holds the review text and the
        label respectively; rows are looked up by position, so they must be in
        the same order as the samples in ``data_loader``.
    """
    # +1 leaves room for index 0 in the embedding table
    # (assumes dictionary indices start at 1 - TODO confirm).
    vocabulary_size = len(one_hot_dict)+1
    embedding_dim = 64
    hidden_dim = 256

    # Import Model
    model = ReviewClassifier(vocabulary_size, embedding_dim, hidden_dim)
    # map_location remaps the stored tensors onto the active device, so one
    # call covers both the GPU and the CPU-only case.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load(model_state_dict_name, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    # Do Test Stats
    tot_outputs, _, _, miss_indices = do_test_stats(model, one_hot_dict, data_loader)
    print("\nLabel of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs")

    for i in miss_indices[:4]:
        print("\nLabel:", testY.iat[i,0])
        print("Model Prediction:", round(tot_outputs[i].item()))
        print("Input Sentence:", testX.iat[i, 0])

### Testing the Models on the smallOSFTest.csv Dataset (Note: this dataset is small, so the accuracy does not reflect performance on the entire dataset)

#### Test Model Trained on OSF Dataset on Small OSF Test Set

In [14]:
# OSF-trained checkpoint and OSF vocabulary, evaluated on the small OSF test partition.
test_model(osf_onehot_dict, osf_OSF_loader, "state_dicts/best_acc_OSF_state_dict.pt", OSFXtest, OSFytest)

test_loss : 0.18482580284277597
test_accuracy : 92.70833333333334
false_positive : 0.026041666666666668 
false_negatives : 0.046875 

Label of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs

Label: 1
Model Prediction: 0
Input Sentence: Easy to use and works great I read reviews on this product all over the web before buying it and agree very good product

Label: 0
Model Prediction: 1
Input Sentence: Maddix is out there with a bunch of crazy people He is a detective a crimefighter a

Label: 0
Model Prediction: 1
Input Sentence: Our church has several fundraisers and an outdoor garden area I will be purchasing a new one for the garden area  The house is designed to be a very attractive place to live and provide a nice environment to live in  The outdoor garden area is very well made and has several sun and humidity controls  I will be buying a second one for the backyard area as well  I will be posting pictures of the outdoor gar

#### Test Model Trained on Imbalanced Combined Dataset on Small OSF Test Set

In [15]:
# Imbalanced-trained checkpoint and its vocabulary, evaluated on the small OSF test partition.
test_model(imbalanced_onehot_dict, imbalanced_OSF_loader, "state_dicts/best_acc_imbalanced_state_dict.pt", OSFXtest, OSFytest)

test_loss : 0.33766421178976697
test_accuracy : 86.97916666666666
false_positive : 0.041666666666666664 
false_negatives : 0.08854166666666667 

Label of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs

Label: 1
Model Prediction: 0
Input Sentence: Easy to use and works great I read reviews on this product all over the web before buying it and agree very good product

Label: 1
Model Prediction: 0
Input Sentence: Very great read keeps you on the edge of your seat its a must read book I would advise everyone to get and read so far so good

Label: 0
Model Prediction: 1
Input Sentence: Maddix is out there with a bunch of crazy people He is a detective a crimefighter a

Label: 0
Model Prediction: 1
Input Sentence: Great look nice by Im a size 10 5 I normally wear a medium in jeans and this is a perfect fit I am a 32D and the waist is very large


#### Test Model Trained on Oversampled Combined Dataset on Small OSF Test Set

In [16]:
# Oversampled-trained checkpoint and its vocabulary, evaluated on the small OSF test partition.
test_model(oversampled_onehot_dict, oversampled_OSF_loader, "state_dicts/best_acc_oversampled_state_dict.pt", OSFXtest, OSFytest)

test_loss : 0.4014149059851964
test_accuracy : 92.96875
false_positive : 0.06510416666666667 
false_negatives : 0.005208333333333333 

Label of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs

Label: 1
Model Prediction: 0
Input Sentence: this is a great spotlightits very bright an cast a concentrated beam for a long distancei feel this spotlight will be great for many uses an would recomend to anybody

Label: 1
Model Prediction: 0
Input Sentence: Somewhat hard to clean not as convenient as I had hoped

Label: 1
Model Prediction: 0
Input Sentence: This car beats the pants off the other remote control toys because1 it lights up2 big soft rubber wheels dont cause damage when they bump furniture3 the twisting action is cool4 gets itself out of trouble5 it lights upThis toy is the best bang for the buck  gift for a 9year old and hit of the party and beyond

Label: 1
Model Prediction: 0
Input Sentence: My 5 year old grandson loves the

#### Test Model Trained on Undersampled Combined Dataset on Small OSF Test Set

In [17]:
# Undersampled-trained checkpoint and its vocabulary, evaluated on the small OSF test partition.
test_model(undersampled_onehot_dict, undersampled_OSF_loader, "state_dicts/best_acc_undersampled_state_dict.pt", OSFXtest, OSFytest)

test_loss : 0.7437137961387634
test_accuracy : 61.71875
false_positive : 0.3697916666666667 
false_negatives : 0.013020833333333334 

Label of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs

Label: 1
Model Prediction: 0
Input Sentence: Works really good Keep the usb 30 fast meaning it is real usb 30 cable

Label: 1
Model Prediction: 0
Input Sentence: Easy to use and works great I read reviews on this product all over the web before buying it and agree very good product

Label: 1
Model Prediction: 0
Input Sentence: We were really worried this would not work but it works great you just screw your bulb in the adapter and plug it into your weird lamp and voila

Label: 1
Model Prediction: 0
Input Sentence: Very great read keeps you on the edge of your seat its a must read book I would advise everyone to get and read so far so good


### Testing the Models on the smallCombinedTest.csv Dataset (Note: this dataset is small, so the accuracy does not reflect performance on the entire dataset)

#### Test Model Trained on OSF Dataset on Small, Imbalanced, Combined Test Set

In [18]:
# OSF-trained checkpoint and OSF vocabulary, evaluated on the small combined test partition.
test_model(osf_onehot_dict, osf_combined_loader, "state_dicts/best_acc_OSF_state_dict.pt", combinedXtest, combinedYtest)

test_loss : 1.014183759689331
test_accuracy : 86.45833333333334
false_positive : 0.015625 
false_negatives : 0.11979166666666667 

Label of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs

Label: 0
Model Prediction: 1
Input Sentence: Terrific hearty Italian food in an awesome atmosphere

Label: 0
Model Prediction: 1
Input Sentence: I used to love the Raspberry Habernero wings but now they taste differnet im very dissapointed i dont know why this is There are very few good people in the waitstaff and sometimes ive heard yelling from the kitchen Sounds like the chef or manager back there needs a visit from the dept of labor Get a pizza they are fantastic and ask for Will to be your server

Label: 0
Model Prediction: 1
Input Sentence: I went in to try this place out I wasnt sure what to expect First the decor and atmosphere was really cool I felt very comfortable I sat at the Bar to eat only to be greeted by this Beautful redhead i

#### Test Model Trained on Imbalanced Combined Dataset on Small, Imbalanced, Combined Test Set

In [19]:
# Imbalanced-trained checkpoint and its vocabulary, evaluated on the small combined test partition.
test_model(imbalanced_onehot_dict, imbalanced_combined_loader, "state_dicts/best_acc_imbalanced_state_dict.pt", combinedXtest, combinedYtest)

test_loss : 0.35364994406700134
test_accuracy : 86.97916666666666
false_positive : 0.005208333333333333 
false_negatives : 0.125 

Label of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs

Label: 0
Model Prediction: 1
Input Sentence: Terrific hearty Italian food in an awesome atmosphere

Label: 0
Model Prediction: 1
Input Sentence: I used to love the Raspberry Habernero wings but now they taste differnet im very dissapointed i dont know why this is There are very few good people in the waitstaff and sometimes ive heard yelling from the kitchen Sounds like the chef or manager back there needs a visit from the dept of labor Get a pizza they are fantastic and ask for Will to be your server

Label: 0
Model Prediction: 1
Input Sentence: Went againstill love it as much as the first time and every time after

Label: 0
Model Prediction: 1
Input Sentence: I went in to try this place out I wasnt sure what to expect First the decor and atm

#### Test Model Trained on Oversampled Combined Dataset on Small, Imbalanced, Combined Test Set

In [20]:
# Oversampled-trained checkpoint and its vocabulary, evaluated on the small combined test partition.
test_model(oversampled_onehot_dict, oversampled_combined_loader, "state_dicts/best_acc_oversampled_state_dict.pt",  combinedXtest, combinedYtest)

test_loss : 0.29389676948388416
test_accuracy : 90.36458333333334
false_positive : 0.08854166666666667 
false_negatives : 0.0078125 

Label of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs

Label: 1
Model Prediction: 0
Input Sentence: The place is cute and cozy and right in the heart of one of my favorite neighborhoods in the city Its a great place to stop in late and have a dessert and tea The staff are extremely friendly and make you feel right at home My wife and I enjoy Cluny when we are in the area

Label: 1
Model Prediction: 0
Input Sentence: Love this place Food is yummy But Im definitely giving a chunk of my paycheck to this restaurant Lol Worth it

Label: 1
Model Prediction: 0
Input Sentence: One of the best pizzas Ive ever eaten So so good Need to go back asap Lines are long so I would suggest making a reservation Its also BYOB bring a bottle of wine so that you can enjoy it with the delicious pizza 

Label: 1
Model 

#### Test Model Trained on Undersampled Combined Dataset on Small, Imbalanced, Combined Test Set

In [21]:
# Undersampled-trained checkpoint and its vocabulary, evaluated on the small combined test partition.
test_model(undersampled_onehot_dict, undersampled_combined_loader, "state_dicts/best_acc_undersampled_state_dict.pt", combinedXtest, combinedYtest)

test_loss : 0.5755191991726557
test_accuracy : 67.44791666666666
false_positive : 0.28125 
false_negatives : 0.044270833333333336 

Label of 1 means Real Review. Label of 0 means Fake or Computer Generated. Printing 4 of the miss predicted inputs

Label: 1
Model Prediction: 0
Input Sentence: The place is cute and cozy and right in the heart of one of my favorite neighborhoods in the city Its a great place to stop in late and have a dessert and tea The staff are extremely friendly and make you feel right at home My wife and I enjoy Cluny when we are in the area

Label: 1
Model Prediction: 0
Input Sentence: We found our way here in search of ceviche and left very satisfied Definitely try the ceviche obv lobster taquitos and sangria braised short ribs To top it off the service was excellent Well definitely be back

Label: 1
Model Prediction: 0
Input Sentence: Love this place Food is yummy But Im definitely giving a chunk of my paycheck to this restaurant Lol Worth it

Label: 1
Model Predi