In [87]:
import nltk
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import plot_confusion_matrix, precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split
import torch
from torchtext.vocab import GloVe
import tqdm
import torch.nn as nn
import torch.nn.functional as F

In [88]:
def regression_report(y_true, y_pred, estimator=""):
    """
        Helper function for error outputs.
        
        Parameters
        ----------
        y_true : numpy.1darray
            true labels for test data
        
        y_pred : numpy.1darray
            predicted labels for test data
        
        estimator : str
            name of estimator (for output purposes)
        
        Returns
        -------
        None
    """
    print(f"F1:  {metrics.f1_score(y_true, y_pred)}")
    print(f"F1 Macro:  {metrics.f1_score(y_true, y_pred, average='macro')}")

    print(f"Accuracy:  {metrics.accuracy_score(y_true, y_pred)}")

In [89]:
glove = GloVe(cache='.', name='6B')

In [90]:
df = pd.DataFrame()

In [91]:
GLOVE_DIM = 300
ZERO_EMBED = torch.zeros(GLOVE_DIM) # GloVe dimensionality
NUM_EXAMPLES = 1000

In [92]:
FILENAME = "yelp_old_slim.txt"

status_list = []
df = pd.DataFrame()

with open(FILENAME, 'r') as f:
    
    index = f.readline().strip()

    with tqdm.tqdm(total=NUM_EXAMPLES) as pbar:
        while index.isnumeric():
            status = f.readline().strip()
            original = f.readline().strip()
            perturbed = f.readline().strip()
            
#             print(f"Currently on: index {index}")

            changed_idxs = []

            if 'FAILED' not in status and 'SKIPPED' not in status:
                orig_tokens = [token.lower() for token in nltk.word_tokenize(original)]
                pert_tokens = [token.lower() for token in nltk.word_tokenize(perturbed)]

                min_len = min(len(orig_tokens), len(pert_tokens))    

                changed_idxs = [i for i in range(min_len) if orig_tokens[i] != pert_tokens[i]]
                
                if len(changed_idxs) <= 10:

                    v1 = ZERO_EMBED
                    v2 = glove[orig_tokens[0]]
                    v3 = glove[orig_tokens[1]]
                    v5 = glove[pert_tokens[0]]
                    example_0 = pd.Series([v1, v2, v3, torch.cat([v1, v2, v3], axis=-1), 0 in changed_idxs, v5])
                    df = df.append(example_0, ignore_index=True)

                    for i in range(1, min_len-1):

                        # (embed left, embed curr, embed right, is_modified)
                        v1 = glove[orig_tokens[i-1]]
                        v2 = glove[orig_tokens[i]]
                        v3 = glove[orig_tokens[i+1]]
                        v5 = glove[pert_tokens[i]]

                        word_data = pd.Series([v1, v2, v3, torch.cat([v1, v2, v3], axis=-1), i in changed_idxs, v5])
                        df = df.append(word_data, ignore_index=True)

                    v1 = glove[orig_tokens[min_len-2]]
                    v2 = glove[orig_tokens[min_len-1]]
                    v3 = ZERO_EMBED
                    v5 = glove[orig_tokens[min_len-1]]
                    example_end = pd.Series([v1, v2, v3, torch.cat([v1, v2, v3], axis=-1), min_len-1 in changed_idxs, v5])
                    df = df.append(example_end, ignore_index=True)

            
            status_list.append(changed_idxs)
            f.readline()
            index = f.readline().strip()
            
            pbar.update(1)

100%|██████████| 1000/1000 [05:46<00:00,  2.89it/s]


In [93]:
df

Unnamed: 0,0,1,2,3,4,5
0,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....","[tensor(0.6684), tensor(-0.1482), tensor(0.003...","[tensor(0.1009), tensor(-0.0449), tensor(-0.36...","[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,"[tensor(0.6684), tensor(-0.1482), tensor(0.003..."
1,"[tensor(0.6684), tensor(-0.1482), tensor(0.003...","[tensor(0.1009), tensor(-0.0449), tensor(-0.36...","[tensor(-0.6805), tensor(-0.1708), tensor(0.04...","[tensor(0.6684), tensor(-0.1482), tensor(0.003...",1.0,"[tensor(-0.1902), tensor(0.1181), tensor(0.287..."
2,"[tensor(0.1009), tensor(-0.0449), tensor(-0.36...","[tensor(-0.6805), tensor(-0.1708), tensor(0.04...","[tensor(-0.2554), tensor(-0.2572), tensor(0.13...","[tensor(0.1009), tensor(-0.0449), tensor(-0.36...",0.0,"[tensor(-0.6805), tensor(-0.1708), tensor(0.04..."
3,"[tensor(-0.6805), tensor(-0.1708), tensor(0.04...","[tensor(-0.2554), tensor(-0.2572), tensor(0.13...","[tensor(-0.1947), tensor(0.1884), tensor(0.117...","[tensor(-0.6805), tensor(-0.1708), tensor(0.04...",0.0,"[tensor(-0.2554), tensor(-0.2572), tensor(0.13..."
4,"[tensor(-0.2554), tensor(-0.2572), tensor(0.13...","[tensor(-0.1947), tensor(0.1884), tensor(0.117...","[tensor(-0.4762), tensor(0.0695), tensor(-0.01...","[tensor(-0.2554), tensor(-0.2572), tensor(0.13...",0.0,"[tensor(-0.1947), tensor(0.1884), tensor(0.117..."
...,...,...,...,...,...,...
40124,"[tensor(-0.3303), tensor(0.4575), tensor(-0.29...","[tensor(-0.6594), tensor(0.2651), tensor(0.166...","[tensor(-0.0067), tensor(0.0233), tensor(0.056...","[tensor(-0.3303), tensor(0.4575), tensor(-0.29...",0.0,"[tensor(-0.6594), tensor(0.2651), tensor(0.166..."
40125,"[tensor(-0.6594), tensor(0.2651), tensor(0.166...","[tensor(-0.0067), tensor(0.0233), tensor(0.056...","[tensor(0.), tensor(0.), tensor(0.), tensor(0....","[tensor(-0.6594), tensor(0.2651), tensor(0.166...",0.0,"[tensor(-0.0067), tensor(0.0233), tensor(0.056..."
40126,"[tensor(-0.0067), tensor(0.0233), tensor(0.056...","[tensor(0.), tensor(0.), tensor(0.), tensor(0....","[tensor(0.0727), tensor(0.1596), tensor(0.0654...","[tensor(-0.0067), tensor(0.0233), tensor(0.056...",1.0,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
40127,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....","[tensor(0.0727), tensor(0.1596), tensor(0.0654...","[tensor(-0.1256), tensor(0.0136), tensor(0.103...","[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,"[tensor(0.0727), tensor(0.1596), tensor(0.0654..."


In [94]:
train_X_df, val_X_df, train_y_w_replaced, val_y_w_replaced = train_test_split(df.iloc[:, 3], df.iloc[:, [4, 5]], test_size=0.2, random_state=200)

In [95]:
train_X = np.stack(train_X_df)
val_X = np.stack(val_X_df)

In [96]:
train_y = np.stack(train_y_w_replaced.iloc[:, 0])
val_y = np.stack(val_y_w_replaced.iloc[:, 0])

In [97]:
lr = LogisticRegression(penalty='l2', class_weight='balanced', max_iter=1000)
lr.fit(train_X, train_y)

LogisticRegression(class_weight='balanced', max_iter=1000)

In [98]:
train_y_pred = lr.predict(train_X)
print(regression_report(train_y, train_y_pred))

F1:  0.30410480349344976
F1 Macro:  0.5765349250616085
Accuracy:  0.7517988972993178
None


In [99]:
val_y_pred = lr.predict(val_X)
print(regression_report(val_y, val_y_pred))

F1:  0.23697011813759555
F1 Macro:  0.535139074553844
Accuracy:  0.7263892349862946
None


In [100]:
np.count_nonzero(val_y_pred)

2381

In [101]:
np.count_nonzero(val_y)

497

In [102]:
np.count_nonzero(train_y_pred)

9382

In [103]:
np.count_nonzero(train_y)

2068

In [104]:
len(val_y)

8026

In [105]:
len(train_y)

32103

In [80]:
torch.save(lr, "lr_prediction_model")

# Generation

In [56]:
#lr = torch.load("lr_model")

In [106]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [147]:
class LogRegGeneration(nn.Module):
    def __init__(self):
        super(LogRegGeneration, self).__init__()
        self.linear1 = nn.Linear(GLOVE_DIM * 3, GLOVE_DIM) # 3 concatenated tokens
    
    def forward(self, x):
        return self.linear1(x)
    
weight_seed = 200
torch.manual_seed(weight_seed)
model = LogRegGeneration()
model.to(device)

LogRegGeneration(
  (linear1): Linear(in_features=900, out_features=300, bias=True)
)

In [108]:
import torch.optim as optim
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-4)

In [155]:
df_perturbed = df[df[4] == 1]

In [157]:
df_X = df_perturbed.iloc[:, 3]

In [158]:
df_y = df_perturbed.iloc[:, 5]

In [161]:
X_train, X_val, y_train, y_val = train_test_split(torch.Tensor(np.stack(df_X)), torch.Tensor(np.stack(df_y)), test_size=0.2, random_state=200)

In [162]:
X_train.shape

torch.Size([2052, 900])

In [163]:
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)

In [164]:
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)

In [168]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=20, shuffle=True)

In [169]:
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=20, shuffle=True)

In [170]:
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, (X, y) in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        X = X.to(device)
        y = y.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 200 == 199:    # print every 200 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0

print('Finished Training')

Finished Training


In [174]:
glove_lengths = torch.sqrt((glove.vectors ** 2).sum(dim=1))

def closest_cosine(vec):
    numerator = (glove.vectors * vec).sum(dim=1)
    denominator = glove_lengths * torch.sqrt((vec ** 2).sum())
    similarities = numerator / denominator
    return glove.itos[similarities.argmax()]

# Tests

In [189]:
closest_cosine(df_perturbed.iloc[5, 5])

'awesome'

In [188]:
test = df_perturbed.iloc[4, 3].to(device)
ans = model(test).to('cpu')
closest_cosine(ans)

'brutality'

# How to Generate

In [193]:
def generate(left_word, curr_word, right_word):
    orig_vector = torch.cat((glove[left_word], glove[curr_word], glove[right_word])).to(device)
    new_vector = model(orig_vector).to('cpu')
    return closest_cosine(new_vector)

In [194]:
print(generate("I", "saw", "him"))

acted


In [197]:
print(generate("I", "ate", "dinner"))

wibulswasdi


In [None]:
torch.save(model.state_dict(), "lr_generation_model")