# CODE_Transferaufgabe

In [56]:
from flair.data import Sentence
from flair.embeddings import BertEmbeddings, ELMoEmbeddings, FastTextEmbeddings, StackedEmbeddings, TransformerWordEmbeddings, WordEmbeddings

from nltk import word_tokenize
from nltk.corpus import stopwords
punctuation = ['!', '#','$','%','&', "'", '(',')','*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
               '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '`', '``', 'wurde', 'wurden']
import numpy as np
import pandas as pd
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
from torchtext import data, datasets

In [9]:
SMALL_SPLIT = True

In [10]:
corpus = pd.read_csv("../corpora/small_amazon_reviews_electronic.csv")
corpus.head(3)

Unnamed: 0,rating,name,review,verified,vote,date
0,1.0,Mike L,Bought for Christmas present for my Grandson h...,True,0.0,01.02.2018
1,1.0,Gustavo Villalta Woltke,Broken in months,True,0.0,23.05.2018
2,1.0,David,The latest driver for this product on the Asus...,False,0.0,15.05.2018


In [11]:
corpus.shape

(75000, 6)

## HELPER FUNCTIONS

In [12]:
def df_to_jsonl(df, filename, text_col="review", output_path="../corpora/splits/"):
    """ DataFrame with text column to Json Line Format. """

    df[text_col] = df.apply(lambda row: word_tokenize(row[text_col]), axis=1)
    df.to_json(f"{output_path}{filename}.json", orient='records', lines=True)

In [13]:
def split_corpus(corpus, 
                 text_col = "review", 
                 label_col = "rating", 
                 split = 0.8,
                 output_path = "../corpora/splits/"):
    """ Splits corpus in Train, Val and Test set and saves them 
        as jsonl files.
    """
    
    X_train, X_remain = train_test_split(corpus, 
                                         train_size=split,
                                         stratify=corpus[label_col])

    val_test_split = int((corpus.shape[0] * 0.2)/2)
    X_val = X_remain[:val_test_split]
    X_test = X_remain[val_test_split:]



    df_to_jsonl(X_train, "train", text_col = text_col, output_path = output_path)
    df_to_jsonl(X_val, "val")
    df_to_jsonl(X_test, "test")

In [14]:
def categorical_accuracy(preds, y):
    """ Returns accuracy per batch. """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

In [15]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## MAIN CODE

### split code

In [17]:
if SMALL_SPLIT:
    corpus = corpus.sample(1000)
    split_corpus(corpus, output_path = "../corpora/splitssmall/")
else:
    split_corpus(corpus, split=0.6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### hyperparams and settings

In [46]:
MAX_VOCAB_SIZE = 10000
BATCH_SIZE = 64
N_EPOCHS = 2

EMBEDDING_TYPE = "glove.6B.100d"

### preprocessing

In [48]:
stop_words = stopwords.words('english') + punctuation 

REVIEW = data.Field(tokenize = "toktok",
                    lower = True,
                    stop_words=stop_words)

RATING = data.LabelField()
assigned_fields = {"review": ('text', REVIEW), 
                   "rating": ('label', RATING)}

train_data, val_data, test_data = data.TabularDataset.splits(path="../corpora/splits/", 
                                                              train='train.json',
                                                              validation='val.json', 
                                                              test='test.json', 
                                                              format='json',
                                                              fields=assigned_fields,
                                                              skip_header = True)



REVIEW.build_vocab(train_data, 
                   #vectors = EMBEDDING_TYPE, 
                   #unk_init = torch.Tensor.normal_,
                   max_size = MAX_VOCAB_SIZE)
RATING.build_vocab(train_data)



device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, val_iterator, test_iterator = data.BucketIterator.splits((train_data, val_data, test_data), 
                                                                         batch_size = BATCH_SIZE,
                                                                         device = device,
                                                                         sort_key = lambda x: len(x.text),
                                                                         sort = False,
                                                                         sort_within_batch=False)

### creating embedding vectors

In [52]:
sentence = Sentence(' '.join(REVIEW.vocab.itos))
glove_embeddings = WordEmbeddings('glove').embed(sentence)
#vectors = torch.tensor(2)
#embedding = nn.Embedding.from_pretrained(vectors)

### cnn model

In [245]:
class CNN(nn.Module): #erbt immer von nn.Module
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        text = text.permute(1, 0)
        embedded = self.embedding(text)  
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim = 1))
            
        return self.fc(cat)

In [246]:
INPUT_DIM = len(REVIEW.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = len(RATING.vocab)
DROPOUT = 0.5
PAD_IDX = REVIEW.vocab.stoi[RATING.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)


OPTIMIZER = optim.Adam(model.parameters())
CRITERION = nn.CrossEntropyLoss()

### init embeddings

In [247]:
pretrained_embeddings = REVIEW.vocab.vectors #load embeddings
#model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = REVIEW.vocab.stoi[REVIEW.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [248]:
# put model and loss criterion to device (cpu or gpu)
model = model.to(device)
criterion = criterion.to(device)

### train function

In [265]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train() # Model lernt was (Zustand des Modells)
    
    for batch in iterator:
        
        optimizer.zero_grad() # Gradienten müssen auf 0 gesetzt werden
        predictions = model(batch.text)
        loss = criterion(predictions, batch.label) # berechnet loss (y-y^)
        acc = categorical_accuracy(predictions, batch.label)
        loss.backward() # sammelt Gradienten für jeden Parameter
        optimizer.step() # updated parameter basierend auf den Gradienten
        
        epoch_loss += loss.item() # item extrahiert den loss Wert
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### eval function

In [264]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval() # Gewichte sind hier eingefroren
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text)
            loss = criterion(predictions, batch.label)
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### actual training

In [251]:
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, OPTIMIZER, CRITERION)
    valid_loss, valid_acc = evaluate(model, val_iterator, CRITERION)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # TODO: string anpassen
        torch.save(model.state_dict(), 'savefiles/cnnmodel.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 2m 23s
	Train Loss: 1.876 | Train Acc: 20.88%
	 Val. Loss: 1.767 |  Val. Acc: 20.75%
Epoch: 02 | Epoch Time: 1m 54s
	Train Loss: 1.868 | Train Acc: 20.46%
	 Val. Loss: 1.767 |  Val. Acc: 20.75%


In [213]:
model.load_state_dict(torch.load('savefiles/cnnmodel.pt'))
test_loss, test_acc = evaluate(model, test_iterator, CRITERION)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.880 | Test Acc: 17.24%


In [221]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_class(model, sentence, min_len = 4):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [REVIEW.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    preds = model(tensor)
    max_preds = preds.argmax(dim = 1)
    return max_preds.item()

In [237]:
pred_class = predict_class(model, "")
print(f'Predicted class is: {pred_class}')

Predicted class is: 1


In [117]:
example = vars(train_data.examples[0])
print(example["review"][:20])
print()
print(f"Einzigarte Tokens im REVIEW-Vokabular: {len(REVIEW.vocab)}")
print(f"Einzigarte Tokens im RATING-Vokabular: {len(RATING.vocab)}")
print()
print(REVIEW.vocab.freqs.most_common(20))
print()
print(REVIEW.vocab.itos[:10])
print()
batch = next(iter(train_iterator))
print(batch.review)
print()
print(batch.rating)

['works', 'as', 'advertisedbut', 'really', 'do', 'not', 'use', 'it', 'that', 'much', 'maybe', 'some', 'handy', 'the', 'future']

Einzigarte Tokens im REVIEW-Vokabular: 25002
Einzigarte Tokens im RATING-Vokabular: 5

[('the', 155453), ('i', 94375), ('to', 84631), ('it', 76278), ('a', 75353), ('and', 75229), ('not', 57672), ('is', 50266), ('for', 39894), ('this', 39379), ('of', 39150), ('my', 31541), ('that', 29659), ('but', 28925), ('with', 28467), ('on', 27336), ('have', 26603), ('you', 23607), ('as', 18550), ('are', 16893)]

['<unk>', '<pad>', 'the', 'i', 'to', 'it', 'a', 'and', 'not', 'is']

tensor([[342,  11,  41,  ...,   8, 149,   3],
        [ 56,   9,  28,  ..., 708,   1, 152],
        [ 24,  13,   1,  ..., 148,   1,  86],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])

tensor([1, 1, 4, 2, 0, 2, 0, 3, 2, 4, 0, 3, 1, 3, 3, 1, 4, 1, 4, 4, 3, 3, 0, 1,
        4, 3, 1, 1, 1, 0, 2,

In [100]:
import numpy as np

In [101]:
X = np.array(([[3,5], [5, 1], [10,2]]), dtype=float)
y = np.array(([75], [82], [93]), dtype=float)

In [102]:
# normalize

X = X / np.amax(X, axis=0)
y = y/100

In [103]:
X

array([[0.3, 1. ],
       [0.5, 0.2],
       [1. , 0.4]])

In [104]:
y

array([[0.75],
       [0.82],
       [0.93]])

In [105]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoidPrime(x):
    return np.exp(-x)/((1+np.exp(-x))**2)


class NN(object):
    def __init__(self):
        self.inputLayerSize = 2
        self.hiddenLayerSize = 3
        self.outputLayerSize = 1
        
        self.W1 = np.random.rand(self.inputLayerSize, self.hiddenLayerSize)
        self.W2 = np.random.rand(self.hiddenLayerSize, self.outputLayerSize)
        
        #self.W1 = np.zeros((self.inputLayerSize, self.hiddenLayerSize))
        #self.W2 = np.zeros(((self.hiddenLayerSize, self.outputLayerSize)))
        
    def forward(self, X):
        self.z2 = np.dot(X, self.W1) # broadcasting
        self.a2 = sigmoid(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        self.y_hat = sigmoid(self.z3)
        return self.y_hat
    
    
    def costFunctionPrime(self, X, y):
        self.yHat = self.forward(X)
        
        delta3 = np.multiply(-(y-self.yHat), sigmoidPrime(self.z3))
        dJdW2 = np.dot(self.a2.T, delta3)
        

In [106]:
nn = NN(X, y)

In [108]:
nn.forward()

array([[0.69816825],
       [0.68668206],
       [0.70690268]])

In [52]:
np.zeros((2,2), dtype=float)

array([[0., 0.],
       [0., 0.]])

# results

In [36]:
import glob
import os
from pathlib import Path
from collections import defaultdict
import numpy as np

In [48]:
dir_name = "fasttext_simple"
path = f"../results/fasttext/{dir_name}"

results = defaultdict(list)
for file in Path(path).rglob("*.txt"):
    with open(file, "r") as f:
        s = f.read()
        s = s[-6:-1]
        s = s.replace(" ", "")
        s = float(s)
    key = file.stem[len(dir_name)+3:]
    results[key].append(s)
    
means = {}
for k, v in dict(results).items():
    means[k] = np.mean(v)
means_sorted = sorted(means.items(), key=lambda x: x[1], reverse=True)
means_sorted

[('bs128_mf25000_lr0.01', 71.28666666666668),
 ('bs50_mf25000_lr0.01', 71.20666666666666),
 ('bs50_mf50000_lr0.01', 71.09666666666668),
 ('bs128_mf50000_lr0.01', 70.86666666666666),
 ('bs50_mf25000_lr0.001', 70.76666666666667),
 ('bs50_mf50000_lr0.001', 70.66000000000001),
 ('bs128_mf50000_lr0.001', 69.89),
 ('bs128_mf25000_lr0.001', 69.12333333333333),
 ('bs50_mf25000_lr0.1', 65.73333333333333),
 ('bs50_mf50000_lr0.1', 65.61)]