---

# Семантическая классификация твитов


In [None]:
!gdown https://drive.google.com/uc?id=1eE1FiUkXkcbw0McId4i7qY-L8hH-_Qph&export=download
!unzip archive.zip

In [None]:
import math
import random
import string

import numpy as np
import pandas as pd
import seaborn as sns

import torch
import torch.nn as nn 
import nltk
import gensim
import gensim.downloader as api

In [None]:
# Seed every random number generator the notebook touches (Python, NumPy,
# PyTorch CPU and CUDA) with the same value.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.random.manual_seed,
    torch.cuda.random.manual_seed,
    torch.cuda.random.manual_seed_all,
):
    _seed_rng(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the tweet CSV. The file is read without a header row, so the six column
# names are supplied explicitly.
data = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="latin", header=None, names=["emotion", "id", "date", "flag", "user", "text"])

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print ten randomly sampled tweets, one per line, to get a feel for the raw text.
examples = data["text"].sample(10)
print("\n".join(examples))

In [None]:
# Shuffle the row positions and hold out the last 20% of them as the test set.
indexes = np.arange(data.shape[0])
np.random.shuffle(indexes)
dev_size = math.ceil(data.shape[0] * 0.8)

dev_indexes = indexes[:dev_size]
test_indexes = indexes[dev_size:]

# Chain reset_index instead of mutating the iloc result in place. Later cells
# address rows as 0..n-1, so both frames need a fresh RangeIndex.
dev_data = data.iloc[dev_indexes].reset_index(drop=True)
test_data = data.iloc[test_indexes].reset_index(drop=True)

## Text processing

In [None]:
# Tokenize the first dev tweet (lower-cased) with NLTK's WordPunctTokenizer and
# print the tokens separated by spaces.
tokenizer = nltk.WordPunctTokenizer()
line = tokenizer.tokenize(dev_data["text"][0].lower())
print(" ".join(line))

In [None]:
# Keep only tokens longer than three characters that contain no punctuation.
filtered_line = [
    token
    for token in line
    if len(token) > 3 and not any(ch in string.punctuation for ch in token)
]
print(*filtered_line)

Model loading

In [None]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
word2vec = api.load("word2vec-google-news-300")

In [None]:
# Embed every filtered token that the model knows and print the shape of the
# element-wise sum of those vectors.
# NOTE(review): sum() of an empty list is the int 0, so `.shape` raises when
# none of the tokens is in the vocabulary.
emb_line = [word2vec.get_vector(w) for w in filtered_line if w in word2vec]
print(sum(emb_line).shape)

Embeddings normalization

In [None]:
# Standardize embeddings with per-dimension mean/std computed over the model's
# whole vector table (axis 0).
mean = np.mean(word2vec.vectors, 0)
std = np.std(word2vec.vectors, 0)
# The len(w) > 3 test repeats the length filter already applied to filtered_line.
norm_emb_line = [(word2vec.get_vector(w) - mean) / std for w in filtered_line if w in word2vec and len(w) > 3]
print(sum(norm_emb_line).shape)
# For each token, report whether the normalized vector equals the raw one element-wise.
print([all(norm_emb_line[i] == emb_line[i]) for i in range(len(emb_line))])

In [None]:
from torch.utils.data import Dataset, random_split


class TwitterDataset(Dataset):
    """Map-style dataset that turns a tweet into a matrix of standardized word embeddings.

    Each item is a dict with:
      * "feature": array of shape (n_tokens, vector_size) with one standardized
        embedding per kept token, or a single all-zero row when no token is kept;
      * "target": 0 when the raw label equals 0, otherwise 1.

    Args:
        data: frame holding the tweets and their labels.
        feature_column: name of the column with the tweet text.
        target_column: name of the column with the raw label.
        word2vec: embedding model exposing `vectors`, `vector_size`,
            `get_vector` and `in` membership tests.
    """

    def __init__(self, data: pd.DataFrame, feature_column: str, target_column: str,
                 word2vec: "gensim.models.KeyedVectors"):
        self.tokenizer = nltk.WordPunctTokenizer()

        self.data = data

        self.feature_column = feature_column
        self.target_column = target_column

        self.word2vec = word2vec

        # Per-dimension statistics over the whole vector table, used to
        # standardize every embedding.
        self.mean = np.mean(word2vec.vectors, axis=0)
        self.std = np.std(word2vec.vectors, axis=0)

    @staticmethod
    def label2num(label):
        """Map the raw label to a class index: 0 stays 0, anything else becomes 1.

        Defined as a method rather than a lambda attribute so that dataset
        instances stay picklable for DataLoader worker processes.
        """
        return 0 if label == 0 else 1

    def __getitem__(self, item):
        """Return the embeddings and class index of the tweet at position `item`."""
        # Positional lookup: `item` is a 0-based position supplied by the
        # sampler, which only matches the index labels after a reset_index.
        text = self.data[self.feature_column].iloc[item]
        label = self.label2num(self.data[self.target_column].iloc[item])

        tokens = self.get_tokens_(text)
        embeddings = self.get_embeddings_(tokens)

        return {"feature": embeddings, "target": label}

    def get_tokens_(self, text):
        """Lower-case and tokenize `text`, keeping punctuation-free tokens longer than 3 characters."""
        line = self.tokenizer.tokenize(text.lower())
        filtered_line = [w for w in line if all(c not in string.punctuation for c in w) and len(w) > 3]

        return filtered_line

    def get_embeddings_(self, tokens):
        """Return standardized embeddings of the in-vocabulary tokens.

        Returns:
            Array of shape (n_kept_tokens, vector_size), or a (1, vector_size)
            array of zeros when no token is kept.
        """
        # Use the model stored on the instance, not a module-level global.
        embeddings = [(self.word2vec.get_vector(w) - self.mean) / self.std for w in tokens
                      if w in self.word2vec and len(w) > 3]

        if len(embeddings) == 0:
            return np.zeros((1, self.word2vec.vector_size))

        # A non-empty list of 1-D vectors always stacks into a 2-D array.
        return np.array(embeddings)

    def __len__(self):
        """Number of tweets in the underlying frame."""
        return self.data.shape[0]

In [None]:
# Wrap the dev frame: tweets come from the "text" column, labels from "emotion".
dev = TwitterDataset(dev_data, "text", "emotion", word2vec)

## Average embedding 

In [None]:
# Pick every 1000th position of a random permutation of the dev set.
indexes = np.arange(len(dev))
np.random.shuffle(indexes)
example_indexes = indexes[::1000]

# Fetch each sampled item once: every dev[i] call tokenizes and embeds the
# tweet again, so indexing separately for features and targets doubles the work.
_example_items = [dev[i] for i in example_indexes]
examples = {"features": [np.sum(item["feature"], axis=0) for item in _example_items],
            "targets": [item["target"] for item in _example_items]}
print(len(examples["features"]))

In [None]:
from sklearn.decomposition import PCA


# Project the summed tweet embeddings onto two principal components for plotting.
pca = PCA(n_components=2)      
examples["transformed_features"] = pca.fit_transform(examples['features'])

In [None]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
# Configure Bokeh to render its plots inline in the notebook.
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Scatter-plot 2-D points with Bokeh and return the figure.

    Args:
        x, y: point coordinates.
        radius: marker size passed to `scatter`.
        alpha: marker opacity.
        color: value stored in the "color" column of the data source.
        width, height: figure dimensions.
        show: when true, display the figure before returning it.
        **kwargs: extra data-source columns; each one is also listed in the
            hover tooltip under its own name.

    Returns:
        The Bokeh figure.
    """
    columns = {'x': x, 'y': y, 'color': color}
    columns.update(kwargs)
    data_source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    hover = bm.HoverTool(tooltips=[(name, "@" + name) for name in kwargs])
    fig.add_tools(hover)
    if show:
        pl.show(fig)
    return fig

In [None]:
# Scatter the 2-D projections; target 0 is drawn in red and target 1 in blue.
draw_vectors(
    examples["transformed_features"][:, 0], 
    examples["transformed_features"][:, 1], 
    color=[["red", "blue"][t] for t in examples["targets"]]
    )

In [None]:
from torch.utils.data import DataLoader


# Mini-batch size and DataLoader worker count shared by the loaders in this notebook.
batch_size = 1024
num_workers = 4

def average_emb(batch):
    """Collate function: represent every tweet by the mean of its token embeddings.

    Args:
        batch: list of dataset items, each a dict with a "feature" array of
            shape (n_tokens, vector_size) and an integer "target".

    Returns:
        Dict with "features" (float32 tensor holding one averaged embedding per
        item) and "targets" (int64 tensor of labels).
    """
    # Stack into a single ndarray first: building a tensor straight from a list
    # of ndarrays is very slow and makes PyTorch emit a UserWarning.
    features = np.array([np.mean(item["feature"], axis=0) for item in batch], dtype=np.float32)
    targets = [item["target"] for item in batch]

    return {"features": torch.from_numpy(features), "targets": torch.LongTensor(targets)}


# Split the dev dataset 80/20 into training and validation subsets.
train_size = math.ceil(len(dev) * 0.8)

train, valid = random_split(dev, [train_size, len(dev) - train_size])

# Training batches are shuffled and an incomplete last batch is dropped;
# validation keeps every sample in its original order.
train_loader = DataLoader(train, batch_size=batch_size, num_workers=num_workers, shuffle=True, drop_last=True, collate_fn=average_emb)
valid_loader = DataLoader(valid, batch_size=batch_size, num_workers=num_workers, shuffle=False, drop_last=False, collate_fn=average_emb)

In [None]:
from tqdm.notebook import tqdm


def training(model, optimizer, criterion, train_loader, epoch, device="cpu"):
    """Run one optimization pass over `train_loader`.

    Args:
        model: network to train; switched to train mode.
        optimizer: optimizer that updates the model parameters.
        criterion: loss taking (predictions, targets).
        train_loader: iterable of batches with "features" and "targets" tensors.
        epoch: zero-based epoch number, used only in the progress-bar label.
        device: device the batch tensors are moved to.
    """
    pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}. Train Loss: {0}")
    model.train()
    for batch in pbar:
        features = batch["features"].to(device)
        targets = batch["targets"].to(device)

        # backward() adds to the existing .grad buffers, so they must be cleared
        # before every step; otherwise gradients from all earlier batches pile up.
        optimizer.zero_grad()
        preds = model(features)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()

        pbar.set_description(f"Epoch {epoch + 1}. Train Loss: {loss.item():.4}")
    

def testing(model, criterion, test_loader, device="cpu"):
    """Evaluate `model` on `test_loader` without tracking gradients.

    Args:
        model: network to evaluate; switched to eval mode.
        criterion: loss taking (predictions, targets).
        test_loader: iterable of batches with "features" and "targets" tensors.
        device: device the batch tensors are moved to.

    Returns:
        Dict with "Test Loss" and "Test Acc", both averaged per sample.
    """
    pbar = tqdm(test_loader, desc=f"Test Loss: {0}, Test Acc: {0}")
    total_loss = 0.0
    total_correct = 0
    total_seen = 0
    model.eval()
    with torch.no_grad():
        for batch in pbar:
            features = batch["features"].to(device)
            targets = batch["targets"].to(device)

            preds = model(features)
            loss = criterion(preds, targets)
            batch_len = len(targets)
            correct = torch.sum(torch.argmax(preds, dim=1) == targets).item()

            # Weight by batch size so a smaller final batch does not count as
            # much as a full one. Assumes `criterion` returns the batch mean
            # (the default reduction of nn.CrossEntropyLoss).
            total_loss += loss.item() * batch_len
            total_correct += correct
            total_seen += batch_len

            pbar.set_description(f"Test Loss: {loss.item():.4}, Test Acc: {correct / batch_len:.4}")

    # Guard against an empty loader instead of dividing by zero.
    mean_loss = total_loss / max(total_seen, 1)
    mean_acc = total_correct / max(total_seen, 1)
    pbar.set_description(f"Test Loss: {mean_loss:.4}, Test Acc: {mean_acc:.4}")

    return {"Test Loss": mean_loss, "Test Acc": mean_acc}

In [None]:
from torch.optim import Adam
import torch.nn as nn

class NLP_model(nn.Module):
    """Feed-forward classifier: vector_size -> 200 -> 100 -> 50 -> num_classes with ReLU between layers.

    The output is the raw result of the last linear layer (no softmax is applied).
    """

    def __init__(self, vector_size, num_classes):
        super().__init__()
        widths = [vector_size, 200, 100, 50]
        stack = []
        # One Linear + ReLU pair per hidden layer, created in input-to-output order.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], num_classes))
        self.layer_1 = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to a batch of input vectors."""
        out = self.layer_1(x)
        return out

# Hyperparameters for the averaged-embedding baseline.
vector_size = dev.word2vec.vector_size
num_classes = 2
lr = 1e-4
num_epochs = 1

model = NLP_model(vector_size, num_classes)
# Move the model to the device selected earlier; a hard-coded .cuda() raises
# on machines without a GPU.
model = model.to(device)
criterion = nn.CrossEntropyLoss()  # loss over the two-class logits
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train for num_epochs; after each epoch evaluate on the validation loader and
# save the weights whenever the validation loss improves.
best_metric = np.inf
for e in range(num_epochs):
    training(model, optimizer, criterion, train_loader, e, device)
    log = testing(model, criterion, valid_loader, device)
    print(log)
    if log["Test Loss"] < best_metric:
        torch.save(model.state_dict(), "model.pt")
        best_metric = log["Test Loss"]

In [None]:
# Build a loader over the held-out test split (no shuffling, every sample kept).
test_loader = DataLoader(
    TwitterDataset(test_data, "text", "emotion", word2vec), 
    batch_size=batch_size, 
    num_workers=num_workers, 
    shuffle=False,
    drop_last=False, 
    collate_fn=average_emb)

# Restore the checkpoint written by the training loop before evaluating.
model.load_state_dict(torch.load("model.pt", map_location=device))

print(testing(model, criterion, test_loader, device=device))

## Embeddings for unknown words 

In [None]:
from torch.utils.data import Dataset, random_split


class TwitterDataset_2(Dataset):
    """Tweet dataset that stands in context embeddings for out-of-vocabulary words.

    Each item is a dict with:
      * "feature": array of shape (n_rows, vector_size) of standardized
        embeddings, or a single all-zero row when nothing can be embedded;
      * "target": 0 when the raw label equals 0, otherwise 1.

    Args:
        data: frame holding the tweets and their labels.
        feature_column: name of the column with the tweet text.
        target_column: name of the column with the raw label.
        word2vec: embedding model exposing `vectors`, `vector_size`,
            `get_vector` and `in` membership tests.
    """

    def __init__(self, data: pd.DataFrame, feature_column: str, target_column: str,
                 word2vec: "gensim.models.KeyedVectors"):
        self.tokenizer = nltk.WordPunctTokenizer()

        self.data = data

        self.feature_column = feature_column
        self.target_column = target_column

        self.word2vec = word2vec

        # Per-dimension statistics over the whole vector table, used to
        # standardize every embedding.
        self.mean = np.mean(word2vec.vectors, axis=0)
        self.std = np.std(word2vec.vectors, axis=0)

    @staticmethod
    def label2num(label):
        """Map the raw label to a class index: 0 stays 0, anything else becomes 1.

        Defined as a method rather than a lambda attribute so that dataset
        instances stay picklable for DataLoader worker processes.
        """
        return 0 if label == 0 else 1

    def __getitem__(self, item):
        """Return the embeddings and class index of the tweet at position `item`."""
        # Positional lookup: `item` is a 0-based position supplied by the
        # sampler, which only matches the index labels after a reset_index.
        text = self.data[self.feature_column].iloc[item]
        label = self.label2num(self.data[self.target_column].iloc[item])

        tokens = self.get_tokens_(text)
        embeddings = self.get_embeddings_(tokens)

        return {"feature": embeddings, "target": label}

    def get_tokens_(self, text):
        """Lower-case and tokenize `text`, keeping punctuation-free tokens of at least 3 characters."""
        line = self.tokenizer.tokenize(text.lower())
        filtered_line = [w for w in line if all(c not in string.punctuation for c in w) and len(w) >= 3]

        return filtered_line

    def get_embeddings_(self, tokens):
        """Embed `tokens`, replacing each unknown token by the embeddings of its context.

        For every token, in order:
          * known and longer than 3 characters: its own standardized embedding
            is appended;
          * not in the vocabulary: the embeddings of all other known tokens
            longer than 3 characters are appended in its place;
          * known but exactly 3 characters long: nothing is appended.

        NOTE(review): get_tokens_ keeps 3-character tokens while this method
        only embeds tokens longer than 3 — confirm the mismatch is intended.

        Returns:
            Array of shape (n_rows, vector_size), or a (1, vector_size) array of
            zeros when nothing was appended.
        """
        # The context of an unknown token is every embeddable token of the
        # tweet (the unknown token itself can never be part of it), so it is
        # the same list for each unknown token and is computed once.
        known = [(self.word2vec.get_vector(w) - self.mean) / self.std for w in tokens
                 if w in self.word2vec and len(w) > 3]

        embeddings = []
        next_known = 0  # position in `known` of the next embeddable token
        for w in tokens:
            if w in self.word2vec:
                if len(w) > 3:
                    embeddings.append(known[next_known])
                    next_known += 1
            else:
                embeddings.extend(known)

        if len(embeddings) == 0:
            return np.zeros((1, self.word2vec.vector_size))

        # A non-empty list of 1-D vectors always stacks into a 2-D array.
        return np.array(embeddings)

    def __len__(self):
        """Number of tweets in the underlying frame."""
        return self.data.shape[0]

In [None]:
# Dev dataset for the unknown-word experiment, built from the same frame and columns.
dev_2 = TwitterDataset_2(dev_data, "text", "emotion", word2vec)

In [None]:
# Split dev_2 80/20 into training and validation subsets.
train_size_2 = math.ceil(len(dev_2) * 0.8)

train_2, valid_2 = random_split(dev_2, [train_size_2, len(dev_2) - train_size_2])

# Training batches are shuffled and an incomplete last batch is dropped;
# validation keeps every sample in its original order.
train_loader_2 = DataLoader(train_2, batch_size=batch_size, num_workers=num_workers, shuffle=True, drop_last=True, collate_fn=average_emb)
valid_loader_2 = DataLoader(valid_2, batch_size=batch_size, num_workers=num_workers, shuffle=False, drop_last=False, collate_fn=average_emb)

In [None]:
# Hyperparameters for the unknown-word experiment.
vector_size = dev_2.word2vec.vector_size
num_classes = 2
lr = 1e-4
num_epochs = 1

model_2 = NLP_model(vector_size, num_classes)
# Move the model to the device selected earlier; a hard-coded .cuda() raises
# on machines without a GPU.
model_2 = model_2.to(device)
criterion = nn.CrossEntropyLoss()
optimizer_2 = torch.optim.Adam(model_2.parameters(), lr=lr)

In [None]:
# Train model_2 for num_epochs; after each epoch evaluate on valid_loader_2 and
# save the weights whenever the validation loss improves.
best_metric = np.inf
for e in range(num_epochs):
    training(model_2, optimizer_2, criterion, train_loader_2, e, device)
    log = testing(model_2, criterion, valid_loader_2, device)
    print(log)
    if log["Test Loss"] < best_metric:
        torch.save(model_2.state_dict(), "model.pt")
        best_metric = log["Test Loss"]

In [None]:
# Evaluate on features built by the same dataset class model_2 was trained on
# (TwitterDataset_2); using TwitterDataset here would feed the model inputs
# without the unknown-word handling it saw during training.
test_loader_2 = DataLoader(
    TwitterDataset_2(test_data, "text", "emotion", word2vec),
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    drop_last=False,
    collate_fn=average_emb)

# Restore the checkpoint written by the training loop before evaluating.
model_2.load_state_dict(torch.load("model.pt", map_location=device))

print(testing(model_2, criterion, test_loader_2, device=device))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on the dev tweets with English stop words removed and the
# vocabulary capped at 200 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=200)
# NOTE(review): .toarray() materializes a dense (n_dev_tweets, n_terms) matrix —
# check that it fits in memory.
qzz = vectorizer.fit_transform(dev_data['text']).toarray()

In [None]:
from torch.utils.data import Dataset, random_split


class TwitterDataset(Dataset):
    """Tweet dataset whose standardized word embeddings are scaled by TF-IDF idf weights.

    This redefines the `TwitterDataset` name declared earlier in the notebook.

    Each item is a dict with:
      * "feature": array of shape (n_tokens, vector_size) with one weighted,
        standardized embedding per in-vocabulary token, or a single all-zero
        row when no token is kept;
      * "target": 0 when the raw label equals 0, otherwise 1.

    Args:
        data: frame holding the tweets and their labels.
        feature_column: name of the column with the tweet text.
        target_column: name of the column with the raw label.
        word2vec: embedding model exposing `vectors`, `vector_size`,
            `get_vector` and `in` membership tests.
        token_weights: optional mapping token -> weight. When omitted, the idf
            values of the module-level `vectorizer` (the fitted
            TfidfVectorizer) are used, which raises NameError if that variable
            does not exist.
    """

    def __init__(self, data: pd.DataFrame, feature_column: str, target_column: str,
                 word2vec: "gensim.models.KeyedVectors", token_weights=None):
        self.tokenizer = nltk.WordPunctTokenizer()

        self.data = data

        self.feature_column = feature_column
        self.target_column = target_column

        self.word2vec = word2vec

        # Per-dimension statistics over the whole vector table, used to
        # standardize every embedding.
        self.mean = np.mean(word2vec.vectors, axis=0)
        self.std = np.std(word2vec.vectors, axis=0)

        if token_weights is None:
            # vocabulary_ maps each term to its column; idf_ holds the idf of that column.
            token_weights = {term: float(vectorizer.idf_[col])
                             for term, col in vectorizer.vocabulary_.items()}
        self.token_weights = dict(token_weights)
        # NOTE(review): tokens missing from the weight table get the largest
        # known weight, on the reasoning that terms cut by max_features are
        # rarer than any kept term. This is a design choice — confirm.
        self.default_weight = max(self.token_weights.values(), default=1.0)

    @staticmethod
    def label2num(label):
        """Map the raw label to a class index: 0 stays 0, anything else becomes 1."""
        return 0 if label == 0 else 1

    def __getitem__(self, item):
        """Return the weighted embeddings and class index of the tweet at position `item`."""
        # Positional lookup: `item` is a 0-based position supplied by the sampler.
        text = self.data[self.feature_column].iloc[item]
        label = self.label2num(self.data[self.target_column].iloc[item])

        tokens = self.get_tokens_(text)
        embeddings = self.get_embeddings_(tokens)

        return {"feature": embeddings, "target": label}

    def get_tokens_(self, text):
        """Lower-case and tokenize `text`, keeping punctuation-free tokens longer than 3 characters."""
        line = self.tokenizer.tokenize(text.lower())
        filtered_line = [w for w in line if all(c not in string.punctuation for c in w) and len(w) > 3]

        return filtered_line

    def get_embeddings_(self, tokens):
        """Return standardized embeddings of the known tokens, each scaled by its weight.

        Returns:
            Array of shape (n_known_tokens, vector_size), or a (1, vector_size)
            array of zeros when no token is in the embedding vocabulary.
        """
        # Skip tokens the embedding model does not know; get_vector would raise on them.
        embeddings = [
            self.token_weights.get(w, self.default_weight)
            * (self.word2vec.get_vector(w) - self.mean) / self.std
            for w in tokens
            if w in self.word2vec
        ]

        if len(embeddings) == 0:
            return np.zeros((1, self.word2vec.vector_size))

        # A non-empty list of 1-D vectors always stacks into a 2-D array.
        return np.array(embeddings)

    def __len__(self):
        """Number of tweets in the underlying frame."""
        return self.data.shape[0]

In [None]:
# Dev dataset for the TF-IDF experiment; TwitterDataset here is the class redefined in the preceding cell.
dev_3 = TwitterDataset(dev_data, "text", "emotion", word2vec)

In [None]:
# Split dev_3 80/20 into training and validation subsets.
train_size_3 = math.ceil(len(dev_3) * 0.8)

train_3, valid_3 = random_split(dev_3, [train_size_3, len(dev_3) - train_size_3])

# Training batches are shuffled and an incomplete last batch is dropped;
# validation keeps every sample in its original order.
train_loader_3 = DataLoader(train_3, batch_size=batch_size, num_workers=num_workers, shuffle=True, drop_last=True, collate_fn=average_emb)
valid_loader_3 = DataLoader(valid_3, batch_size=batch_size, num_workers=num_workers, shuffle=False, drop_last=False, collate_fn=average_emb)

In [None]:
# Hyperparameters for the TF-IDF experiment.
vector_size = dev_3.word2vec.vector_size
num_classes = 2
lr = 1e-4
num_epochs = 1

model_3 = NLP_model(vector_size, num_classes)
# Move the model to the device selected earlier; a hard-coded .cuda() raises
# on machines without a GPU.
model_3 = model_3.to(device)
criterion = nn.CrossEntropyLoss()
optimizer_3 = torch.optim.Adam(model_3.parameters(), lr=lr)

In [None]:
# Train model_3 for num_epochs; after each epoch evaluate on valid_loader_3 and
# save the weights whenever the validation loss improves.
best_metric = np.inf
for e in range(num_epochs):
    training(model_3, optimizer_3, criterion, train_loader_3, e, device)
    log = testing(model_3, criterion, valid_loader_3, device)
    print(log)
    if log["Test Loss"] < best_metric:
        torch.save(model_3.state_dict(), "model.pt")
        best_metric = log["Test Loss"]

In [None]:
# Build a loader over the held-out test split (no shuffling, every sample kept).
test_loader_3 = DataLoader(
    TwitterDataset(test_data, "text", "emotion", word2vec), 
    batch_size=batch_size, 
    num_workers=num_workers, 
    shuffle=False,
    drop_last=False, 
    collate_fn=average_emb)

# Restore the checkpoint written by the training loop before evaluating.
model_3.load_state_dict(torch.load("model.pt", map_location=device))

print(testing(model_3, criterion, test_loader_3, device=device))