In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import time
from tqdm import tqdm

# Загрузим данные

In [None]:
# Read the airline tweets CSV into a DataFrame and preview its first two rows.
tweets_csv = "/kaggle/input/twitter-airline-sentiment/Tweets.csv"
data = pd.read_csv(tweets_csv)
data.head(n=2)

In [None]:
# Model input is the tweet text column; the target is the sentiment label column.
X, y = data["text"], data["airline_sentiment"]

# Мелкая предобработка

In [None]:
import re

# (pattern, replacement) pairs applied in this order to lowercased text.
# NOTE(review): the "'s" rule also rewrites possessives ("delta's" -> "delta is").
_CONTRACTIONS = (
    ("i'm", "i am"),
    ("'re", " are"),
    ("'ll", " will"),
    ("'ve", " have"),
    ("'s", " is"),
)

# Handles may contain letters, digits and underscores; matching letters only
# would leave fragments such as "_1" behind for a mention like "@user_1".
_MENTION_RE = re.compile(r"@[A-Za-z0-9_]*")


def preprocess_text(s):
    """Normalize a single tweet.

    Lowercases the text, expands a few common contractions, removes '#'
    characters (keeping the hashtag word) and strips @mentions.

    Args:
        s: raw tweet text.

    Returns:
        The cleaned text as a new string.
    """
    s = s.lower()
    for pattern, replacement in _CONTRACTIONS:
        s = s.replace(pattern, replacement)
    s = s.replace("#", "")
    return _MENTION_RE.sub("", s)


# Single pass over the series instead of one pass per substitution.
X = X.apply(preprocess_text)

# Class names in index order; the reverse mapping is derived from this list so
# the two lookups always agree.
num_to_sentiment = ['negative', 'neutral', 'positive']
sentiment_to_num = {name: idx for idx, name in enumerate(num_to_sentiment)}

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed so the train/validation/test partition is the same on every run;
# without it the held-out sets change between runs and scores are not comparable.
SPLIT_SEED = 42

# Hold out 10% of the data for testing, then 10% of the remainder for
# validation. Both splits are stratified by the sentiment label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SPLIT_SEED)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SPLIT_SEED)

# Построение словаря

In [None]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

# Tokenize every training tweet and count how often each token occurs.
tokenizer = get_tokenizer('basic_english')
counter = Counter(token for line in X_train for token in tokenizer(line))

# Vocabulary built from the training-set token counts only
# (min_freq=1, max_size=20000).
vocab = Vocab(counter, max_size=20000, min_freq=1)

# Построение списка токенов по тексту

In [None]:
def text_pipeline(x):
    """Tokenize `x` with `tokenizer` and return the list of `vocab` indices of its tokens."""
    indices = []
    for token in tokenizer(x):
        indices.append(vocab[token])
    return indices


def label_pipeline(x):
    """Return the integer class id that `sentiment_to_num` assigns to label `x`."""
    return sentiment_to_num[x]

# Вот тут проблема. Списки токенов в датасете неодинаковой длины

In [None]:
max_len = 200  # keep at most 200 tokens per sample
from torch.utils.data import Dataset


class SentimentDataset(Dataset):
    """Dataset of (token-index array, label id) pairs with fixed-length inputs.

    Every sample is truncated to `max_len` tokens and right-padded with
    `pad_idx`, so all items have the same shape and the default DataLoader
    collate function can stack them into a batch.

    Args:
        X: iterable of texts (converted with np.array).
        y: iterable of sentiment label names (converted with np.array).
        max_len: number of token slots per sample; defaults to the
            module-level `max_len`.
        pad_idx: index used for padding. When None it is looked up as
            vocab['<pad>'] -- assumes the vocabulary reserves a '<pad>'
            entry; TODO confirm for the installed torchtext version.
    """

    def __init__(self, X, y, max_len=max_len, pad_idx=None):
        super().__init__()
        self.X = np.array(X)
        self.y = np.array(y)
        self.max_len = max_len
        self.pad_idx = vocab['<pad>'] if pad_idx is None else pad_idx

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return (int64 array of length max_len, integer label) for sample `idx`."""
        tokens = text_pipeline(self.X[idx])[:self.max_len]
        padded = np.full(self.max_len, self.pad_idx, dtype=np.int64)
        if tokens:  # a text may tokenize to nothing; leave it as all padding
            padded[:len(tokens)] = tokens
        return padded, label_pipeline(self.y[idx])

In [None]:
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm

class LSTMClassifier(nn.Module):
    """Bidirectional LSTM classifier over padded token-index sequences.

    Tokens are embedded into 300-dimensional vectors and fed to a single-layer
    bidirectional LSTM. The forward state at each sequence's last real token
    and the backward state at its first token are concatenated, passed through
    dropout and a two-layer head, and mapped to 3 class logits.

    Args:
        dimension: hidden size of each LSTM direction.
    """

    def __init__(self, dimension=128):
        super().__init__()
        # forward() slices the LSTM output by this size, so it must be stored.
        self.dimension = dimension
        # Per-token embeddings: nn.EmbeddingBag would pool each row into a
        # single vector and leave no sequence for the LSTM. Dense gradients
        # are used because torch.optim.Adam rejects sparse ones.
        self.embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=300)
        self.lstm = nn.LSTM(input_size=300,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(2*dimension, 2*dimension)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(2*dimension, 3)

    def forward(self, text, text_len):
        """Compute class logits for a batch.

        Args:
            text: integer tensor of token indices, batch first
                (batch, seq_len), padded on the right.
            text_len: per-sample count of real tokens (tensor or sequence of
                ints); every entry must be at least 1.

        Returns:
            Tensor of shape (batch, 3) with unnormalized class scores.
        """
        text_emb = self.embedding(text)

        # pack_padded_sequence expects the lengths as a 1-D int64 CPU tensor.
        lengths = torch.as_tensor(text_len, dtype=torch.int64).cpu()
        packed_input = pack_padded_sequence(text_emb, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Forward direction: state at the last real token of each sequence.
        rows = torch.arange(output.size(0), device=output.device)
        last_step = (lengths - 1).to(output.device)
        out_forward = output[rows, last_step, :self.dimension]
        # Backward direction: its final state sits at position 0.
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)

        text_fea = self.drop(out_reduced)
        text_fea = self.act1(self.fc1(text_fea))
        text_fea = self.fc2(text_fea)

        return text_fea

In [None]:
import torch
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# The batches are moved to `device` during training, so the model's parameters
# must live on the same device.
clf = LSTMClassifier().to(device)

In [None]:
from torch.utils.data import DataLoader

# Training split: batches of 16, reshuffled on every pass.
train_dataset = SentimentDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)

# Validation split: batches of 16 in a fixed order.
val_dataset = SentimentDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)

# Test split: batches of 16 in a fixed order.
test_dataset = SentimentDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

In [None]:
from torch import optim
# Cross-entropy objective, optimized with Adam over all classifier parameters
# at a learning rate of 1e-4.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(clf.parameters(), lr=1e-4)

In [None]:
def train_epoch(model, epoch):
    """Train `model` for one pass over `train_loader`.

    Uses the module-level `train_dataset`, `train_loader`, `optimizer`,
    `criterion`, `device` and `vocab`.

    Args:
        model: module called as model(inputs, lengths) that returns class logits.
        epoch: epoch number, used only in the printed messages.

    Returns:
        (model, epoch_loss, epoch_acc), where the loss and accuracy are Python
        floats averaged over len(train_dataset).
    """
    dataset_size = len(train_dataset)
    print(f"Epoch#{epoch}. Train")

    start_time = time.time()
    model.train()

    # Assumes batches are right-padded with the vocabulary's '<pad>' index --
    # TODO confirm against the dataset / collate function in use.
    pad_idx = vocab['<pad>']

    running_loss = 0.0    # sum of per-sample losses
    running_corrects = 0  # number of correct predictions

    for inputs, labels in tqdm( train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)  # move the batch to the target device

        # Real token count per sample (not the batch size). Clamped to 1
        # because packing rejects zero-length sequences; kept on the CPU as
        # pack_padded_sequence requires.
        lengths = (inputs != pad_idx).sum(dim=1).clamp(min=1).cpu()

        optimizer.zero_grad()

        outputs = model(inputs, lengths)
        _, preds = torch.max(outputs, dim=1)
        loss = criterion(outputs, labels)
        loss.backward()   # backpropagate gradients
        optimizer.step()  # optimizer update
        # scheduler.step(loss)  # learning-rate scheduler (disabled)
        running_loss += loss.item()*inputs.size(0)
        # .item() keeps the counter a plain int instead of a device tensor.
        running_corrects += (preds == labels).sum().item()

    epoch_loss = running_loss / dataset_size
    epoch_acc = running_corrects / dataset_size

    print(f'Loss: { epoch_loss } Acc: { epoch_acc }')
    print(f"Epoch#{epoch} (Train) completed. {round(time.time()-start_time,3)}s ")
    return model, epoch_loss, epoch_acc

In [None]:
def valid_epoch(model, epoch):
    """Evaluate `model` on `val_loader` without updating its weights.

    Uses the module-level `val_dataset`, `val_loader`, `criterion`, `device`
    and `vocab`.

    Args:
        model: module called as model(inputs, lengths) that returns class logits.
        epoch: epoch number, used only in the printed messages.

    Returns:
        (model, epoch_loss, epoch_acc), where the loss and accuracy are Python
        floats averaged over len(val_dataset).
    """
    dataset_size = len(val_dataset)
    print(f"Epoch#{epoch}. Validation")
    start_time = time.time()
    model.eval()

    # Assumes batches are right-padded with the vocabulary's '<pad>' index --
    # TODO confirm against the dataset / collate function in use.
    pad_idx = vocab['<pad>']

    running_loss = 0.0    # sum of per-sample losses
    running_corrects = 0  # number of correct predictions

    with torch.no_grad():
        for inputs, labels in tqdm( val_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)  # move the batch to the target device

            # Real token count per sample (not the batch size). Clamped to 1
            # because packing rejects zero-length sequences; kept on the CPU
            # as pack_padded_sequence requires.
            lengths = (inputs != pad_idx).sum(dim=1).clamp(min=1).cpu()

            outputs = model(inputs, lengths)
            _, preds = torch.max(outputs, dim=1)
            loss = criterion(outputs, labels)

            running_loss += loss.item()*inputs.size(0)
            # .item() keeps the counter a plain int instead of a device tensor.
            running_corrects += (preds == labels).sum().item()

    epoch_loss = running_loss / dataset_size
    epoch_acc = running_corrects / dataset_size

    print(f'Loss: { epoch_loss } Acc: { epoch_acc }')
    print(f"Epoch#{epoch} (Validation) completed. {round(time.time()-start_time,3)}s ")
    return model, epoch_loss, epoch_acc

In [None]:
import copy

# Snapshot of the best model seen so far. A deep copy is used because `clf`
# keeps training in place, so a plain reference would always end up holding
# the final weights rather than the best ones.
best_model = copy.deepcopy(clf)
best_acc = 0.0
best_epoch = 1

# Per-epoch metric histories; all four must exist before the loop appends to them.
train_loss_history = []
train_acc_history = []
val_loss_history = []
val_acc_history = []

num_epochs = 10

for epoch in range(1, num_epochs + 1):
    # training
    clf, train_loss, train_acc = train_epoch(clf, epoch)
    train_loss_history.append(train_loss)
    train_acc_history.append(train_acc)
    # validation
    clf, val_loss, val_acc = valid_epoch(clf, epoch)
    val_loss_history.append(val_loss)
    val_acc_history.append(val_acc)

    # Keep the weights of the epoch with the highest validation accuracy.
    if val_acc > best_acc:
        best_acc = val_acc
        best_model = copy.deepcopy(clf)
        best_epoch = epoch
