In [138]:
#deal with tensors
import torch   
import pandas as pd
#handling text data
from torchtext import data
from torch import nn   
#import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Use cuda if present
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Device available for running: 
cpu


# Data Processing

In [217]:
SEED = 2022
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f4fcafd2a90>

In [218]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,tweets,labels
0,sending solidarity whoever doctor manage incre...,Stressed
1,need see hair amp beard gat book appointment b...,Anxious
2,next time meet someone new dont ask ask love,Normal
3,surprise someone love give la senza gift box r...,Lonely
4,raise hand junhoes ocean lotion life rent free...,Normal


In [219]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,tweets
0,0,wish luck need start cooking
1,1,ino didnt need weapon need backup cause damn s...
2,2,good thing know walk away
3,3,say fat people get tire fast
4,4,need buy fucken car already


# Scikit-Learn Model

In [220]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['labels'] = le.fit_transform(df['labels'].values)
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)
df.head()

{'Anxious': 0, 'Lonely': 1, 'Normal': 2, 'Stressed': 3}


Unnamed: 0,tweets,labels
0,sending solidarity whoever doctor manage incre...,3
1,need see hair amp beard gat book appointment b...,0
2,next time meet someone new dont ask ask love,2
3,surprise someone love give la senza gift box r...,1
4,raise hand junhoes ocean lotion life rent free...,2


In [293]:
traindf, validdf = train_test_split(df, test_size=0.2, shuffle=True, random_state=SEED)

In [294]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', max_features=200000)
cv.fit_transform(list(traindf['tweets'].values) + list(validdf['tweets'].values))

<29992x21659 sparse matrix of type '<class 'numpy.int64'>'
	with 250427 stored elements in Compressed Sparse Row format>

In [295]:
xtrain = cv.transform(traindf['tweets'].values)
xvalid = cv.transform(validdf['tweets'].values)
ytrain = traindf['labels'].values
yvalid = validdf['labels'].values

In [297]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression(C= 10, multi_class='multinomial', solver='sag')

lr.fit(xtrain, ytrain)
ypred = lr.predict(xvalid)

accuracy_score(yvalid, ypred)




0.6326054342390398

# Pytorch Neural Network 

In [242]:
from torchtext.vocab import build_vocab_from_iterator

def tokens(data_iter):
    for index, row in data_iter.iterrows():
        #tokenize the tweets by removing spaces
        yield row['tweets'].split(" ")

# build a vocabulary dictionary of all words in train.csv with the torchtetxt.vocab module 
vocab = build_vocab_from_iterator(tokens(df), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [240]:
from torch.utils.data import DataLoader
# construct data loaders for training, validation data
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(int(_label))
         processed_text = torch.tensor(vocab(_text.split(" ")), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

def loadData(data, batch_size, type):
  dataArr = []
  for index, row in data.iterrows():
    dataArr.append((row['labels'], row['tweets']))
  loader = DataLoader(dataArr, batch_size = batch_size, shuffle =True, collate_fn=collate_batch)
  return loader

In [305]:
from numpy.ma.core import outer
from torch import nn

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
     
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc1 = nn.Linear(embed_dim, embed_dim)
        self.relu1 = nn.Sigmoid()
        self.fc2 = nn.Linear(embed_dim, num_class)
        self.act = nn.Sigmoid()
      
    def forward(self, text, offsets):

        out= self.embedding(text, offsets)
  
        out = self.fc1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        return self.act(out)

In [306]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [307]:
# Hyperparameters
EPOCHS = 10 # epoch
num_class = 4
vocab_size = len(vocab)
emsize = 64
LR = 5  # learning rate
BATCH_SIZE = 10 # batch size for training

model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
trainloader = loadData(traindf, BATCH_SIZE, 'train')
validloader = loadData(validdf, BATCH_SIZE, 'valid')

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(trainloader)
    accu_val = evaluate(validloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'validation accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 2400 batches | accuracy    0.437
| epoch   1 |  1000/ 2400 batches | accuracy    0.581
| epoch   1 |  1500/ 2400 batches | accuracy    0.648
| epoch   1 |  2000/ 2400 batches | accuracy    0.665
-----------------------------------------------------------
| end of epoch   1 | time:  3.55s | validation accuracy    0.673 
-----------------------------------------------------------
| epoch   2 |   500/ 2400 batches | accuracy    0.684
| epoch   2 |  1000/ 2400 batches | accuracy    0.679
| epoch   2 |  1500/ 2400 batches | accuracy    0.696
| epoch   2 |  2000/ 2400 batches | accuracy    0.716
-----------------------------------------------------------
| end of epoch   2 | time:  3.50s | validation accuracy    0.682 
-----------------------------------------------------------
| epoch   3 |   500/ 2400 batches | accuracy    0.694
| epoch   3 |  1000/ 2400 batches | accuracy    0.710
| epoch   3 |  1500/ 2400 batches | accuracy    0.710
| epoch   3 |  2000/ 2400 batches 

In [280]:
le_name_mapping_1 = dict([(value, key) for key, value in le_name_mapping.items()])
print(le_name_mapping_1)

test_data = test['tweets'].values
def predict(text):
    with torch.no_grad():
        text = torch.tensor(vocab(text.split(" ")))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() 

model = model.to("cpu")
# example of a prediction
print(test_data[9])
print("%s" %le_name_mapping_1[predict(test_data[9])])


{0: 'Anxious', 1: 'Lonely', 2: 'Normal', 3: 'Stressed'}
redecoing vertebreak need buy second one 
Lonely


In [276]:
test.head()

Unnamed: 0,id,tweets
0,0,wish luck need start cooking
1,1,ino didnt need weapon need backup cause damn s...
2,2,good thing know walk away
3,3,say fat people get tire fast
4,4,need buy fucken car already


In [285]:
# Gather predictions for testing data
predictions = []

for tweet in range(len(test_data)):
  pred = predictions.append(le_name_mapping_1[predict(test_data[tweet])])
test['predictions'] = predictions


In [288]:
test.head(10)

Unnamed: 0,id,tweets,predictions
0,0,wish luck need start cooking,Lonely
1,1,ino didnt need weapon need backup cause damn s...,Anxious
2,2,good thing know walk away,Normal
3,3,say fat people get tire fast,Stressed
4,4,need buy fucken car already,Lonely
5,5,asleep yet hello theres work people distress u...,Anxious
6,6,depression isnt always cry sit dark room also ...,Anxious
7,7,c9e23024 battle id need backup lvl 150 proto b...,Lonely
8,8,human need job cant exist amp make art chill cat,Anxious
9,9,redecoing vertebreak need buy second one,Lonely
