In [72]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import time
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torch.optim.lr_scheduler import StepLR




In [2]:
# Load the pipe-delimited dataset; later cells read its 'genre' and 'description' columns.
data = pd.read_csv('original_data.txt',sep='|')

In [3]:
# Peek at the genre column (the target the classifier will predict).
data.genre

0               drama 
1            thriller 
2               adult 
3               drama 
4               drama 
             ...      
54209          comedy 
54210          horror 
54211     documentary 
54212          comedy 
54213         history 
Name: genre, Length: 54214, dtype: object

In [4]:
# Tabulate how many rows belong to each genre, ordered by genre name.
category_counts = (
    data['genre']
    .value_counts(sort=False)
    .sort_index()
    .rename_axis('genre')
    .reset_index(name='Count')
)

print(category_counts)

            genre  Count
0         action    1315
1          adult     590
2      adventure     775
3      animation     498
4      biography     265
5         comedy    7447
6          crime     505
7    documentary   13096
8          drama   13613
9         family     784
10       fantasy     323
11     game-show     194
12       history     243
13        horror    2204
14         music     731
15       musical     277
16       mystery     319
17          news     181
18    reality-tv     884
19       romance     672
20        sci-fi     647
21         short    5073
22         sport     432
23     talk-show     391
24      thriller    1591
25           war     132
26       western    1032


In [5]:
# Turn each genre string into an integer class id and keep it next to the raw columns.
label_encoder = LabelEncoder().fit(data['genre'])
data['label'] = label_encoder.transform(data['genre'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)
print(f'train_data len: {len(train_data)} \ntest_data len: {len(test_data)}')


train_data len: 43371 
test_data len: 10843


In [6]:
def iterate_rows(df):
    """Yield ``(description, label)`` tuples, one per row of ``df``, in row order.

    The two columns are iterated directly instead of going through
    ``DataFrame.iterrows``: that method builds a Series for every row, which is
    slow and may upcast values to a common dtype.

    Args:
        df: DataFrame with a ``'description'`` and a ``'label'`` column.

    Yields:
        tuple: ``(description, label)`` for each row.

    Raises:
        KeyError: on first iteration, if either column is missing.
    """
    yield from zip(df['description'], df['label'])

In [7]:
# Sanity check: show the first (description, label) pair of the training split.
next(iterate_rows(train_data))

(' Sex. Betrayal. Seduction. Manipulation. Addiction. Love. Hate. Vanity. Obsession. Words that brand each of the 8 characters of SHADE. Linda, Ty, Aurora, Delilah, Cassius, D-Low, Sega and John. Each a master at presenting carefully woven personas and each orchestrating solitary gambits of depravity. SHADE transcends convention, decorum and time, plunging the viewer down to the abyss in which we all reside but few embrace. Between the darkness and the light exists the shade. In that shade lives our characters.',
 8)

In [8]:
def yield_tokens(data_iter):
    """Yield the token list of each text in ``data_iter``; the paired labels are ignored."""
    yield from (tokenizer(text) for text, _ in data_iter)


# torchtext tokenizer selected by the 'basic_english' name.
tokenizer = get_tokenizer('basic_english')
train_iter = iterate_rows(train_data)

# The vocabulary is built from the training texts only; "<unk>" is registered as a
# special token and becomes the index returned for any token not in the vocabulary.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [9]:
# Sanity check: look up the vocabulary indices of a few tokens.
vocab(['my', 'name', 'is', 'mohammed'])


[593, 349, 9, 13213]

In [17]:
def text_pipeline(x):
    """Tokenize the raw text ``x`` and return the list of its vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert the label value ``x`` to a plain Python ``int``."""
    return int(x)

In [66]:
# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    """Pack a list of ``(text, label)`` samples into flat tensors for ``nn.EmbeddingBag``.

    Args:
        batch: iterable of ``(text, label)`` pairs.

    Returns:
        tuple: ``(tokens, labels, offsets)`` on ``device`` where ``tokens`` is the
        concatenation of every sample's token ids, ``labels`` holds one int64
        label per sample, and ``offsets[i]`` is the position in ``tokens`` where
        sample ``i`` starts.
    """
    labels, token_chunks, lengths = [], [], []
    for _text, _label in batch:
        labels.append(label_pipeline(_label))
        token_ids = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        token_chunks.append(token_ids)
        lengths.append(token_ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Start positions are the running sum of the preceding lengths: the first
    # sample starts at 0 and the last length is never needed.
    offset_tensor = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    token_tensor = torch.cat(token_chunks)

    return token_tensor.to(device), label_tensor.to(device), offset_tensor.to(device)



# A bare generator is neither a map-style dataset (no __len__/__getitem__) nor an
# IterableDataset, so DataLoader cannot iterate over it. Materialize the rows into
# a map-style dataset first.
train_iter = iterate_rows(train_data)
dataloader = DataLoader(
    to_map_style_dataset(train_iter),
    batch_size=8,
    shuffle=False,
    collate_fn=collate_batch,
)

In [86]:
# Model definition (the loss function and optimizer are created in a later cell).
class TextClassificationModel(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` followed by one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
        num_class: number of output scores produced per sample.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(
            vocab_size,
            embed_dim,
            sparse=False,
            include_last_offset=False,
        )
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        for weight in (self.embedding.weight, self.fc.weight):
            weight.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Return one row of class scores per bag.

        Args:
            text: flat tensor of token indices for the whole batch.
            offsets: start position of each bag inside ``text``.
        """
        # EmbeddingBag reduces each bag to one vector (its default reduction mode is used).
        bag_vectors = self.embedding(text, offsets)
        scores = self.fc(bag_vectors)
        return scores


    
train_iter = iterate_rows(train_data)  # fresh generator over the training rows
num_class = len(label_encoder.classes_)  # one output score per distinct encoded genre
vocab_size = len(vocab)  # the embedding table gets one row per vocabulary entry
emsize = 64  # embedding width passed to the model as embed_dim
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

# model = TextClassificationModel(vocab_size, emsize, num_class)

# scheduler = StepLR(optimizer, step_size=5, gamma=0.5)



def train(dataloader, epoch=None):
    """Run one training pass over ``dataloader``, printing windowed accuracy.

    Uses the notebook-level ``model``, ``optimizer`` and ``criterion``.

    Args:
        dataloader: yields ``(text, label, offsets)`` batches; must support
            ``len()`` for the progress message.
        epoch: epoch number shown in the progress message. When omitted, the
            notebook-level ``epoch`` variable is used if it exists (0 otherwise),
            so existing ``train(dataloader)`` calls inside the epoch loop keep
            printing the same thing while a standalone call no longer raises
            ``NameError``.
    """
    if epoch is None:
        epoch = globals().get("epoch", 0)

    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (text, label, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Cap the total gradient norm at 0.1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            # Accuracy is reported per logging window, so restart the counters.
            total_acc, total_count = 0, 0


def evaluate(dataloader):
    """Return the fraction of samples in ``dataloader`` that the model classifies correctly.

    Uses the notebook-level ``model``, which is switched to eval mode. Gradients
    are disabled for the whole pass. No loss is computed because only the
    accuracy is reported.

    Args:
        dataloader: yields ``(text, label, offsets)`` batches.

    Returns:
        float: correct predictions divided by the number of samples seen.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for text, label, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count


In [89]:
# Hyperparameters
EPOCHS = 20  # number of passes over the training split
LR = 5  # initial SGD learning rate
BATCH_SIZE = 64  # samples per batch for every loader below
SPLIT_SEED = 42  # seed for the train/validation split, so reruns use the same rows

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Each scheduler.step() call multiplies the learning rate by gamma (step_size of 1).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
total_accu = None  # best validation accuracy seen so far
train_iter, test_iter = iterate_rows(train_data), iterate_rows(test_data)
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Keep 95% of the training rows for fitting and 5% for validation. The explicit
# generator makes the split reproducible across kernel restarts.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
# Accuracy does not depend on sample order, so the evaluation loaders are not shuffled.
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Decay the learning rate when validation accuracy falls below the best value
    # so far; otherwise record the new best.
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

| epoch   1 |   500/  644 batches | accuracy    0.409
-----------------------------------------------------------
| end of epoch   1 | time:  7.57s | valid accuracy    0.431 
-----------------------------------------------------------
| epoch   2 |   500/  644 batches | accuracy    0.459
-----------------------------------------------------------
| end of epoch   2 | time:  7.68s | valid accuracy    0.464 
-----------------------------------------------------------
| epoch   3 |   500/  644 batches | accuracy    0.491
-----------------------------------------------------------
| end of epoch   3 | time:  7.82s | valid accuracy    0.491 
-----------------------------------------------------------
| epoch   4 |   500/  644 batches | accuracy    0.519
-----------------------------------------------------------
| end of epoch   4 | time:  7.95s | valid accuracy    0.499 
-----------------------------------------------------------
| epoch   5 |   500/  644 batches | accuracy    0.540
------

In [90]:
# Final check on the held-out test split, which was not used for training or validation.
print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader)
print("test accuracy {:8.3f}".format(accu_test))

Checking the results of test dataset.
test accuracy    0.562


In [105]:
# Map each encoded class id back to its genre name. The raw genre strings carry
# surrounding whitespace, so strip it to get clean names for display.
class_dict = {
    class_id: str(genre).strip()
    for class_id, genre in enumerate(label_encoder.classes_)
}

def predict(text, text_pipeline):
    """Return the predicted class id for a single raw ``text``.

    Uses the notebook-level ``model``. Tensors are created on whatever device the
    model's parameters live on, and with an explicit int64 dtype so that a text
    producing no tokens still yields a valid (empty) index tensor instead of a
    float one.

    Args:
        text: raw input string.
        text_pipeline: callable turning ``text`` into a list of vocabulary indices.

    Returns:
        int: index of the highest-scoring class.
    """
    model_device = next(model.parameters()).device
    with torch.no_grad():
        token_ids = torch.tensor(
            text_pipeline(text), dtype=torch.int64, device=model_device
        )
        # A single bag starting at position 0 covers the whole text.
        offsets = torch.tensor([0], dtype=torch.int64, device=model_device)
        output = model(token_ids, offsets)
        return output.argmax(1).item()


# Free-text plot summary used as a smoke test for the trained model.
ex_text_str = "Three buddies wake up from a bachelor party in Las Vegas, with no memory of the previous night and the bachelor missing. They make their way around the city in order to find their friend before his wedding."

# Run the example on the CPU.
model = model.to("cpu")

# Translate the predicted class id back to its genre name for display.
print("This is a %s movie" % class_dict[predict(ex_text_str, text_pipeline)])

This is a  comedy  movie
