In [None]:
# config.py

class Config(object):
    embed_size = 300
    hidden_layers = 1
    hidden_size = 64
    output_size = 4
    max_epochs = 2
    hidden_size_linear = 64
    lr = 0.5
    batch_size = 128
    seq_len = None # Sequence length for RNN
    dropout_keep = 0.8
    max_sen_len = 100

In [None]:
!pip install python==3.5.0
!pip install pandas==0.23.4
!pip install numpy==1.15.2
!pip install spacy==2.0.13
!pip install torch==0.4.1.post2
!pip install torchtext==0.3.1


In [None]:
# utils.py

import torch
from torchtext import data
from torchtext.vocab import Vectors
import spacy
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from torchtext import data


class Dataset(object):
    def __init__(self, config):
        self.config = config
        self.train_iterator = None
        self.test_iterator = None
        self.val_iterator = None
        self.vocab = []
        self.word_embeddings = {}

    def parse_label(self, label):
        '''
        Get the actual labels from label string
        Input:
            label (string) : labels of the form '__label__2'
        Returns:
            label (int) : integer value corresponding to label string
        '''
        return int(label.strip()[-1])

    def get_pandas_df(self, filename):
        '''
        Load the data into Pandas.DataFrame object
        This will be used to convert data to torchtext object
        '''
        with open(filename, 'r') as datafile:
            data = [line.strip().split(',', maxsplit=1) for line in datafile]
            data_text = list(map(lambda x: x[1], data))
            data_label = list(map(lambda x: self.parse_label(x[0]), data))

        full_df = pd.DataFrame({"text":data_text, "label":data_label})
        return full_df

    def load_data(self, w2v_file, train_file, test_file, val_file=None):
        '''
        Loads the data from files
        Sets up iterators for training, validation and test data
        Also create vocabulary and word embeddings based on the data

        Inputs:
            w2v_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file
        '''

        NLP = spacy.load('en_core_web_sm')
        tokenizer = lambda sent: [x.text for x in NLP.tokenizer(sent) if x.text != " "]

        # Creating Field for data
        TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text",TEXT),("label",LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df = self.get_pandas_df(train_file)
        train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()]
        train_data = data.Dataset(train_examples, datafields)

        test_df = self.get_pandas_df(test_file)
        test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
        test_data = data.Dataset(test_examples, datafields)

        # If validation file exists, load it. Otherwise get validation data from training data
        if val_file:
            val_df = self.get_pandas_df(val_file)
            val_examples = [data.Example.fromlist(i, datafields) for i in val_df.values.tolist()]
            val_data = data.Dataset(val_examples, datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)

        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)

        print ("Loaded {} training examples".format(len(train_data)))
        print ("Loaded {} test examples".format(len(test_data)))
        print ("Loaded {} validation examples".format(len(val_data)))


def evaluate_model(model, iterator):
    all_preds = []
    all_y = []
    for idx,batch in enumerate(iterator):
        if torch.cuda.is_available():
            x = batch.text.cuda()
        else:
            x = batch.text
        y_pred = model(x)
        predicted = torch.max(y_pred.cpu().data, 1)[1] + 1
        all_preds.extend(predicted.numpy())
        all_y.extend(batch.label.numpy())
    score = accuracy_score(all_y, np.array(all_preds).flatten())
    return score

In [None]:
# model.py

import torch
from torch import nn
import numpy as np
from torch.nn import functional as F


class RCNN(nn.Module):
    def __init__(self, config, vocab_size, word_embeddings):
        super(RCNN, self).__init__()
        self.config = config

        # Embedding Layer
        self.embeddings = nn.Embedding(vocab_size, self.config.embed_size)
        self.embeddings.weight = nn.Parameter(word_embeddings, requires_grad=False)

        # Bi-directional LSTM for RCNN
        self.lstm = nn.LSTM(input_size = self.config.embed_size,
                            hidden_size = self.config.hidden_size,
                            num_layers = self.config.hidden_layers,
                            dropout = self.config.dropout_keep,
                            bidirectional = True)

        self.dropout = nn.Dropout(self.config.dropout_keep)

        # Linear layer to get "convolution output" to be passed to Pooling Layer
        self.W = nn.Linear(
            self.config.embed_size + 2*self.config.hidden_size,
            self.config.hidden_size_linear
        )

        # Tanh non-linearity
        self.tanh = nn.Tanh()

        # Fully-Connected Layer
        self.fc = nn.Linear(
            self.config.hidden_size_linear,
            self.config.output_size
        )

        # Softmax non-linearity
        self.softmax = nn.Softmax()

    def forward(self, x):
        # x.shape = (seq_len, batch_size)
        embedded_sent = self.embeddings(x)
        # embedded_sent.shape = (seq_len, batch_size, embed_size)

        lstm_out, (h_n,c_n) = self.lstm(embedded_sent)
        # lstm_out.shape = (seq_len, batch_size, 2 * hidden_size)

        input_features = torch.cat([lstm_out,embedded_sent], 2).permute(1,0,2)
        # final_features.shape = (batch_size, seq_len, embed_size + 2*hidden_size)

        linear_output = self.tanh(
            self.W(input_features)
        )
        # linear_output.shape = (batch_size, seq_len, hidden_size_linear)

        linear_output = linear_output.permute(0,2,1) # Reshaping fot max_pool

        max_out_features = F.max_pool1d(linear_output, linear_output.shape[2]).squeeze(2)
        # max_out_features.shape = (batch_size, hidden_size_linear)

        max_out_features = self.dropout(max_out_features)
        final_out = self.fc(max_out_features)
        return self.softmax(final_out)

    def add_optimizer(self, optimizer):
        self.optimizer = optimizer

    def add_loss_op(self, loss_op):
        self.loss_op = loss_op

    def reduce_lr(self):
        print("Reducing LR")
        for g in self.optimizer.param_groups:
            g['lr'] = g['lr'] / 2

    def run_epoch(self, train_iterator, val_iterator, epoch):
        train_losses = []
        val_accuracies = []
        losses = []

        # Reduce learning rate as number of epochs increase
        if (epoch == int(self.config.max_epochs/3)) or (epoch == int(2*self.config.max_epochs/3)):
            self.reduce_lr()

        for i, batch in enumerate(train_iterator):
            self.optimizer.zero_grad()
            if torch.cuda.is_available():
                x = batch.text.cuda()
                y = (batch.label - 1).type(torch.cuda.LongTensor)
            else:
                x = batch.text
                y = (batch.label - 1).type(torch.LongTensor)
            y_pred = self.__call__(x)
            loss = self.loss_op(y_pred, y)
            loss.backward()
            losses.append(loss.data.cpu().numpy())
            self.optimizer.step()

            if i % 100 == 0:
                print("Iter: {}".format(i+1))
                avg_train_loss = np.mean(losses)
                train_losses.append(avg_train_loss)
                print("\tAverage training loss: {:.5f}".format(avg_train_loss))
                losses = []

                # Evalute Accuracy on validation set
                val_accuracy = evaluate_model(self, val_iterator)
                print("\tVal Accuracy: {:.4f}".format(val_accuracy))
                self.train()

        return train_losses, val_accuracies

In [None]:
!pip install spacy



In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import spacy
from spacy.cli.download import download
download(model="en")

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2023-09-03 10:47:23--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2023-09-03 10:47:23--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2023-09-03 10:47:24--  https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/

In [None]:
!unzip glove*.zip

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [None]:
%ls

ag_news.test   glove.840B.300d.txt  processed_dev.txt   processed_train.txt
ag_news.train  glove.840B.300d.zip  processed_test.txt  [0m[01;34msample_data[0m/


In [None]:
!pip install --upgrade torchtext




In [None]:
# train.py
import spacy
nlp = spacy.load('en_core_web_sm')
import sys
import torch.optim as optim
from torch import nn
import torch
from torchtext import data
if __name__=='__main__':
    config = Config()
    train_file = 'ag_news.train'
    test_file = 'ag_news.test'
    w2v_file = 'glove.840B.300d.txt'

    dataset = Dataset(config)
    dataset.load_data(w2v_file, train_file, test_file)

    # Create Model with specified optimizer and loss function
    ##############################################################
    model = RCNN(config, len(dataset.vocab), dataset.word_embeddings)
    if torch.cuda.is_available():
        model.cuda()
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=config.lr)
    NLLLoss = nn.NLLLoss()
    model.add_optimizer(optimizer)
    model.add_loss_op(NLLLoss)
    ##############################################################

    train_losses = []
    val_accuracies = []

    for i in range(config.max_epochs):
        print ("Epoch: {}".format(i))
        train_loss,val_accuracy = model.run_epoch(dataset.train_iterator, dataset.val_iterator, i)
        train_losses.append(train_loss)
        val_accuracies.append(val_accuracy)

    train_acc = evaluate_model(model, dataset.train_iterator)
    val_acc = evaluate_model(model, dataset.val_iterator)
    test_acc = evaluate_model(model, dataset.test_iterator)

    print ('Final Training Accuracy: {:.4f}'.format(train_acc))
    print ('Final Validation Accuracy: {:.4f}'.format(val_acc))
    print ('Final Test Accuracy: {:.4f}'.format(test_acc))

Loaded 96000 training examples
Loaded 7600 test examples
Loaded 24000 validation examples




Epoch: 0
Reducing LR


  return self.softmax(final_out)


Iter: 1
	Average training loss: -0.24976
	Val Accuracy: 0.2545


  return self.softmax(final_out)


Iter: 101
	Average training loss: -0.36984
	Val Accuracy: 0.6299


  return self.softmax(final_out)


Iter: 201
	Average training loss: -0.62754
	Val Accuracy: 0.7825


  return self.softmax(final_out)


Iter: 301
	Average training loss: -0.74235
	Val Accuracy: 0.8211


  return self.softmax(final_out)


Iter: 401
	Average training loss: -0.78350
	Val Accuracy: 0.8352


  return self.softmax(final_out)


Iter: 501
	Average training loss: -0.80834
	Val Accuracy: 0.8413


  return self.softmax(final_out)


Iter: 601
	Average training loss: -0.81284
	Val Accuracy: 0.8430


  return self.softmax(final_out)


Iter: 701
	Average training loss: -0.82088
	Val Accuracy: 0.8495


  return self.softmax(final_out)


Epoch: 1
Reducing LR
Iter: 1
	Average training loss: -0.87290
	Val Accuracy: 0.8498


  return self.softmax(final_out)


Iter: 101
	Average training loss: -0.82857
	Val Accuracy: 0.8508


  return self.softmax(final_out)


Iter: 201
	Average training loss: -0.83399
	Val Accuracy: 0.8530


  return self.softmax(final_out)


Iter: 301
	Average training loss: -0.83614
	Val Accuracy: 0.8525


  return self.softmax(final_out)


Iter: 401
	Average training loss: -0.83614
	Val Accuracy: 0.8580


  return self.softmax(final_out)


Iter: 501
	Average training loss: -0.84031
	Val Accuracy: 0.8567


  return self.softmax(final_out)


Iter: 601
	Average training loss: -0.83659
	Val Accuracy: 0.8555


  return self.softmax(final_out)


Iter: 701
	Average training loss: -0.83677
	Val Accuracy: 0.8566


  return self.softmax(final_out)
  return self.softmax(final_out)
  return self.softmax(final_out)


Final Training Accuracy: 0.8569
Final Validation Accuracy: 0.8611
Final Test Accuracy: 0.8614


In [None]:
!pip install torchtext==0.3.1


Collecting torchtext==0.3.1
  Downloading torchtext-0.3.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m920.6 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.2
    Uninstalling torchtext-0.15.2:
      Successfully uninstalled torchtext-0.15.2
Successfully installed torchtext-0.3.1


In [None]:
!pip install torchtext==0.6 torch==1.5

In [None]:
# Manual inspection section
while True:
    custom_text = input("Enter a text to classify (or 'exit' to quit): ")
    if custom_text.lower() == 'exit':
        break

    # Tokenize and preprocess the custom text using the TEXT field
    custom_text = [token.text for token in nlp(custom_text)]

    # Convert tokens to indices using the model's vocabulary
    custom_text_indices = [dataset.vocab.stoi[token] for token in custom_text]

    # Convert to a PyTorch tensor and make a prediction
    with torch.no_grad():
        if torch.cuda.is_available():
            custom_text_tensor = torch.tensor(custom_text_indices, dtype=torch.long).cuda()
        else:
            custom_text_tensor = torch.tensor(custom_text_indices, dtype=torch.long)
        predicted_scores = model(custom_text_tensor.unsqueeze(0))
        predicted_label = torch.argmax(predicted_scores, dim=1).item()

    print(f"Predicted Label: {predicted_label}")


In [None]:
# Manual inspection section
while True:
    custom_text = input("Enter a text to classify (or 'exit' to quit): ")
    if custom_text.lower() == 'exit':
        break

    # Tokenize and preprocess the custom text using the TEXT field
    custom_text = [token.text for token in nlp(custom_text)]

    # Convert tokens to indices using the TEXT field's vocabulary
    custom_text_indices = [dataset.vocab.stoi[token] for token in custom_text]

    # Convert to a PyTorch tensor and make a prediction
    with torch.no_grad():
        if torch.cuda.is_available():
            custom_text_tensor = torch.tensor(custom_text_indices, dtype=torch.long).cuda()
        else:
            custom_text_tensor = torch.tensor(custom_text_indices, dtype=torch.long)
        predicted_scores = model(custom_text_tensor.unsqueeze(0))
        predicted_label = torch.argmax(predicted_scores, dim=1).item()

    print(f"Predicted Label: {predicted_label}")


Enter a text to classify (or 'exit' to quit): exit


In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Sample data
sample_train_articles = ["Artificial Intelligence (AI) has become indispensable across industries, enhancing decision-making, automating tasks, and driving innovation. In healthcare, AI aids diagnosis, while in business, it personalizes marketing. Its versatile applications underscore its significance in today's world.",
                         "Machine learning empowers businesses by analyzing data for insights, optimizing operations, and enhancing customer experiences. Leveraging predictive algorithms, companies can make informed decisions, streamline processes, and gain a competitive edge.",
                         "Work is evolving with remote collaboration, automation, and a flexible workforce. The future promises diverse work arrangements, with technology reshaping job roles and demanding adaptability.",
                         "The metaverse offers immersive digital experiences, creating new business prospects. Virtual reality, digital economies, and unique marketing channels will redefine commerce.",
                         "Climate change poses economic risks, impacting industries from agriculture to insurance. Urgent sustainability measures are crucial for mitigating economic disruptions.",
                         "Renewable energy, led by solar and wind, is revolutionizing global power generation. Clean, sustainable sources are reducing dependence on fossil fuels.",
                         "Innovations in food production and distribution are critical for ensuring global food security. Sustainable practices and technology-driven solutions will address this challenge.",
                         "Advanced transportation technologies, like autonomous vehicles and high-speed transit, are poised to revolutionize mobility, enhancing efficiency and sustainability.",
                         "Technology-driven healthcare is reshaping patient care through telemedicine, AI diagnostics, and personalized treatments. Patient-centric solutions are at the forefront of this transformation.",
                         "Technology is redefining education with online learning, personalized curricula, and lifelong learning opportunities. Accessible, adaptable learning methods are becoming the norm."]

sample_train_titles = ["The Importance of Artificial Intelligence in the Modern World",
         "How to Use Machine Learning to Improve Your Business",
         "The Future of Work: What Does It Look Like?",
         "The Rise of the Metaverse: What It Means for Business",
         "The Impact of Climate Change on the Economy",
         "The Future of Energy: How Renewables Will Power Our Planet",
         "The Future of Food: How We Can Feed the World",
         "The Future of Transportation: How We Can Move People and Goods More Efficiently",
         "The Future of Healthcare: How Technology Will Transform Medicine",
         "The Future of Education: How Technology Will Change the Way We Learn"]

# Tokenize and preprocess the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sample_train_articles)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in sample_train_articles:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_length = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.array(y)

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
from tensorflow.keras.utils import to_categorical

y = to_categorical(y, num_classes=total_words)

model.fit(X, y, epochs=100, verbose=1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f34b01411e0>

In [2]:
# Function to generate the title
def generate_title(text):
    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)
    predicted_index = np.argmax(predicted_probs)

    # Convert the predicted index back to a word
    predicted_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            predicted_word = word
            break
    return predicted_word

# Test the model
test_article = "Innovations in food production and distribution are critical for ensuring global food security. Sustainable practices and technology-driven solutions will address this challenge."
predicted_title = generate_title(test_article)
print("Predicted Title:", predicted_title)


Predicted Title: challenge
