In [60]:
import os
import re
import string
from functools import lru_cache

import nltk
import nltk.tokenize as tokenize
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from torch.utils.data import Dataset, DataLoader

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saeedzou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saeedzou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\saeedzou\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saeedzou\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
class MSCTD(Dataset):
    """
    Dataset of (image, text, sentiment) samples read from a per-split directory.

    Unless overridden by the ``*_path`` arguments, files are looked up under
    ``<root>/<split>/``: ``english_<split>.txt``, ``sentiment_<split>.txt``,
    ``image_index_<split>.txt`` and the ``image`` folder.

    :param root: root path of the dataset
    :param split: train, dev, test
    :param image_transform: transform for image
    :param text_transform: transform for text
    :param sentiment_transform: transform for sentiment
    :param has_data: dict with boolean keys 'image' and 'text' selecting which
        modalities are returned; ``None`` (the default) means both
    :param text_path: path of the text file
    :param image_path: path of the image folder
    :param sentiment_path: path of the sentiment file
    :param image_index_path: path of the image index file

    :return: ``(image, text, sentiment)``, ``(text, sentiment)`` or
        ``(image, sentiment)`` depending on ``has_data``

    Example:
    >>> from torchvision import transforms
    >>> image_transform = transforms.Compose([
    >>>     transforms.Resize((640, 1280)),
    >>>     transforms.Lambda(lambda x: x.permute(1, 2, 0))
    >>> ])
    >>> text_transform = None
    >>> sentiment_transform = None
    >>> dataset = MSCTD(root='data', split='train', image_transform=image_transform,
    >>>                 text_transform=text_transform, sentiment_transform=sentiment_transform)
    >>> image, text, sentiment = dataset[0]

    """
    def __init__(self, root, split, image_transform=None, text_transform=None, sentiment_transform=None,
                 has_data=None, text_path=None, image_path=None, sentiment_path=None,
                 image_index_path=None):
        # A None sentinel avoids sharing one mutable default dict between calls.
        if has_data is None:
            has_data = {'image': True, 'text': True}
        data_path = os.path.join(root, split)
        default_path = {
            'text': os.path.join(data_path, 'english_' + split + '.txt'),
            'image': os.path.join(data_path, 'image'),
            'sentiment': os.path.join(data_path, 'sentiment_' + split + '.txt'),
            'image_index': os.path.join(data_path, 'image_index_' + split + '.txt'),
        }
        # self.image / self.text double as "modality enabled" flags: None means disabled.
        self.image = [] if has_data['image'] else None
        self.image_transform = image_transform
        self.image_path = image_path if image_path else default_path['image']
        self.text = [] if has_data['text'] else None
        self.text_transform = text_transform
        self.text_path = text_path if text_path else default_path['text']
        self.sentiment_path = sentiment_path if sentiment_path else default_path['sentiment']
        self.image_index_path = image_index_path if image_index_path else default_path['image_index']
        self.sentiment = []
        self.image_index = []
        self.sentiment_transform = sentiment_transform
        self.load_data()

    @staticmethod
    def _parse_index_line(line):
        """Parse one ``[a, b, c]`` line of the image index file into a list of ints."""
        # Strip whitespace first so a missing trailing newline on the last line
        # cannot cost a digit, then drop the surrounding brackets.
        inner = line.strip().strip('[]')
        return [int(token) for token in inner.split(',') if token.strip()]

    def load_data(self):
        """Read sentiments, the image index and (if enabled) the text lines from disk."""
        # np.loadtxt returns a 0-d array for a one-line file; atleast_1d keeps
        # len() and integer indexing working for single-sample splits.
        self.sentiment = np.atleast_1d(np.loadtxt(self.sentiment_path, dtype=int))
        if self.text is not None:
            # NOTE(review): files are opened with the platform default encoding;
            # confirm the dataset encoding and pass encoding=... if it differs.
            with open(self.text_path, 'r') as f:
                self.text = [line.strip() for line in f]
        with open(self.image_index_path, 'r') as f:
            self.image_index = [self._parse_index_line(line) for line in f if line.strip()]

    def __getitem__(self, index):
        """
        Return the sample at ``index`` with the enabled modalities.

        :raises Exception: if neither image nor text is enabled
        """
        image = None
        text = None
        sentiment = self.sentiment[index]
        if self.image is not None:
            imag_path = os.path.join(self.image_path, str(index)+'.jpg')
            # NOTE(review): read_image is not imported anywhere in the visible
            # notebook (presumably torchvision.io.read_image) — confirm and add
            # the import to the imports cell before enabling images.
            image = read_image(imag_path)
            if self.image_transform:
                image = self.image_transform(image)
        if self.text is not None:
            text = self.text[index]
            if self.text_transform:
                text = self.text_transform(text)
        if self.sentiment_transform:
            sentiment = self.sentiment_transform(sentiment)
        if text is not None and image is not None:
            return image, text, sentiment
        elif text is not None:
            return text, sentiment
        elif image is not None:
            return image, sentiment
        else:
            raise Exception('Either image or text should be not None')

    def __len__(self):
        """Number of samples, i.e. the number of sentiment labels loaded."""
        return len(self.sentiment)

In [62]:
# Text-only configuration: with 'image' disabled the dataset never reads image files.
TEXT_ONLY = {'image': False, 'text': True}

MSCTD_train = MSCTD('./data', 'train', has_data=dict(TEXT_ONLY))
MSCTD_dev = MSCTD('./data', 'dev', has_data=dict(TEXT_ONLY))
MSCTD_test = MSCTD('./data', 'test', has_data=dict(TEXT_ONLY))

In [73]:
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Map POS tag to first character used by WordNetLemmatizer

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet constant; any tag other than
    J/N/V/R falls back to ``wordnet.NOUN``.

    Results are memoised per distinct word: the tagger is called with the word
    alone, so the answer cannot depend on context, and a corpus repeats the
    same words many times.

    :param word: a single token (must be hashable, i.e. a ``str``)
    :return: one of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADV``
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    # Built inside the function so that defining it does not require nltk's
    # wordnet corpus reader to be bound yet.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

class TextPreprocessor:
    """
    Tokenised view over a text dataset.

    Every caption in ``dataset.text`` is preprocessed once in the constructor
    and the token lists are kept in ``self.text``. The wrapped dataset is not
    modified, so building the preprocessor twice on the same dataset is safe
    and the raw captions remain available through the dataset.

    Indexing returns ``(tokens, sentiment)``; the sentiment is read through
    ``dataset[index]``, which must yield a ``(text, sentiment)`` pair.
    """

    # Ordered (pattern, replacement) rules. Irregular forms come first because
    # the generic "<stem>n't" rule would otherwise produce e.g. "ca not".
    _CONTRACTION_RULES = tuple(
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r"\bain't\b", "am not"),
            (r"\bshan't\b", "shall not"),
            (r"\bwon't\b", "will not"),
            (r"\bcan't\b", "can not"),
            (r"\bi'm\b", "i am"),
            # isn't, don't, doesn't, wouldn't, haven't, ... -> "<stem> not"
            (r"\b(\w+)n't\b", r"\1 not"),
            (r"\b's\b", " is"),
            (r"\b're\b", " are"),
            (r"\b've\b", " have"),
            (r"\b'd\b", " would"),
            (r"\b'll\b", " will"),
        )
    )

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self, dataset):
        """
        :param dataset: object exposing ``text`` (list of raw strings) and
            ``__getitem__`` returning ``(text, sentiment)``
        :raises ValueError: if the dataset was loaded without text
        """
        if dataset.text is None:
            raise ValueError("TextPreprocessor requires a dataset loaded with text")
        self.dataset = dataset
        # Keep the tokens here; do not write them back into the dataset.
        self.text = [self.preprocess_text(text) for text in dataset.text]

    @staticmethod
    def expand_contractions(text):
        """
        Replace English contractions with their expanded form.

        Typographic apostrophes (U+2019) are normalised to ``'`` first so that
        both spellings are handled. ``'s`` is always expanded to " is", so
        possessives are expanded too.

        :param text: raw string
        :return: string with contractions expanded
        """
        text = text.replace("\u2019", "'")
        for pattern, replacement in TextPreprocessor._CONTRACTION_RULES:
            text = pattern.sub(replacement, text)
        return text

    def preprocess_text(self, text):
        """
        Turn a raw caption into a list of lemmatised, lower-case tokens.

        Steps: expand contractions, strip ASCII punctuation, lower-case,
        tokenise, drop stop words, lemmatise with a per-token POS guess.

        :param text: raw string
        :return: list of token strings
        """
        # Replace contractions with original text
        text = self.expand_contractions(text)

        # Remove punctuation
        text = text.translate(self._PUNCT_TABLE)

        # Convert to lowercase
        text = text.lower()

        # Tokenize text
        tokens = word_tokenize(text)

        # Remove stop words
        # NOTE(review): the recorded sample output shows "not" being removed
        # here; confirm dropping negations is intended for sentiment analysis.
        tokens = [token for token in tokens if token not in stop_words]

        # Lemmatize tokens
        tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

        return tokens

    def __getitem__(self, index):
        """Return ``(tokens, sentiment)`` for the sample at ``index``."""
        # Only the sentiment is taken from the dataset; the caption comes from
        # the tokens computed in the constructor.
        _, sentiment = self.dataset[index]
        return self.text[index], sentiment

    def __len__(self):
        """Number of preprocessed captions."""
        return len(self.text)

# Wrap the training split; every caption is preprocessed inside the constructor.
text_preprocessor = TextPreprocessor(MSCTD_train)

In [76]:
# Print item i as returned by the dataset and by the preprocessor wrapper.
i = 2000
print(MSCTD_train[i], text_preprocessor[i])

("I mean, that's not what I want.", 1) (['mean', 'want'], 1)


In [80]:
# import counter
from collections import Counter

In [85]:
class Vocabulary:
    """
    Word <-> integer index mapping built from a corpus.

    Each text may be a whitespace-separated string or an iterable of tokens.
    Indices are assigned in order of first appearance. ``fit`` may be called
    more than once; later calls only add unseen words and keep existing
    indices stable.

    Attributes:
        word2idx: dict mapping word -> index
        idx2word: list mapping index -> word
        counter: ``Counter`` of word frequencies over everything fitted so far
        total_words: total number of tokens seen by ``fit``
    """
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = Counter()
        self.total_words = 0

    @staticmethod
    def _tokens(text):
        """Return the tokens of ``text``: split strings, pass token iterables through."""
        return text.split() if isinstance(text, str) else list(text)

    def fit(self, texts):
        """
        Add the words of ``texts`` to the vocabulary.

        :param texts: iterable of strings or token iterables
        """
        for text in texts:
            tokens = self._tokens(text)
            self.counter.update(tokens)
            self.total_words += len(tokens)
            for word in tokens:
                # Only unseen words get a new index, so repeated fit() calls
                # never duplicate entries or shift existing indices.
                if word not in self.word2idx:
                    self.word2idx[word] = len(self.idx2word)
                    self.idx2word.append(word)

    def transform(self, texts):
        """
        Convert texts to lists of word indices.

        Words that are not in the vocabulary are skipped.

        :param texts: iterable of strings or token iterables
        :return: list of index lists, one per input text
        """
        return [
            [self.word2idx[word] for word in self._tokens(text) if word in self.word2idx]
            for text in texts
        ]

# Build the vocabulary from the text half of every preprocessor item.
vocabulary = Vocabulary()
preprocessed_texts = [item_text for item_text, _sentiment in text_preprocessor]
vocabulary.fit(preprocessed_texts)

KeyboardInterrupt: 