In [1]:
import torch
import torch.nn as nn
import string
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
import numpy as np
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
import csv
import spacy
from tqdm import tqdm
from collections import Counter
import math
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
spacy_en = spacy.load("en_core_web_sm")

In [3]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, datalocation):
        self.datalocation = datalocation
        self.data = []
        self.scores = []
        self.vocab = set({'<unk>'})
        self.word2Index = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
        self.index2Word = {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
        self.wordFrequency = dict()
        print("Reading data...")
        self.read_data()
        print("Preprocessing data...")
        self.preprocess()
        print("Handling unknown tokens...")
        self.handle_unkown_token()
        print("TF-IDF...")
        self.tf_idf()
        

    def read_data(self):
        csvreader = csv.reader(open(self.datalocation, 'r'), delimiter='\t')
        for row in csvreader:
            try:
                self.data.append((row[5],row[6]))
                self.scores.append(float(row[4]))
            except:
                continue
        
    def clean_data(self, tokenized_sentence):
        cleaned_sentence = []
        for token in tokenized_sentence:
            if token not in string.punctuation:
                cleaned_sentence.append(token)
    
        return cleaned_sentence
    
    def preprocess(self):
        for idx in tqdm(range(len(self.data))):
            s1 = [tok.text for tok in spacy_en.tokenizer(self.data[idx][0].lower())]
            s2 = [tok.text for tok in spacy_en.tokenizer(self.data[idx][1].lower())]
            s1 = self.clean_data(s1)
            s2 = self.clean_data(s2)
            for token in s1:
                if token not in self.wordFrequency:
                    self.wordFrequency[token] = 1
                else:
                    self.wordFrequency[token] += 1
        
            for token in s2:
                if token not in self.wordFrequency:
                    self.wordFrequency[token] = 1
                else:
                    self.wordFrequency[token] += 1
            
            self.data[idx] = (s1, s2)

        
    def handle_unkown_token(self):
        self.unkown_token = '<unk>'
        self.word2Index[self.unkown_token] = len(self.word2Index)
        self.index2Word[len(self.index2Word)] = self.unkown_token
        print(self.data[0])

        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]

            for i in range(len(s1)):
                word = s1[i]
                if self.wordFrequency[word] < 2:
                    s1[i] = self.unkown_token
            
            for i in range(len(s2)):
                word = s2[i]
                if self.wordFrequency[word] < 2:
                    s2[i] = self.unkown_token
            
            self.data[idx] = (s1, s2)
        
        print(self.data[0])
        
        self.wordFrequency = dict()
        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]

            for token in s1:
                self.vocab.add(token)
                if token not in self.wordFrequency:
                    self.word2Index[token] = len(self.word2Index)
                    self.index2Word[len(self.index2Word)] = token
                    self.wordFrequency[token] = 1
                else:
                    self.wordFrequency[token] += 1
        
            for token in s2:
                self.vocab.add(token)
                if token not in self.wordFrequency:
                    self.word2Index[token] = len(self.word2Index)
                    self.index2Word[len(self.index2Word)] = token
                    self.wordFrequency[token] = 1
                else:
                    self.wordFrequency[token] += 1
            
            s1 = ['<sos>'] + s1 + ['<eos>']
            s2 = ['<sos>'] + s2 + ['<eos>']

            if len(s1) > len(s2):
                s2 = s2 + ['<pad>'] * (len(s1) - len(s2))
            elif len(s1) < len(s2):
                s1 = s1 + ['<pad>'] * (len(s2) - len(s1))
            
            self.data[idx] = (s1, s2)
    
    def tf_idf(self):
        self.df = dict()
        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]
            for token in s1:
                if token not in self.df:
                    self.df[token] = 1
                else:
                    self.df[token] += 1
            for token in s2:
                if token not in self.df:
                    self.df[token] = 1
                else:
                    self.df[token] += 1
        
        self.idf = dict()
        for token in self.df:
            self.idf[token] = math.log(len(self.data) / self.df[token])
        
        self.tf_idf_data = []
        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]
            tf_idf_s1 = np.zeros(len(self.word2Index))
            tf_idf_s2 = np.zeros(len(self.word2Index))

            for token in s1:
                tf_idf_s1[self.word2Index[token]] = self.idf[token]
            for token in s2:
                tf_idf_s2[self.word2Index[token]] = self.idf[token]
            
            self.tf_idf_data.append((tf_idf_s1, tf_idf_s2))
            
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.tf_idf_data[idx]

In [4]:
data = MyDataset('../stsbenchmark/sts-train.csv')

Reading data...
Preprocessing data...


100%|██████████| 5708/5708 [00:01<00:00, 5499.67it/s]


Handling unknown tokens...
(['a', 'plane', 'is', 'taking', 'off'], ['an', 'air', 'plane', 'is', 'taking', 'off'])


100%|██████████| 5708/5708 [00:00<00:00, 264849.68it/s]


(['a', 'plane', 'is', 'taking', 'off'], ['an', 'air', 'plane', 'is', 'taking', 'off'])


100%|██████████| 5708/5708 [00:00<00:00, 120054.80it/s]


TF-IDF...


100%|██████████| 5708/5708 [00:00<00:00, 162989.83it/s]
100%|██████████| 5708/5708 [00:00<00:00, 17163.83it/s]


In [5]:
def cosine_distance(x1, x2):
    return -1 * cosine_similarity(x1, x2)

In [6]:
sentencesL = []
sentencesR = []
scores = []

for i in tqdm(range(len(data))):
    sentencesL.append(np.array(data[i][0]))
    sentencesR.append(np.array(data[i][1]))

sentencesL = np.array(sentencesL)
sentencesR = np.array(sentencesR)

scores = cosine_similarity(sentencesL, sentencesR)

100%|██████████| 5708/5708 [00:00<00:00, 14004.35it/s]


In [7]:
scores_needed = []
for i in range(len(data)):
    scores_needed.append(scores[i][i] * 5)

In [8]:
from scipy.stats import pearsonr

In [10]:
pearsonr(data.scores, scores_needed)

(0.6946474539889476, 0.0)