In [4]:
import string
import nltk 
import torch
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
import numpy as np
import numpy as np
import csv
import spacy
from tqdm import tqdm
from collections import Counter
import math
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
spacy_en = spacy.load("en_core_web_sm")
spacy_es = spacy.load("es_core_news_sm")

In [11]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, X_file, Y_file):
        self.X_file = X_file
        self.Y_file = Y_file
        self.data = []
        self.scores = []
        self.ENvocab = set({'<unk>'})
        self.ENword2Index = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
        self.ENindex2Word = {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
        self.ENwordFrequency = dict()
        self.ESvocab = set({'<unk>'})
        self.ESword2Index = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
        self.ESindex2Word = {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
        self.ESwordFrequency = dict()
        print("Reading data...")
        self.read_data()
        print("Preprocessing data...")
        self.preprocess()
        print("Handling unknown tokens...")
        self.handle_unkown_token()
        print("TF-IDF...")
        self.tf_idf()
        

    def read_data(self):
        with open(self.X_file, 'r') as inFile:
            data = inFile.readlines()
            for line in data:
                sentences = line.split('\t')
                self.data.append((sentences[0], sentences[1]))
        
        with open(self.Y_file, 'r') as inFile:
            data = inFile.readlines()
            for line in data:
                self.scores.append(int(line))
        
    def clean_data(self, tokenized_sentence):
        cleaned_sentence = []
        for token in tokenized_sentence:
            if token not in string.punctuation:
                cleaned_sentence.append(token)
    
        return cleaned_sentence
    
    def preprocess(self):
        for idx in tqdm(range(len(self.data))):
            s1 = [tok.text for tok in spacy_en.tokenizer(self.data[idx][0].lower())]
            s2 = [tok.text for tok in spacy_es.tokenizer(self.data[idx][1].lower())]
            s1 = self.clean_data(s1)
            s2 = self.clean_data(s2)
            for token in s1:
                if token not in self.ENwordFrequency:
                    self.ENwordFrequency[token] = 1
                else:
                    self.ENwordFrequency[token] += 1
        
            for token in s2:
                if token not in self.ESwordFrequency:
                    self.ESwordFrequency[token] = 1
                else:
                    self.ESwordFrequency[token] += 1
            
            self.data[idx] = (s1, s2)

        
    def handle_unkown_token(self):
        self.unkown_token = '<unk>'
        self.ENword2Index[self.unkown_token] = len(self.ENword2Index)
        self.ENindex2Word[len(self.ENindex2Word)] = self.unkown_token

        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]

            for i in range(len(s1)):
                word = s1[i]
                if self.ENwordFrequency[word] < 2:
                    s1[i] = self.unkown_token
            
            for i in range(len(s2)):
                word = s2[i]
                if self.ESwordFrequency[word] < 2:
                    s2[i] = self.unkown_token
            
            self.data[idx] = (s1, s2)
        
        self.ENwordFrequency = dict()
        self.ESwordFrequency = dict()
        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]

            for token in s1:
                self.ENvocab.add(token)
                if token not in self.ENwordFrequency:
                    self.ENword2Index[token] = len(self.ENword2Index)
                    self.ENindex2Word[len(self.ENindex2Word)] = token
                    self.ENwordFrequency[token] = 1
                else:
                    self.ENwordFrequency[token] += 1
        
            for token in s2:
                self.ESvocab.add(token)
                if token not in self.ESwordFrequency:
                    self.ESword2Index[token] = len(self.ESword2Index)
                    self.ESindex2Word[len(self.ESindex2Word)] = token
                    self.ESwordFrequency[token] = 1
                else:
                    self.ESwordFrequency[token] += 1
            
            s1 = ['<sos>'] + s1 + ['<eos>']
            s2 = ['<sos>'] + s2 + ['<eos>']

            if len(s1) > len(s2):
                s2 = s2 + ['<pad>'] * (len(s1) - len(s2))
            elif len(s1) < len(s2):
                s1 = s1 + ['<pad>'] * (len(s2) - len(s1))
            
            self.data[idx] = (s1, s2)
    
    def tf_idf(self):
        self.ENtf = dict()
        self.EStf = dict()
        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]
            for token in s1:
                if token not in self.ENtf:
                    self.ENtf[token] = 1
                else:
                    self.ENtf[token] += 1
            for token in s2:
                if token not in self.EStf:
                    self.EStf[token] = 1
                else:
                    self.EStf[token] += 1
        
        self.ENidf = dict()
        self.ESidf = dict()
        for token in self.ENtf:
            self.ENidf[token] = np.log(len(self.data) / self.ENtf[token])
        for token in self.EStf:
            self.ESidf[token] = np.log(len(self.data) / self.EStf[token])
        
        self.tf_idf_data = []
        for idx in tqdm(range(len(self.data))):
            s1 = self.data[idx][0]
            s2 = self.data[idx][1]
            
            s1_tf = np.zeros(len(self.ENword2Index))
            s2_tf = np.zeros(len(self.ESword2Index))

            for token in s1:
                s1_tf[self.ENword2Index[token]] = self.ENidf[token]
            
            for token in s2:
                s2_tf[self.ESword2Index[token]] = self.ESidf[token]
            
            self.tf_idf_data.append((s1_tf, s2_tf))

            
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.tf_idf_data[idx]

In [12]:
data = MyDataset('../sts-2017-en-es/En_Es_STS/STS.input.en-es.train.txt', '../sts-2017-en-es/En_Es_STS/STS.input.en-es.train_scores.txt')

Reading data...
Preprocessing data...


100%|██████████| 1000/1000 [00:00<00:00, 8749.16it/s]


Handling unknown tokens...


100%|██████████| 1000/1000 [00:00<00:00, 142184.62it/s]
100%|██████████| 1000/1000 [00:00<00:00, 64507.91it/s]


TF-IDF...


100%|██████████| 1000/1000 [00:00<00:00, 99681.63it/s]
100%|██████████| 1000/1000 [00:00<00:00, 52924.30it/s]


In [14]:
EN_datapoints = list()
ES_datapoints = list()

for datapoints in data:
    EN_datapoints.append(datapoints[0])
    ES_datapoints.append(datapoints[1])

EN_datapoints = np.array(EN_datapoints)
ES_datapoints = np.array(ES_datapoints)

In [15]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=100, n_iter=7, random_state=42)

EN_datapoints = SVD.fit_transform(EN_datapoints)
ES_datapoints = SVD.fit_transform(ES_datapoints)

print(EN_datapoints.shape, ES_datapoints.shape)

(1000, 100) (1000, 100)


In [34]:
scores = cosine_similarity(EN_datapoints, ES_datapoints)

In [35]:
print(scores)

[[ 2.73975468e-01  1.31007404e-01 -5.29191381e-02 ...  2.32895302e-02
  -1.27527987e-04  2.68196235e-01]
 [ 1.56385824e-01  3.90073162e-02  1.61929209e-01 ...  1.59522662e-01
   1.54551996e-01 -1.46604164e-01]
 [-8.79260781e-03 -4.14174265e-02  1.93935119e-01 ...  5.29394210e-02
   4.72425191e-02 -2.89874204e-02]
 ...
 [ 1.68707508e-01  1.16773893e-01  2.39514214e-02 ...  5.64764024e-02
   2.23232674e-01  5.18287400e-02]
 [ 1.93477796e-01 -1.87018781e-02  1.04777169e-01 ...  3.78248934e-02
   9.40088059e-02 -4.76415428e-02]
 [ 1.15611257e-01  1.10393717e-01  1.67091786e-01 ... -5.82096903e-02
  -4.65110682e-02  5.32544009e-01]]


In [62]:
scores_needed = list()

for i in range(len(data)):
    normalized_score = (scores[i][i] + 1)/2 * 6 - 0.5
    scores_needed.append(round(normalized_score))

In [64]:
from scipy.stats import pearsonr

In [66]:
pearsonr(data.scores, scores_needed)

(-0.0677572613808956, 0.032156854609399904)

In [68]:
from sklearn.metrics import accuracy_score , f1_score, recall_score , precision_score, classification_report

print(accuracy_score(data.scores, scores_needed))
print(f1_score(data.scores, scores_needed, average='macro'))
print(recall_score(data.scores, scores_needed, average='macro'))
print(precision_score(data.scores, scores_needed, average='macro'))
print(classification_report(data.scores, scores_needed))

0.228
0.10545618035159933
0.1824099827500182
0.17239523602744963
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00        11
           2       0.16      0.14      0.15       161
           3       0.24      0.87      0.38       228
           4       0.13      0.08      0.10        65
           5       0.50      0.00      0.00       534

    accuracy                           0.23      1000
   macro avg       0.17      0.18      0.11      1000
weighted avg       0.36      0.23      0.12      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
