In [18]:
import pandas as pd
import torch
import torch.nn as nn
from torchtext.data import get_tokenizer

def get_tabular(path):
    """Load the article spreadsheet and split it into model inputs.

    Args:
        path: Location of the Excel workbook, passed to ``pd.read_excel``.

    Returns:
        A 3-tuple ``(data, references, description_embedding)``:
        ``data`` is the frame without the identifier/free-text columns and
        with the categorical columns replaced by the integer codes from
        ``pd.factorize``; ``references`` is the output of ``modify_ref`` on
        the ``CodiceArticoloColore`` column; ``description_embedding`` is
        the output of ``word_embedding`` on ``DescrizioneArticolo``.
    """
    frame = pd.read_excel(path)

    # Both derived outputs must be computed before their source columns
    # are dropped from the frame below.
    references = modify_ref(frame['CodiceArticoloColore'])
    description_embedding = word_embedding(frame['DescrizioneArticolo'])

    frame = frame.drop(
        ['CodiceArticolo', 'CodiceArticoloColore', 'DescrizioneArticolo',
         'DescrizioneColore', 'WaveDescription'],
        axis='columns',
    )

    # Replace each categorical column by its integer codes, in place.
    categorical_columns = (
        'Colore',
        'PianoTaglia',
        'WaveCode',
        'AstronomicalSeasonDescription',
    )
    for column in categorical_columns:
        frame[column] = pd.factorize(frame[column])[0]

    return frame, references, description_embedding
    
def modify_ref(ref):
    """Rewrite article/colour reference codes as underscore-separated keys.

    For every code: hyphens become underscores, an underscore is inserted
    before position 7, another before position 13 of the resulting string,
    and a trailing underscore is appended. For example
    ``'232-9698999102-66D'`` becomes ``'232_969_89991_02_66D_'``.

    The input is only iterated, never written to, so the caller's data
    (e.g. a DataFrame column) is left untouched and any iterable of
    strings is accepted regardless of how it is indexed.

    Args:
        ref: Iterable of reference code strings.

    Returns:
        A new list with the rewritten codes, in input order.
    """
    rewritten = []
    for code in ref:
        code = code.replace('-', '_')
        # The second insertion position counts the underscore added by
        # the first one, so the order of these two steps matters.
        code = code[:7] + '_' + code[7:]
        code = code[:13] + '_' + code[13:]
        rewritten.append(code + '_')
    return rewritten

def get_dictionary(descriptions):
    """Build a token -> integer index vocabulary from the descriptions.

    Each description is tokenised on its own with torchtext's
    ``basic_english`` tokenizer and the distinct tokens are numbered
    ``0..N-1`` in sorted order, so the mapping is the same on every run
    for the same input.

    Args:
        descriptions: Iterable of description cells. Any cell that is not
            a string (e.g. a number read from the spreadsheet) is replaced
            by the placeholder text ``"Number"`` before tokenising.

    Returns:
        Dict mapping every distinct token to a unique index.
    """
    tokenizer = get_tokenizer('basic_english')

    vocabulary = set()
    for d in descriptions:
        # Non-text cells cannot be tokenised; use the placeholder instead.
        if not isinstance(d, str):
            d = "Number"
        vocabulary.update(tokenizer(d))

    # Sorting fixes the index assignment; plain set iteration order for
    # strings can differ between interpreter runs.
    return {word: j for j, word in enumerate(sorted(vocabulary))}

def word_embedding(descriptions, embedding_dim=10):
    """Embed every description as a matrix of per-token vectors.

    A vocabulary is built with ``get_dictionary`` and a freshly
    initialised ``nn.Embedding`` layer (one row per vocabulary entry) is
    used to look up the tokens of each description.

    Args:
        descriptions: Iterable of description cells. It is traversed
            twice (vocabulary, then lookup), so it must be re-iterable.
            Any cell that is not a string is replaced by the placeholder
            text ``"Number"``.
        embedding_dim: Width of each token vector (default 10).

    Returns:
        List with one tensor per description, each of shape
        ``(number_of_tokens, embedding_dim)``; a description without
        tokens yields a ``(0, embedding_dim)`` tensor.
    """
    dictionary = get_dictionary(descriptions)
    tokenizer = get_tokenizer('basic_english')
    embed_layer = nn.Embedding(len(dictionary), embedding_dim)

    tensor_list = []
    for d in descriptions:
        # Non-text cells cannot be tokenised; use the placeholder instead.
        if not isinstance(d, str):
            d = "Number"

        indices = [dictionary[t] for t in tokenizer(d)]
        if indices:
            # One batched lookup per description instead of one lookup
            # and one concatenation per token.
            lookup_tensor = torch.tensor(indices, dtype=torch.long)
            desc_tensor = embed_layer(lookup_tensor)
        else:
            desc_tensor = torch.zeros(0, embedding_dim)
        tensor_list.append(desc_tensor)

    return tensor_list
    
# Run the loader on the article workbook (hard-coded local Windows path):
# `data` is the encoded table, `ref` the rewritten reference codes and
# `des` the list of per-description embedding tensors.
data, ref, des = get_tabular('D:\\ORS\\Data\\Mauli_articoli_222_231_232.xlsx')


   Colore  PianoTaglia  WaveCode  AstronomicalSeasonDescription
0       0            0         0                              0
1       1            0         0                              0
2       2            0         0                              0
['232_969_89991_02_66D_', '232_969_89991_01_91Z_', '232_969_89991_00_51J_']
[tensor([[ 7.4109e-01, -1.5878e+00,  4.5847e-01,  2.6090e+00,  1.7160e-02,
         -6.6623e-01,  6.4060e-01, -1.4791e+00, -1.1569e+00, -7.1213e-01],
        [ 4.6582e-01, -8.9483e-04,  1.3262e+00, -2.5383e-01,  2.9042e-01,
          1.3891e+00,  8.0242e-01, -1.1463e+00,  7.1280e-01, -9.1239e-01],
        [-2.3944e-01,  6.7145e-01,  4.5311e-01, -1.7739e+00, -3.9314e-01,
         -5.2247e-01,  1.6690e-01,  1.0283e-01, -9.6974e-01,  7.1355e-01],
        [ 2.5526e+00, -4.9219e-01,  1.4401e-01, -5.4303e-02,  7.6938e-02,
          6.6969e-01, -1.4439e-01, -5.7299e-01,  5.4054e-01, -1.3974e+00],
        [ 1.6869e+00, -4.8471e-01, -1.0615e+00,  2.7658e-01, -2.7866e-0