In [None]:
import os
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, accuracy_score



In [None]:
# Directory that is listed to find the corpus files; each file is later assigned
# to the training or test set according to its name.
conllu_dir = 'data/output_conllu/'

In [None]:
def load_stopwords(file_path):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Every line is stripped of surrounding whitespace and kept in file order.
    Blank lines are kept as empty strings and duplicates are not removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The stripped lines of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

# Stop-word list loaded once at notebook level; get_sentences_from_file reads
# this global to decide which tokens to drop.
lista_stopwords = load_stopwords('data/stopwords-it.txt')

In [None]:
lista_stopwords

In [None]:
# Split the corpus files into training and test sets according to their names.
# os.listdir returns entries in arbitrary order, so the listing is sorted: this
# keeps the document order (and everything derived from it, such as the label
# lists and the cross-validation folds) identical across runs and machines.
train_files = []
test_files = []

for file_name in sorted(os.listdir(conllu_dir)):
    print(file_name)
    if 'train' in file_name:
        train_files.append(file_name)
    elif 'test' in file_name:
        test_files.append(file_name)

print('Documenti training set:', len(train_files))
print('Documenti test set:', len(test_files))

In [None]:
train_files[0]

In [None]:
def get_sentences_from_file(src_path, stopwords=None):
    """Parse a tab-separated CoNLL-U style file into sentences of kept tokens.

    A token line is any line whose first character is a digit. Its first four
    tab-separated columns are read as id, word form, lemma and POS tag. Tokens
    whose id contains '-' (multi-word ranges) and tokens whose lower-cased form
    is a stop word are skipped. A blank line closes the current sentence.

    Args:
        src_path: Path of the file to read (decoded as UTF-8).
        stopwords: Optional iterable of lower-case stop words. When omitted,
            the notebook-level ``lista_stopwords`` is used.

    Returns:
        list[list[dict]]: One list per sentence; each token is a dict with the
        keys 'word', 'lemma' and 'pos'. A sentence may be an empty list when
        all of its tokens were filtered out.
    """
    if stopwords is None:
        stopwords = lista_stopwords
    # A set gives constant-time membership tests instead of scanning a list per token.
    stopword_set = set(stopwords)

    user_sentences = []
    sentence = []
    # The context manager guarantees the file is closed even if parsing fails.
    with open(src_path, 'r', encoding='utf-8') as conllu_file:
        for line in conllu_file:
            if line[0].isdigit():
                columns = line.strip().split('\t')
                if '-' not in columns[0] and columns[1].lower() not in stopword_set:
                    sentence.append({
                        'word': columns[1],
                        'lemma': columns[2],
                        'pos': columns[3],
                    })
            elif not line.strip():
                # Blank line: sentence boundary.
                user_sentences.append(sentence)
                sentence = []

    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        user_sentences.append(sentence)
    return user_sentences

In [None]:
# Parse every training file into its list of sentences. The path is built from
# conllu_dir so that the corpus location is configured in a single place.
train_dataset = []

for user_path in train_files:
    user_sentences = get_sentences_from_file(os.path.join(conllu_dir, user_path))
    train_dataset.append(user_sentences)

len(train_dataset)

In [None]:
def extract_word_ngrams(word_ngrams, sentence, el, n):
    """Count the token-level n-grams of one sentence into ``word_ngrams``.

    Args:
        word_ngrams: Dictionary of counts, updated in place.
        sentence: List of token dicts.
        el: Token key whose values form the n-grams (e.g. 'word').
        n: Number of consecutive tokens per n-gram.

    Returns:
        The same ``word_ngrams`` dictionary. Keys have the form
        ``<EL>_<n>_<v1>_<v2>...`` where ``<EL>`` is ``el`` upper-cased.
    """
    values = [token[el] for token in sentence]
    prefix = f'{el.upper()}_{n}_'

    # Slide a window of n tokens over the sentence.
    for start in range(len(values) - n + 1):
        key = prefix + '_'.join(values[start:start + n])
        word_ngrams[key] = word_ngrams.get(key, 0) + 1

    return word_ngrams

In [None]:
def extract_char_ngrams(char_ngrams, sentence, n):
    """Count the character n-grams of one sentence into ``char_ngrams``.

    The sentence text is rebuilt by joining the tokens' 'word' values with a
    single space, so n-grams may span word boundaries.

    Args:
        char_ngrams: Dictionary of counts, updated in place.
        sentence: List of token dicts with a 'word' key.
        n: Number of characters per n-gram.

    Returns:
        The same ``char_ngrams`` dictionary. Keys have the form
        ``CHAR_<n>_<characters>``.
    """
    text = ' '.join(token['word'] for token in sentence)

    # Slide a window of n characters over the rebuilt text.
    for start in range(len(text) - n + 1):
        key = f'CHAR_{n}_{text[start:start + n]}'
        char_ngrams[key] = char_ngrams.get(key, 0) + 1

    return char_ngrams

In [None]:
import math

def count_document_words(document):
    """Return the total number of tokens in a document.

    Args:
        document: Iterable of sentences, each a sized collection of tokens.

    Returns:
        int: Sum of the sentence lengths (0 for an empty document).
    """
    return sum(len(sentence) for sentence in document)

def count_document_chars(document):
    """Return the total number of characters in a document.

    For each sentence the count is the length of its words joined by single
    spaces: the sum of the word lengths plus one separator between consecutive
    words.

    Empty sentences contribute 0. Without this guard each empty sentence would
    subtract 1 from the total, which could make the document length zero or
    negative and break the normalisation that divides by it.

    Args:
        document: Iterable of sentences, each a list of token dicts with a
            'word' key.

    Returns:
        int: Number of characters, including the spaces between words.
    """
    num_chars = 0
    for sentence in document:
        if not sentence:
            # No words and therefore no separators either.
            continue
        num_chars += sum(len(token['word']) for token in sentence)
        num_chars += len(sentence) - 1  # one space between consecutive words
    return num_chars

def normalize_ngrams(ngrams_dict, doc_len):
    """Divide every count in ``ngrams_dict`` by the document length, in place.

    Args:
        ngrams_dict: Dictionary mapping n-gram keys to counts; its values are
            replaced by relative frequencies.
        doc_len: Length of the document used as the denominator.

    Returns:
        None. The dictionary is modified in place.
    """
    ngrams_dict.update({
        ngram: count / float(doc_len)
        for ngram, count in ngrams_dict.items()
    })


In [None]:
# Sanity check on the parsed corpus: show only the first few sentences of the
# first document instead of printing every sentence of every document, which
# floods the notebook output.
for document in train_dataset[:1]:
    for frase in document[:5]:
        print(frase)

In [None]:
def extract_features(dataset):
    """Build one n-gram feature dictionary per document.

    For every document the function counts character bigrams and trigrams and
    word unigrams, bigrams and trigrams over all of its sentences. Character
    counts are then divided by the document's character count and word counts
    by its token count.

    Other token attributes (e.g. 'pos' or 'lemma') can be counted with
    extract_word_ngrams in the same way; they are not used here.

    Args:
        dataset: List of documents; each document is a list of sentences and
            each sentence a list of token dicts.

    Returns:
        list[dict]: For each document, a dictionary mapping n-gram keys to
        relative frequencies (character n-grams first, then word n-grams).
    """
    dataset_features = []
    for document in dataset:
        # One counter per n-gram order, listed in the order they are merged.
        char_counts = {2: {}, 3: {}}
        word_counts = {1: {}, 2: {}, 3: {}}

        for sentence in document:
            for n, counts in char_counts.items():
                extract_char_ngrams(counts, sentence, n)
            for n, counts in word_counts.items():
                extract_word_ngrams(counts, sentence, 'word', n)

        num_words = count_document_words(document)
        num_chars = count_document_chars(document)

        # Normalise each counter by the matching document length and merge.
        user_features = {}
        for counts in char_counts.values():
            normalize_ngrams(counts, num_chars)
            user_features.update(counts)
        for counts in word_counts.values():
            normalize_ngrams(counts, num_words)
            user_features.update(counts)

        dataset_features.append(user_features)
    return dataset_features

In [None]:
# Build one n-gram feature dictionary per training document.
train_features = extract_features(train_dataset)


In [None]:
train_features[:10]

In [None]:
def get_num_features(dataset):
    """Return the number of distinct feature names used across a dataset.

    Args:
        dataset: Iterable of per-document feature dictionaries.

    Returns:
        int: Size of the union of all dictionary keys.
    """
    return len(set().union(*dataset))

In [None]:
print(f'Numero features: {get_num_features(train_features)}')

In [None]:
def filter_features(train_features, min_occurrences):
    """Drop features that appear in too few documents.

    A feature is counted once for every dictionary that contains it, whatever
    its value. Features whose count is below ``min_occurrences`` are removed
    from every dictionary. The dictionaries are modified in place.

    Args:
        train_features: List of per-document feature dictionaries.
        min_occurrences: Minimum number of dictionaries a feature must appear
            in to be kept.

    Returns:
        The same ``train_features`` list, with the rare features removed.
    """
    # Number of dictionaries each feature appears in.
    doc_frequency = {}
    for user_dict in train_features:
        for feature in user_dict:
            doc_frequency[feature] = doc_frequency.get(feature, 0) + 1

    rare_features = {
        feature
        for feature, count in doc_frequency.items()
        if count < min_occurrences
    }

    for user_dict in train_features:
        for feature in rare_features.intersection(user_dict):
            del user_dict[feature]

    return train_features

In [None]:
# Extract the training features from scratch (filter_features modifies the
# dictionaries it receives) and keep only the features found in at least
# MIN_DOCUMENT_FREQUENCY documents.
MIN_DOCUMENT_FREQUENCY = 5
train_features = filter_features(extract_features(train_dataset), MIN_DOCUMENT_FREQUENCY)
print(f'Numero features dopo il filtro: {get_num_features(train_features)}')

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Turn the per-document feature dictionaries into a feature matrix. The fitted
# vectorizer is reused later to transform the test documents.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Rescale the training matrix with a MaxAbsScaler fitted on the training data.
# The fitted scaler is reused later on the test matrix.
scaler = MaxAbsScaler()
X_train = scaler.fit_transform(X_train)

In [None]:
def _labels_from_file_names(file_names):
    """Read the irony and sarcasm labels encoded in the given file names.

    After removing a trailing '.conllu' (only when present), each name is split
    on '_'. The third field is taken as the irony label and the fourth as the
    sarcasm label.

    Args:
        file_names: Iterable of file names.

    Returns:
        tuple[list[str], list[str]]: The irony labels and the sarcasm labels,
        as strings, in the order of ``file_names``.

    Raises:
        IndexError: If a name has fewer than four '_'-separated fields.
    """
    labels_irony = []
    labels_sarcasm = []
    for file_name in file_names:
        # Strip the extension only when it is really there; slicing a fixed
        # number of characters would corrupt names without it.
        fields = file_name.removesuffix('.conllu').split('_')
        labels_irony.append(fields[2])
        labels_sarcasm.append(fields[3])
    return labels_irony, labels_sarcasm


def create_label_train(dataset):
    """Return the (irony, sarcasm) label lists for the training file names.

    Args:
        dataset: Iterable of training file names.

    Returns:
        tuple[list[str], list[str]]: Irony labels and sarcasm labels.
    """
    return _labels_from_file_names(dataset)


def create_label_test(dataset):
    """Return the (irony, sarcasm) label lists for the test file names.

    Args:
        dataset: Iterable of test file names.

    Returns:
        tuple[list[str], list[str]]: Irony labels and sarcasm labels.
    """
    return _labels_from_file_names(dataset)


In [None]:
# Labels are parsed from the file names, so they follow the order of
# train_files and test_files.
train_labels_irony, train_labels_sarcasm = create_label_train(train_files)
test_labels_irony, test_labels_sarcasm = create_label_test(test_files)

In [None]:
train_labels_irony

In [None]:
from sklearn.svm import LinearSVC

# Final irony classifier, trained on the whole training matrix. With dual=True
# the solver shuffles the data with a pseudo-random generator, so the seed is
# fixed to make the fitted model reproducible across runs.
svc = LinearSVC(dual=True, max_iter=10000, random_state=42)
svc.fit(X_train, train_labels_irony)

In [None]:
from sklearn.metrics import classification_report

# Report on the same data the model was fitted on, so this is not an estimate
# of how well the model generalises.
train_predictions = svc.predict(X_train)
print(classification_report(train_labels_irony, train_predictions, zero_division=0))

In [None]:
# Irony labels as a NumPy array, so they can be indexed with the arrays of
# row indices produced by the cross-validation splitter.
y_train = np.asarray(train_labels_irony)

In [None]:
# Shuffled 5-fold split of the training rows with a fixed seed. Each fold is a
# pair (training indices, validation indices).
splitter = KFold(n_splits=5, random_state=42, shuffle=True)
folds = list(splitter.split(X_train))

# Show the size of the two parts of every fold.
for i, (fold_train_part, fold_test_part) in enumerate(folds):
    print(len(fold_train_part), len(fold_test_part))

In [None]:
from sklearn.dummy import DummyClassifier

# Cross-validated accuracy of a LinearSVC compared, fold by fold, with a
# majority-class baseline. True and predicted labels of all folds are
# accumulated in all_y_true / all_y_pred.
all_y_true = []
all_y_pred = []
fold_accuracies = []

for i, (train_ids, test_ids) in enumerate(folds):
    # Slice the training matrix and the labels with this fold's row indices.
    fold_X_train, fold_y_train = X_train[train_ids], y_train[train_ids]
    fold_X_test, fold_y_test = X_train[test_ids], y_train[test_ids]

    kfold_svc = LinearSVC(dual=False).fit(fold_X_train, fold_y_train)
    fold_y_pred = kfold_svc.predict(fold_X_test)
    fold_accuracy = accuracy_score(fold_y_test, fold_y_pred)
    fold_accuracies.append(fold_accuracy)

    # Baseline: always predict the most frequent class of the fold's training part.
    dummy_clf = DummyClassifier(strategy="most_frequent").fit(fold_X_train, fold_y_train)
    dummy_score = dummy_clf.score(fold_X_test, fold_y_test)

    all_y_true.extend(fold_y_test.tolist())
    all_y_pred.extend(fold_y_pred.tolist())
    print(f"Accuracy fold {i+1}: {fold_accuracy}, baseline: {dummy_score}")

# Mean accuracy over all folds.
mean_accuracy = sum(fold_accuracies) / len(fold_accuracies)
print(f"Accuracy media sui 5 fold: {mean_accuracy}")

In [None]:
# Parse the test files and build their feature matrix. The path is built from
# conllu_dir so that the corpus location is configured in a single place.
test_dataset = []

for file_path in test_files:
    doc_sentences = get_sentences_from_file(os.path.join(conllu_dir, file_path))
    test_dataset.append(doc_sentences)

test_labels_irony, test_labels_sarcasm = create_label_test(test_files)
test_features = extract_features(test_dataset)

# Only transform (never fit) with the vectorizer and scaler fitted on the
# training data, so the test matrix has the same columns and scaling.
X_test = vectorizer.transform(test_features)
X_test = scaler.transform(X_test)

In [None]:
test_labels_irony

In [None]:
# Evaluate the classifier fitted on the full training set on the test documents.
test_predictions = svc.predict(X_test)
print(classification_report(test_labels_irony, test_predictions,  zero_division=0))

In [None]:
# Learned weights of the linear classifier; the last line displays their shape.
coefs = svc.coef_ 
coefs.shape

In [None]:
X_train

In [None]:
# Feature names in the column order of the vectorized matrix.
# get_feature_names_out ignores its optional argument, so it is called without
# one rather than being handed the training matrix.
features_names = vectorizer.get_feature_names_out().tolist()

In [None]:
import matplotlib.pyplot as plt

idx = 0
class_coefs = coefs[idx]

# With two classes the model stores a single weight vector, and positive
# weights push the decision towards svc.classes_[1], not svc.classes_[0].
# With more classes, row idx belongs to svc.classes_[idx].
if coefs.shape[0] == 1:
    described_class = svc.classes_[1]
else:
    described_class = svc.classes_[idx]

feature_importances = dict(zip(features_names, class_coefs))
sorted_feature_importances = dict(sorted(feature_importances.items(), key=lambda item: item[1], reverse=True))

# Never ask for more bars than there are features.
num_to_plot = min(15, len(sorted_feature_importances))
top_names = list(sorted_feature_importances.keys())[:num_to_plot]
top_weights = list(sorted_feature_importances.values())[:num_to_plot]

print(f'Feature importance classe {described_class}')
plt.barh(range(num_to_plot), top_weights, align='center')
plt.yticks(range(num_to_plot), top_names)
plt.gca().invert_yaxis()
plt.show()