In [None]:
import pandas as pd
import os
import matplotlib as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold
import numpy as np

In [None]:
# Input CSVs (read with ';' as delimiter by the loading cell) and the folder
# that receives one text file per tweet.
path_train = 'dataset/training_ironita2018_anon_REV_.csv'
path_test =  'dataset/test_gold_ironita2018_anon_REV_.csv'
out_dir = 'data/profiling_input/'

In [None]:
# df1 <- training split, df2 <- test split; both files are semicolon-separated.
df1, df2 = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (path_train, path_test)
)

In [None]:
# Helper functions that compute the new hand-crafted features from the tweet text
def count_exclamation(text):
  """Return the number of '!' characters contained in ``text``."""
  exclamation_mark = '!'
  return text.count(exclamation_mark)

def count_question_mark(text):
  """Return the number of '?' characters contained in ``text``."""
  question_mark = '?'
  return text.count(question_mark)

def count_dot(text):
  """Return the number of '.' characters contained in ``text``."""
  full_stop = '.'
  return text.count(full_stop)

def negative_word(text):
  """Return how many times the negation word "non" occurs in ``text``.

  The word is matched as a whole token, case-insensitively, so occurrences at
  the very start or end of the text, next to punctuation, capitalised
  ("Non ...") or repeated back to back ("non non") are all counted.
  Words that merely contain the letters (e.g. "nonno") are not counted.

  Parameters
  ----------
  text : str
      The tweet text to inspect.

  Returns
  -------
  int
      Number of whole-word occurrences of "non".
  """
  # Local import: the notebook's import cell does not bring `re` into scope.
  import re

  # The previous `text.count(' non ')` required a literal space on both sides,
  # which silently skipped sentence-initial/final hits and every second hit
  # in a run such as " non non " (the shared space is consumed by the first).
  return len(re.findall(r'\bnon\b', text, flags=re.IGNORECASE))


In [None]:
# Exclamation-mark count per tweet, stored separately on the training (df1)
# and test (df2) frames; the helper is passed directly, no wrapper lambda.
df1['exclamation_count_df1'] = df1['text'].apply(count_exclamation)
df2['exclamation_count_df2'] = df2['text'].apply(count_exclamation)

In [None]:
# Question-mark count per tweet, stored separately on the training (df1)
# and test (df2) frames; the helper is passed directly, no wrapper lambda.
df1['question_count_df1'] = df1['text'].apply(count_question_mark)
df2['question_count_df2'] = df2['text'].apply(count_question_mark)

In [None]:
# Full-stop count per tweet, stored separately on the training (df1)
# and test (df2) frames; the helper is passed directly, no wrapper lambda.
df1['count_dot_df1'] = df1['text'].apply(count_dot)
df2['count_dot_df2'] = df2['text'].apply(count_dot)

In [None]:
# Negation-word count per tweet, stored separately on the training (df1)
# and test (df2) frames; the helper is passed directly, no wrapper lambda.
df1['negative_word_df1'] = df1['text'].apply(negative_word)
df2['negative_word_df2'] = df2['text'].apply(negative_word)

In [None]:
# Write every training tweet to its own UTF-8 text file named
# train{id}_{irony}_{sarcasm}.txt, so the labels travel inside the file name.
# The output folder is created first so the cell also works on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

# `tweet_id` (not `id`) so the builtin id() is not shadowed for later cells.
for tweet_id, tweet_text, irony, sarcasm in zip(df1['id'], df1['text'], df1['irony'], df1['sarcasm']):
    file_name = f"train{tweet_id}_{irony}_{sarcasm}.txt"
    file_path = os.path.join(out_dir, file_name)
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(tweet_text)

In [None]:
# Sanity check: how many entries the profiling input folder currently holds.
input_path = 'data/profiling_input'

folder_entries = os.listdir(input_path)
num_files = len(folder_entries)

print(f"Il numero di file nella cartella '{input_path}' è: {num_files}")

In [None]:
# Write every test tweet to its own UTF-8 text file named
# test{id}_{irony}_{sarcasm}.txt, so the labels travel inside the file name.
# The output folder is created first so the cell also works on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

# `tweet_id` (not `id`) so the builtin id() is not shadowed for later cells.
for tweet_id, tweet_text, irony, sarcasm in zip(df2['id'], df2['text'], df2['irony'], df2['sarcasm']):
    file_name = f"test{tweet_id}_{irony}_{sarcasm}.txt"
    file_path = os.path.join(out_dir, file_name)
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(tweet_text)

In [None]:
# Feature table that is loaded next as tab-separated text.
# NOTE(review): presumably produced by an external profiling tool run on the
# files written to out_dir — confirm how '10380.csv' is generated.
output_path = 'data/profiling_output/10380.csv'

In [None]:
# Despite the .csv extension the file is tab-separated.
df3 = pd.read_csv(output_path, sep="\t")
df3

In [None]:
# Build the 'id' column used as the join key when merging the other dataframes:
# it is the first run of digits found in each file name, cast to int64.
first_digit_run = df3['Filename'].str.extract(r'(\d+)', expand=False)
df3['id'] = first_digit_run.astype(np.int64)

In [None]:
# Attach the full-stop counts from both splits, then collapse them into a
# single 'dot_count' column (test value first, training value as fallback).
# NOTE(review): rows are matched on the numeric id only; if a training id can
# equal a test id the test value wins for both rows — confirm ids are disjoint.
df3 = (
    df3
    .merge(df1[['id', 'count_dot_df1']], on='id', how='left')
    .merge(df2[['id', 'count_dot_df2']], on='id', how='left')
)

df3['dot_count'] = df3['count_dot_df2'].fillna(df3['count_dot_df1'])

df3 = df3.drop(columns=['count_dot_df1', 'count_dot_df2'])


In [None]:
# Attach the question-mark counts from both splits, then collapse them into a
# single 'question_count' column (test value first, training value as fallback).
# NOTE(review): rows are matched on the numeric id only; if a training id can
# equal a test id the test value wins for both rows — confirm ids are disjoint.
df3 = (
    df3
    .merge(df1[['id', 'question_count_df1']], on='id', how='left')
    .merge(df2[['id', 'question_count_df2']], on='id', how='left')
)

df3['question_count'] = df3['question_count_df2'].fillna(df3['question_count_df1'])

df3 = df3.drop(columns=['question_count_df1', 'question_count_df2'])

In [None]:
# Attach the exclamation-mark counts from both splits, then collapse them into a
# single 'exclamation_count' column (test value first, training value as fallback).
# NOTE(review): rows are matched on the numeric id only; if a training id can
# equal a test id the test value wins for both rows — confirm ids are disjoint.
df3 = (
    df3
    .merge(df1[['id', 'exclamation_count_df1']], on='id', how='left')
    .merge(df2[['id', 'exclamation_count_df2']], on='id', how='left')
)

df3['exclamation_count'] = df3['exclamation_count_df2'].fillna(df3['exclamation_count_df1'])

df3 = df3.drop(columns=['exclamation_count_df1', 'exclamation_count_df2'])

In [None]:
# Attach the negation-word counts from both splits, then collapse them into a
# single 'negative_word' column (test value first, training value as fallback).
# NOTE(review): rows are matched on the numeric id only; if a training id can
# equal a test id the test value wins for both rows — confirm ids are disjoint.
df3 = (
    df3
    .merge(df1[['id', 'negative_word_df1']], on='id', how='left')
    .merge(df2[['id', 'negative_word_df2']], on='id', how='left')
)

df3['negative_word'] = df3['negative_word_df2'].fillna(df3['negative_word_df1'])

df3 = df3.drop(columns=['negative_word_df1', 'negative_word_df2'])

In [None]:
# The join key has served its purpose; remove it so it is not used as a feature.
df3 = df3.drop(columns='id')

In [None]:
# Add one more column holding the row-wise sum of the four hand-crafted counts.
# The columns are selected by name rather than by position (`iloc[:, -4:]`):
# a positional slice would include 'custom_features' itself if this cell were
# run a second time, and the old variable name claimed three columns while
# four were summed.
custom_feature_columns = ['dot_count', 'question_count', 'exclamation_count', 'negative_word']

df3['custom_features'] = df3[custom_feature_columns].sum(axis=1)

print(df3)

In [None]:
# Keep only the columns whose number of zero cells is strictly below 90% of the
# rows; columns at or above that share are discarded.
# NOTE(review): the counts are taken over training and test rows together.
num_rows = df3.shape[0]
threshold = num_rows * 0.90

zero_counts = df3.eq(0).sum()
is_dense_enough = zero_counts.lt(threshold)
selected_columns = zero_counts.loc[is_dense_enough].index

df3 = df3.loc[:, selected_columns]

print(df3)

In [None]:
# One plain Python list per row of df3 (file name first, then the features).
# `DataFrame.values.tolist()` yields the same nested lists as appending
# `row.values.tolist()` inside an `iterrows()` loop, without building a
# Series object for every row.
dataset = df3.values.tolist()

print(dataset[0])

In [None]:
# Every column after the first one (the file name) is a model feature.
header = df3.columns
feature_names = list(header[1:])
print(len(feature_names))

In [None]:
# Number of feature columns (same value the previous cell prints).
len(feature_names)

In [None]:
def split_train_test(dataset):
    """Partition the rows into training and test rows.

    A row goes to the training partition when the substring 'train' appears in
    its first element (the file name); every other row goes to the test
    partition. The row objects themselves are reused, not copied.

    Returns
    -------
    tuple[list, list]
        ``(train_rows, test_rows)`` in their original relative order.
    """
    partitions = {True: [], False: []}
    for row in dataset:
        is_train = 'train' in row[0]
        partitions[is_train].append(row)
    return partitions[True], partitions[False]

In [None]:
# Separate the rows by the split encoded in their file name and report sizes.
train_dataset, test_dataset = split_train_test(dataset)

print(f"User training set: {len(train_dataset)}")
print(f"User test set: {len(test_dataset)}")

In [None]:
# Step-by-step walkthrough, on the first test row, of how a label is recovered
# from a file name.
user_0 = test_dataset[0]

file_name_0 = user_0[0]
print('File name:', file_name_0)

file_name_0 = file_name_0[0:-len('.conllu')]
print('File name senza estensione:', file_name_0)

splitted_file_name = file_name_0.split('_')
print('Split su _:', splitted_file_name)

# The written file names end with '_{irony}_{sarcasm}', so the irony flag is the
# second-to-last field. Indexing from the end keeps this walkthrough in line
# with the label-extraction functions whether or not the Filename column
# carries a prefix that itself contains underscores (the fixed index 1 did not).
label = splitted_file_name[-2]
print('Label:', label)

In [None]:
def _labels_from_file_names(dataset):
    """Parse the irony and sarcasm labels encoded at the end of each file name.

    File names are expected to end with ``_{irony}_{sarcasm}.conllu``. The two
    labels are read from the END of the underscore-separated name, so any
    prefix (including one that contains underscores) is tolerated.

    Parameters
    ----------
    dataset : iterable of sequences
        Rows whose first element is the file name (a string).

    Returns
    -------
    tuple[list[str], list[str]]
        ``(labels_irony, labels_sarcasm)``, one string label per row.

    Raises
    ------
    ValueError
        If a file name has fewer than three underscore-separated fields and
        therefore cannot contain both labels.
    """
    suffix = '.conllu'
    labels_irony = []
    labels_sarcasm = []
    for user_list in dataset:
        file_name = user_list[0]
        # Strip the extension only when it is really there, instead of
        # blindly cutting a fixed number of characters.
        if file_name.endswith(suffix):
            file_name = file_name[:-len(suffix)]
        fields = file_name.split('_')
        if len(fields) < 3:
            raise ValueError(f"Cannot read irony/sarcasm labels from file name: {user_list[0]!r}")

        labels_irony.append(fields[-2])
        labels_sarcasm.append(fields[-1])
    return labels_irony, labels_sarcasm


def create_label_train(dataset):
    """Return ``(labels_irony, labels_sarcasm)`` for the training rows.

    Labels are strings parsed from the file name stored in each row's first
    element; see ``_labels_from_file_names`` for the expected format.
    """
    return _labels_from_file_names(dataset)


def create_label_test(dataset):
    """Return ``(labels_irony, labels_sarcasm)`` for the test rows.

    Labels are strings parsed from the file name stored in each row's first
    element; see ``_labels_from_file_names`` for the expected format.
    """
    return _labels_from_file_names(dataset)


In [None]:
# Irony and sarcasm labels (strings parsed from the file names) for each split.
train_labels_irony, train_labels_sarcasm = create_label_train(train_dataset)
test_labels_irony, test_labels_sarcasm = create_label_test(test_dataset)

In [None]:
# Display the parsed irony labels of the test rows for a visual check.
test_labels_irony

In [None]:
# Remove the leading file name from every row so only numeric features remain.
# The isinstance guard makes the cell safe to re-run: once the file name is
# gone the first element is a number, and popping again would silently drop a
# real feature column.
for doc in train_dataset:
    if doc and isinstance(doc[0], str):
        doc.pop(0)

for doc in test_dataset:
    if doc and isinstance(doc[0], str):
        doc.pop(0)

In [None]:
# Learn the per-feature min/max on the training rows, then rescale those rows.
scaler = MinMaxScaler().fit(train_dataset)
X_train = scaler.transform(train_dataset)

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Linear SVM (primal formulation) trained on the irony labels.
svc = LinearSVC(dual=False).fit(X_train, train_labels_irony)
svc

In [None]:
# Predictions on the very rows the model was fitted on (not a held-out set).
train_predictions = svc.predict(X_train)
train_predictions

In [None]:
# Per-class metrics on the training rows themselves.
print(classification_report(train_labels_irony, train_predictions)) # pass output_dict=True to get a dict instead of text

In [None]:
# Confusion matrix for the training-set predictions; the trailing ';' hides the object repr.
ConfusionMatrixDisplay.from_predictions(train_labels_irony, train_predictions, xticks_rotation='vertical', cmap='Blues');

In [None]:
# Array form of the irony labels so they can be indexed with the fold index arrays.
y_train = np.asarray(train_labels_irony)

In [None]:
# Five shuffled folds with a fixed seed; each entry of `folds` is a
# (train indices, test indices) pair. The sizes are printed as a sanity check.
splitter = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(splitter.split(X_train))

for fold_train_ids, fold_test_ids in folds:
    print(len(fold_train_ids), len(fold_test_ids))

In [None]:
from sklearn.dummy import DummyClassifier

# Cross-validation on the training rows: for every fold a fresh LinearSVC and a
# majority-class baseline are fitted on the fold's training part and scored on
# its held-out part. True and predicted labels are pooled across folds.
# NOTE(review): X_train was scaled with statistics from all training rows, so
# each fold's held-out part has already influenced the scaling.
all_y_true = []
all_y_pred = []

for fold_number, (train_ids, test_ids) in enumerate(folds, start=1):
    fold_X_train, fold_y_train = X_train[train_ids], y_train[train_ids]
    fold_X_test, fold_y_test = X_train[test_ids], y_train[test_ids]

    kfold_svc = LinearSVC(dual=False).fit(fold_X_train, fold_y_train)
    fold_y_pred = kfold_svc.predict(fold_X_test)
    fold_accuracy = accuracy_score(fold_y_test, fold_y_pred)

    dummy_clf = DummyClassifier(strategy="most_frequent").fit(fold_X_train, fold_y_train)
    dummy_score = dummy_clf.score(fold_X_test, fold_y_test)

    all_y_true.extend(fold_y_test.tolist())
    all_y_pred.extend(fold_y_pred.tolist())
    print(f"Accuracy fold {fold_number}: {fold_accuracy}, baseline: {dummy_score}")


In [None]:
# Per-class metrics over the held-out predictions pooled from all folds.
print(classification_report(all_y_true, all_y_pred, zero_division=0))

In [None]:
# Confusion matrix for the pooled cross-validation predictions; the trailing ';' hides the object repr.
ConfusionMatrixDisplay.from_predictions(all_y_true, all_y_pred, xticks_rotation='vertical', cmap='Blues');

In [None]:
# Rescale the test rows with the min/max learned on the TRAINING rows.
# `fit_transform` here would re-fit the scaler on the test data, so the test
# features would be mapped with different ranges than the ones the classifier
# was trained on (and test statistics would leak into preprocessing).
X_test = scaler.transform(test_dataset)

In [None]:
# Evaluate the classifier trained on the full training set on the test rows.
test_predictions = svc.predict(X_test)
test_report = classification_report(test_labels_irony, test_predictions, zero_division=0)
print(test_report)

In [None]:
# Majority-class baseline fitted on the training rows and scored on the test
# rows, as a reference point for the SVM report.
baseline_classifier = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
baseline_test_predictions = baseline_classifier.predict(X_test)
baseline_report = classification_report(test_labels_irony, baseline_test_predictions, zero_division=0)
print(baseline_report)

In [None]:
# Learned weights of the fitted linear SVM; the shape is displayed to check
# how many weight rows and features there are.
coefs = svc.coef_ 
coefs.shape

In [None]:
# Re-import kept in this cell: the notebook's first cell binds `plt` to the
# `matplotlib` package itself, which has no `barh`/`yticks`/`show`.
import matplotlib.pyplot as plt
idx = 0
class_coefs = coefs[idx]

feature_importances = {feature_name: coef for feature_name, coef in zip(feature_names, class_coefs)}
# Largest (most positive) weights first.
sorted_feature_importances = dict(sorted(feature_importances.items(), key=lambda item: item[1], reverse=True))

# With two classes LinearSVC stores a single weight row, and positive weights
# push the decision towards classes_[1]; labelling that row with classes_[0]
# would attribute the plotted features to the wrong class.
if coefs.shape[0] == 1 and len(svc.classes_) == 2:
    target_class = svc.classes_[1]
else:
    target_class = svc.classes_[idx]

# Never ask for more bars than there are features, otherwise the bar
# positions and the values/labels would have different lengths.
num_to_plot = min(20, len(sorted_feature_importances))
top_feature_names = list(sorted_feature_importances.keys())[:num_to_plot]
top_feature_weights = list(sorted_feature_importances.values())[:num_to_plot]

print(f'Feature importance classe {target_class}')
plt.barh(range(num_to_plot), top_feature_weights, align='center')
plt.yticks(range(num_to_plot), top_feature_names)
plt.show()