In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Data Understanding and Cleaning**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Load the news-headline dataset and preview its first rows.
NEWS_CSV_PATH = "/kaggle/input/italian-sarcasm-detection/News_Dataset.csv"
df = pd.read_csv(NEWS_CSV_PATH)
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
import re
import string
# Normalise every headline in a single pass: lower-case it, then trim leading
# and trailing punctuation from each whitespace-separated word (punctuation
# in the middle of a word is left untouched) and re-join with single spaces.
df['titolo'] = df['titolo'].apply(
    lambda headline: ' '.join(
        word.strip(string.punctuation) for word in headline.lower().split()
    )
)

In [None]:
# Pie chart of the class balance. `explode` has two entries, one per slice,
# so the plot expects exactly two distinct values in `sarcastic`.
class_counts = df['sarcastic'].value_counts()
class_counts.plot.pie(
    title='1:Sarcastic / 0:Non sarcastic ITA',
    autopct='%1.1f%%',
    explode=(0, 0.1),
)
plt.show()

In [None]:
# Absolute number of headlines per class.
df['sarcastic'].value_counts()

In [None]:
# Sites the data was taken from: the third '/'-separated piece of each link
# (the host for links shaped like scheme://host/...), counted per site.
source_sites = df['link'].map(lambda url: url.split('/')[2])
source_sites.value_counts()

**Linguistic Analysis**

In [None]:
# Regular expressions plus NLTK's stop-word corpus
import re

import nltk
from nltk.corpus import stopwords

# Fetch the stop-word lists used in the following cells.
nltk.download('stopwords')

In [None]:
# Italian stop words, held in a set for fast membership tests.
italian_stopword_list = stopwords.words('italian')
stopITA = set(italian_stopword_list)

In [None]:
# Regex-based cleaning of every headline into `cleaned_titleITA`.
#
# The patterns are compiled once, outside the loop, and written as raw strings
# so their backslash escapes are not interpreted as (invalid) Python escapes.
_SYMBOLS_RE = re.compile(r'[?|!\'"#+$]')   # symbols deleted outright (replaced by nothing)
_HAS_DIGIT_RE = re.compile(r'\S*\d\S*')     # any whitespace-delimited token containing a digit
# Runs of characters that are not letters. Unicode-aware on purpose: the
# previous ASCII-only class ([^A-Za-z]) also removed accented letters, so a
# word such as "perché" was cut to "perch" and accented stop words were
# mangled before the stop-word filter could match them.
_NON_LETTER_RE = re.compile(r'[\W\d_]+')

cleaned_titleITA = []

for title in df['titolo'].values:
    title = str(title)                              # tolerate non-string cells
    title = _SYMBOLS_RE.sub('', title)
    title = _HAS_DIGIT_RE.sub('', title).strip()    # drop tokens with numbers
    title = _NON_LETTER_RE.sub(' ', title)
    # Lower-case and drop Italian stop words.
    title = ' '.join(w.lower() for w in title.split() if w.lower() not in stopITA)
    cleaned_titleITA.append(title.strip())

In [None]:
# Store the regex-cleaned headlines as a new column next to the original text.
df['Titolo_cleaned_re'] = cleaned_titleITA

**Tokenization with NLTK**

In [None]:
import nltk

# Tokenizer models needed by nltk.word_tokenize. Recent NLTK releases look the
# Punkt data up under 'punkt_tab', older ones under 'punkt'; requesting both
# keeps the notebook runnable on either (a package unknown to the installed
# version is only reported by the downloader, it does not raise).
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
# Tokenize every headline. The headlines are Italian, so the Italian Punkt
# model is requested explicitly instead of word_tokenize's default language.
df['titolo_tokenize'] = df['titolo'].apply(
    lambda title: nltk.word_tokenize(title, language='italian')
)
df['titolo_tokenize'].head()

In [None]:
# Flatten the per-headline token lists into one lower-cased token list.
list_tokensITA = [
    token.lower()
    for headline_tokens in df['titolo_tokenize']
    for token in headline_tokens
]
print(list_tokensITA[:20])

In [None]:
# Model used by nltk.pos_tag. Recent NLTK releases ship it as
# 'averaged_perceptron_tagger_eng', older ones as 'averaged_perceptron_tagger';
# both are requested so the POS-tagging cell works on either version.
for tagger_resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(tagger_resource)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Make sure the stop-word corpus is present, then build the Italian set.
nltk.download('stopwords')
italian_words = stopwords.words('italian')
stopWords = set(italian_words)

In [None]:
# Keep only the tokens that are not Italian stop words.
filtered_tokensITA = [token for token in list_tokensITA if token not in stopWords]
print("Filtered Sentence:",filtered_tokensITA[:10])

In [None]:
# Punctuation-like tokens to discard after tokenization.
punkt = [
    ',', '.', '!', '?', '...', '-', '…', "'", "’", ':', '"', '$', '\'s', '%',
    '“', '”', '«', '»', '``', "''",
]

# Tokens that survived stop-word removal and are not in the list above.
filtered_tokens_nITA = [token for token in filtered_tokensITA if token not in punkt]

In [None]:
#POS Tagging
# Tag the filtered tokens; the tagged list is kept in `listPos` and its first
# ten entries are displayed.
# NOTE(review): pos_tag is called without a language argument while the tokens
# come from Italian headlines — confirm the default tagger is appropriate.
listPos = nltk.pos_tag(filtered_tokens_nITA)
listPos[:10]

In [None]:
# Counter lives in the standard library; importing it from there avoids relying
# on the name happening to be re-exported by the nltk package.
from collections import Counter

# Ten most frequent entries of the tagged token list.
Counter(listPos).most_common(10)

In [None]:
# Frequency distribution of the tokens left after stop-word and punctuation
# filtering; plot the 20 most frequent ones.
freq_dist_Nostop = nltk.FreqDist(filtered_tokens_nITA)
freq_dist_Nostop.plot(20)

In [None]:
# Wrap the full (unfiltered) token list in an nltk.Text and show the
# concordance of 'coronavirus'.
nltk_text = nltk.Text(list_tokensITA)
nltk_text.concordance('coronavirus')

**LEMMATIZATION**

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

# WordNet data backs the lemmatizer instantiated below.
nltk.download('wordnet')
lmtzr = WordNetLemmatizer()

In [None]:
# Lemmatize the word part (element 0) of every entry of `listPos`; the tag
# part is not passed to the lemmatizer.
list_lem = [lmtzr.lemmatize(tagged[0]) for tagged in listPos]
print(list_lem[:20])


**Latent Dirichlet Allocation (LDA)**

In [None]:
# Libraries for the LDA topic-modelling section
import tensorflow as tf
import string
import re
import nltk
import spacy
import sys
from spacy.lang.en import English
import en_core_web_sm
from nltk.corpus import wordnet as wn
# The class is WordNetLemmatizer; the misspelled name `WordNetLemmatize` made
# this cell fail with ImportError.
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
nlp = en_core_web_sm.load()
nltk.download('stopwords')
# Tokenizer-only spaCy pipeline used by tokenize().
parser = English()
# Stop words removed before LDA. The headlines are Italian, so the Italian list
# is added to the English one; with English words only, Italian function words
# would pass the filter and show up in the topics. The name `en_stop` is kept
# because prepare_text_for_lda() refers to it.
en_stop = (
    set(nltk.corpus.stopwords.words('english'))
    | set(nltk.corpus.stopwords.words('italian'))
)

In [None]:
def tokenize(text):
    """Split a headline into a list of word tokens for LDA.

    The text is run through the module-level `parser`. Whitespace-only tokens
    are skipped, URL-like tokens become the placeholder 'URL', tokens starting
    with '@' become 'SCREEN_NAME', and every other token is added in lower case.
    """
    lda_tokens = []
    for token in parser(text):
        surface = token.orth_
        if surface.isspace():
            # Pure whitespace carries no content: leave it out of the result.
            continue
        if token.like_url:
            lda_tokens.append('URL')
            continue
        if surface.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
            continue
        lda_tokens.append(token.lower_)
    return lda_tokens

In [None]:
def get_lemma(word):
    """Return the WordNet base form of `word` as given by `wn.morphy`.

    When `wn.morphy` has no result (returns None) the word is returned unchanged.
    """
    lemma = wn.morphy(word)
    return word if lemma is None else lemma

In [None]:
def prepare_text_for_lda(text):
    """Turn one headline into the token list fed to LDA.

    Steps, in order: tokenize the headline, keep only tokens longer than four
    characters, drop tokens found in `en_stop`, and lemmatize what is left.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop
    ]

In [None]:
# Split the frame by label: NS = non-sarcastic (0), S = sarcastic (1).
NS = df.loc[df['sarcastic'] == 0]
S = df.loc[df['sarcastic'] == 1]

In [None]:
# WordNet data is needed by get_lemma(), which prepare_text_for_lda() calls.
nltk.download('wordnet')
# LDA-ready token lists for every headline in the dataset.
text_data = [prepare_text_for_lda(headline) for headline in df['titolo']]

In [None]:
# Same preparation, separately for the sarcastic and non-sarcastic subsets.
Sarcasm_data = [prepare_text_for_lda(headline) for headline in S['titolo']]
Not_Sar = [prepare_text_for_lda(headline) for headline in NS['titolo']]

In [None]:
from gensim import corpora
import pickle

# For each collection (all headlines, sarcastic, non-sarcastic): first build a
# gensim Dictionary from the token lists, then convert every headline (a list
# of words) into bag-of-words format with that dictionary.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(doc) for doc in text_data]

S_dictionary = corpora.Dictionary(Sarcasm_data)
S_corpus = [S_dictionary.doc2bow(doc) for doc in Sarcasm_data]

NS_dictionary = corpora.Dictionary(Not_Sar)
NS_corpus = [NS_dictionary.doc2bow(doc) for doc in Not_Sar]

In [None]:
# Number of headlines in each collection: all, sarcastic, non-sarcastic.
print(*(len(docs) for docs in (text_data, Sarcasm_data, Not_Sar)))

In [None]:
import gensim

NUM_TOPICS = [3, 5, 10]
# LDA settings:
#   passes - number of passes through the corpus during training.
#   alpha  - prior on the distribution of topics in each document; the higher
#            it is, the more likely a document covers a wide range of topics.
#   beta   - prior on the distribution of words in each topic; the higher it
#            is, the more likely a topic covers a wide range of words.
# alpha and beta are left at their default values (no fine-tuning).
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS[1],
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=5)
topics

In [None]:
# LDA on the non-sarcastic headlines only: NUM_TOPICS[1] topics, top 5 words each.
ldamodel = gensim.models.ldamodel.LdaModel(NS_corpus, num_topics = NUM_TOPICS[1], id2word=NS_dictionary, passes=15)
# Saved under its own file name: this cell used to write 'model5.gensim', the
# same path the full-corpus model is saved to, silently overwriting that model.
ldamodel.save('model5_non_sarcastic.gensim')
topics = ldamodel.print_topics(num_words=5)
topics

In [None]:
# LDA on the sarcastic headlines only: NUM_TOPICS[1] topics, top 5 words each.
ldamodel = gensim.models.ldamodel.LdaModel(S_corpus, num_topics = NUM_TOPICS[1], id2word=S_dictionary, passes=15)
# The model has NUM_TOPICS[1] == 5 topics, so the former file name
# 'model10.gensim' mislabelled it; the name now states topic count and subset.
ldamodel.save('model5_sarcastic.gensim')
topics = ldamodel.print_topics(num_words=5)
topics

**CLASSIFICATION**

*BERT*

In [None]:
import tensorflow as tf
import sklearn
import seaborn as sbs


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers

In [None]:
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split

In [None]:
# Model inputs as arrays: the cleaned headlines and their 0/1 sarcasm labels.
sentences = df['titolo'].values
labels = df['sarcastic'].values

In [None]:
# Name of the pretrained BERT checkpoint whose tokenizer is loaded below.
# NOTE(review): the headlines come from an Italian dataset — confirm that this
# checkpoint is the intended one for Italian text.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME,do_lower_case = True)

def encoder(sentences, max_length=16):
    """Encode sentences into fixed-length lists of BERT token ids.

    Each sentence is passed to the module-level `tokenizer`, truncated or
    padded to exactly `max_length` ids (special tokens included).

    Args:
        sentences: iterable of strings to encode.
        max_length: length of every returned id list. Defaults to 16, the
            value previously hard-coded here.

    Returns:
        A list with one `input_ids` list per input sentence, in input order.
        Token-type ids and attention masks are not requested from the
        tokenizer and are not returned.
    """
    ids = []
    for sentence in sentences:
        encoding = tokenizer.encode_plus(
            sentence,
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; the resulting padding is the same.
            padding='max_length',
            return_attention_mask=False)
        ids.append(encoding['input_ids'])
    return ids

# Train/test split (15% held out). A fixed random_state makes the split, and
# therefore every score computed from it, reproducible across runs.
train_sents, test_sents, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.15, random_state=42)

# Encode both partitions into fixed-length token-id lists.
train_ids = encoder(train_sents)
test_ids = encoder(test_sents)

In [None]:
# Train/test split (15% held out), seeded so the partition is reproducible.
train_sents1, test_sents1, train_labels1, test_labels1 = train_test_split(
    sentences, labels, test_size=0.15, random_state=42)

train_ids = encoder(train_sents1)
test_ids = encoder(test_sents1)

# This cell replaces train_ids/test_ids, so the label arrays used for training
# and evaluation must come from the same split. Previously train_labels and
# test_labels still belonged to the earlier, different random split, which
# paired every encoded headline with the label of some other headline.
train_labels, test_labels = train_labels1, test_labels1

In [None]:
# Convert ids and labels to TensorFlow tensors, training pair first and then
# the held-out pair. Each name is rebound to its tensor version.
train_ids = tf.convert_to_tensor(train_ids)
train_labels = tf.convert_to_tensor(train_labels)

test_ids = tf.convert_to_tensor(test_ids)
test_labels = tf.convert_to_tensor(test_labels)

In [None]:
# Binary classifier on top of BERT.
# The encoder is loaded from PRE_TRAINED_MODEL_NAME, the same constant the
# tokenizer is built from, so the two cannot drift apart if the checkpoint
# name is changed in one place only.
bert_encoder = TFBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# 16 ids per example, matching the length produced by encoder().
input_word_ids = tf.keras.Input(shape=(16,), dtype=tf.int32, name="input_word_ids")
embedding = bert_encoder([input_word_ids])
# Take the vector at sequence position 0 of the encoder's first output.
dense = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(embedding[0])
dense = tf.keras.layers.Dense(128, activation='relu')(dense)
dense = tf.keras.layers.Dropout(0.2)(dense)
# Single sigmoid unit: the output is a score in (0, 1) for the binary label.
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.Model(inputs=[input_word_ids], outputs=output)

In [None]:
# Adam with a learning rate of 1e-5, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 8 epochs in batches of 32; the held-out split is passed as
# validation data, so its loss and accuracy are recorded after every epoch.
history = model.fit(
    train_ids,
    train_labels,
    batch_size=32,
    epochs=8,
    verbose=1,
    validation_data=(test_ids, test_labels),
)

*The model is clearly overfitting, most likely because the dataset is small.*

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart over epochs.

    Args:
        history: object whose `.history` mapping holds one series per metric
            (as returned by `model.fit`).
        string: metric name; the series `string` and `'val_' + string` are
            both drawn on the same axes.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs/Iterations")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# Learning curves: accuracy first, then loss.
for metric in ('accuracy', 'loss'):
    plot_graphs(history, metric)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# Final evaluation on the held-out split; `results` holds the values returned
# by model.evaluate (labelled below as test loss and test accuracy).
print("Evaluate on test data")
results = model.evaluate(test_ids, test_labels, batch_size=128)
# The closing parenthesis was missing here, which made the cell a SyntaxError.
print("test loss, test acc:", results)

*Thanks for watching*