In [96]:
import pandas as pd
import numpy as np
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import wordnet as wn, cmudict

import gensim
from gensim.models import Word2Vec

import re
import math
from tqdm import tqdm

import warnings
warnings.filterwarnings(action = 'ignore')

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('cmudict')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [106]:
# Read the dataset
dataset = pd.read_csv(r"/content/drive/MyDrive/CSC791:NLP/project/dataset.csv")
# Separate the X(jokes) and the Y(is_humor or not)

jokes = list(dataset['text'])
labels = list(dataset['humor'])


In [109]:
def clean_data(jokes):
    """
    """
    PATTERN = r'[^A-Za-z0-9\s]'

    words_list = []
    for joke in jokes:
        joke = joke.lower()
        processed_joke = re.sub(PATTERN, '', joke)
        words = processed_joke.split(' ')
        words_list.append(words)

    return words_list


In [110]:
words_list = clean_data(jokes)
len(words_list)


200000

In [113]:
cbow = Word2Vec(words_list, min_count = 1,
                              vector_size = 10, window = 5)


In [None]:
skip_gram = Word2Vec(words_list, min_count = 1, vector_size = 100,
                                             window = 5, sg = 1)


### Incongruity

In [None]:
max_threshold = 1  # Set maximum threshold as 1 because distance of word from itself will be 1
disconnection_list = []
repetition_list = []
for sentence in tqdm(words_list, desc="Processing Sentences"):
    sentence_word_distances = []
    for i in range(len(sentence)):
        for j in range(i + 1, len(sentence)):
            distance = cbow.wv.similarity(sentence[i], sentence[j])
            if distance < max_threshold:
                sentence_word_distances.append(distance)
    # Check if the sentence_word_distances list is not empty before calculating disconnection and repetition
    if sentence_word_distances:
        disconnection = max(sentence_word_distances)
        repetition = min(sentence_word_distances)
    else:
        disconnection = None
        repetition = None
    disconnection_list.append(disconnection)
    repetition_list.append(repetition)


Processing Sentences: 100%|██████████| 200000/200000 [04:41<00:00, 710.13it/s]


In [114]:
def incongruity(word2vec_model, words_list):
  max_threshold = 1  # Set maximum threshold as 1 because distance of word from itself will be 1
  disconnection_list = []
  repetition_list = []
  for sentence in tqdm(words_list, desc="Processing Sentences"):
      sentence_word_distances = []
      for i in range(len(sentence)):
          for j in range(i + 1, len(sentence)):
              distance = word2vec_model.wv.similarity(sentence[i], sentence[j])
              if distance < max_threshold:
                  sentence_word_distances.append(distance)
      # Check if the sentence_word_distances list is not empty before calculating disconnection and repetition
      if sentence_word_distances:
          disconnection = max(sentence_word_distances)
          repetition = min(sentence_word_distances)
      else:
          disconnection = None
          repetition = None
      disconnection_list.append(disconnection)
      repetition_list.append(repetition)
  return disconnection_list, repetition_list


In [115]:
disconnection_list, repetition_list = incongruity(cbow, words_list)


Processing Sentences: 100%|██████████| 200000/200000 [05:15<00:00, 634.91it/s]


In [103]:
incongruity_df = pd.DataFrame({'Disconnection': disconnection_list,
    'Repetition': repetition_list
})


##### Normalize us option

In [104]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


train_data_X, test_data_X, train_data_Y, test_data_Y = train_test_split(incongruity_df, labels, test_size=0.7)
model = DecisionTreeClassifier()
model.fit(train_data_X, train_data_Y)
model.score(train_data_X, train_data_Y)


0.9959833333333333

In [105]:
preds = model.predict(test_data_X)
print(classification_report(preds, test_data_Y))


              precision    recall  f1-score   support

       False       0.63      0.64      0.64     68894
        True       0.65      0.64      0.64     71106

    accuracy                           0.64    140000
   macro avg       0.64      0.64      0.64    140000
weighted avg       0.64      0.64      0.64    140000



### Ambiguity
#### 3 Features: Sense combination, closest path similarity, farmos path similarity

In [15]:
# Get POS TAGS for all sentences in words_list
tagged_sentences = []
for sentence in tqdm(words_list):
  tagged_sentence = pos_tag(sentence)
  tagged_sentences.append(tagged_sentence)


100%|██████████| 200000/200000 [03:59<00:00, 836.37it/s]


In [65]:
# Separate lists of POS and append in respective lists
# Noun -> NN/NNS, Verb -> VB, Adjective -> JJ, Adverb -> RP/RB, Numeric -> CD, Determiner -> DT | however D and C do not work in WordNet
pos_tagged_sentences = []
for tagged_words in tqdm(tagged_sentences):
  pos_words = {'NOUN': [], 'VERB': [], 'ADJ': [], 'ADV': [], 'DET': [], 'NUM': []}
  for word, pos in tagged_words:
    if pos.startswith('N'):
        pos_words['NOUN'].append(word)
    elif pos.startswith('V'):
        pos_words['VERB'].append(word)
    elif pos.startswith('J'):
        pos_words['ADJ'].append(word)
    elif pos.startswith('R'):
        pos_words['ADV'].append(word)
  pos_tagged_sentences.append(pos_words)


100%|██████████| 200000/200000 [00:05<00:00, 38119.33it/s]


In [116]:
# Function to get Sentences along with POS Tags
def get_pos_tagged_sentences(words_list):
    """
    """
    tagged_sentences = []
    pos_tagged_sentences = []

    print(f"Getting POS Tags for each sentence")
    for sentence in tqdm(words_list):
        tagged_sentence = pos_tag(sentence)
        tagged_sentences.append(tagged_sentence)

    print(f"Getting POS Tags Lists each sentence")
    for tagged_words in tqdm(tagged_sentences):
        pos_words = {'NOUN': [], 'VERB': [], 'ADJ': [], 'ADV': [], 'DET': [], 'NUM': []}
        for word, pos in tagged_words:
            if pos.startswith('N'):
                pos_words['NOUN'].append(word)
            elif pos.startswith('V'):
                pos_words['VERB'].append(word)
            elif pos.startswith('J'):
                pos_words['ADJ'].append(word)
            elif pos.startswith('R'):
                pos_words['ADV'].append(word)
        pos_tagged_sentences.append(pos_words)

    return pos_tagged_sentences


In [117]:
pos_tagged_sentences = get_pos_tagged_sentences(words_list)


100%|██████████| 200000/200000 [03:30<00:00, 951.45it/s] 
100%|██████████| 200000/200000 [00:04<00:00, 48988.35it/s]


In [71]:
sense_combination = 0
for pos, words in pos_tagged_sentences[2].items():
  for word in words:
    synsets = wn.synsets(word, pos=pos[0].lower())
    if synsets:
      num_senses = len(synsets)
      sense_combination += math.log(num_senses)
      # print(sense_combination)
sense_combination = math.exp(sense_combination)
print(sense_combination)


728.0


In [83]:
# Calculate closest and farmost sense of a given word from WordNet SYNSETS by finding all other senses
path_similarities = []
for words in pos_tagged_sentences[2].values():
  for word in words:
    synsets = wn.synsets(word)
    if synsets:
      # for each sense of same word, find similarity
      for synset in synsets:
        # Compare the similarity of our word sense with other word senses (of same word)
        similarities = [synset.path_similarity(other) for other in synsets if other != synset and other.path_similarity(synset)]
        if similarities:
          path_similarities.extend(similarities)

sense_farmost = max(path_similarities) if path_similarities else None
sense_closest = min(path_similarities) if path_similarities else None


0.3333333333333333
0.043478260869565216


In [68]:
# Get a Sense combination score by the FORMULA in paper using WordNet - SYNSET -> Sense Combination Feature
sense_combination_list = []
for sentence in tqdm(pos_tagged_sentences):
  sense_combination = 0
  for pos, words in sentence.items():
    for word in words:
      synsets = wn.synsets(word, pos=pos[0].lower())
      if synsets:
        num_senses = len(synsets)
        sense_combination += math.log(num_senses) ##################### CHECK FORMULA ###################
  sense_combination = math.exp(sense_combination)
  sense_combination_list.append(sense_combination)


100%|██████████| 200000/200000 [00:21<00:00, 9298.80it/s] 


In [94]:
# Get Sense Farmost and Closest Path Similarity
sense_farmost_list = []
sense_closest_list = []

for sentence in tqdm(pos_tagged_sentences):
  path_similarities = []
  for words in sentence.values():
    for word in words:
      synsets = wn.synsets(word)
      if synsets:
        # for each sense of same word, find similarity
        for synset in synsets:
          # Compare the similarity of our word sense with other word senses (of same word)
          similarities = [synset.path_similarity(other) for other in synsets if other != synset and other.path_similarity(synset)]
          if similarities:
            path_similarities.extend(similarities)
  sense_farmost = max(path_similarities) if path_similarities else None
  sense_closest = min(path_similarities) if path_similarities else None
  sense_farmost_list.append(sense_farmost)
  sense_closest_list.append(sense_closest)


 16%|█▌        | 31418/200000 [1:10:59<6:20:56,  7.38it/s]


KeyboardInterrupt: ignored

### Phonetic Style

### For all

In [154]:
d = cmudict.dict()


# Function to get Phonetic representations of word
def get_phonemes(word):
    """
    Get phonetic representation of a word from CMU Pronouncing Dictionary
    """
    return d[word][0] if word in d else None


# Function to get alliteration and rhyme
def detect_alliteration_rhyme(words):
    alliteration_chains = 0
    max_alliteration_chain_length = 0

    rhyme_chains = 0
    max_rhyme_chain_length = 0

    prev_phoneme = None
    alliteration_chain_length = 1
    rhyme_chain_length = 1

    for word in words:
        phonemes = get_phonemes(word)

        if phonemes:
            first_phoneme = phonemes[0]
            last_phoneme = phonemes[-1]

            if prev_phoneme and first_phoneme == prev_phoneme:
                alliteration_chain_length += 1
            else:
                max_alliteration_chain_length = max(max_alliteration_chain_length, alliteration_chain_length)
                if alliteration_chain_length > 1:
                    alliteration_chains += 1
                alliteration_chain_length = 1

            if prev_phoneme and last_phoneme == prev_phoneme:
                rhyme_chain_length += 1
            else:
                max_rhyme_chain_length = max(max_rhyme_chain_length, rhyme_chain_length)
                if rhyme_chain_length > 1:
                    rhyme_chains += 1
                rhyme_chain_length = 1

            prev_phoneme = last_phoneme

    return alliteration_chains, max_alliteration_chain_length, rhyme_chains, max_rhyme_chain_length


# Get Phonetic Style
def phonetic_style(words_list):
  alliteration_list = []
  max_alliteration_list = []
  rhyme_list = []
  max_rhyme_list = []

  for sentence in tqdm(words_list):
    alliteration_chains, max_alliteration_chain_length, rhyme_chains, max_rhyme_chain_length = detect_alliteration_rhyme(sentence)

    alliteration_list.append(alliteration_chains)
    max_alliteration_list.append(max_alliteration_chain_length)
    rhyme_list.append(rhyme_chains)
    max_rhyme_list.append(max_rhyme_chain_length)

  return alliteration_list, max_alliteration_list, rhyme_list, max_rhyme_list


In [155]:
alliteration, max_alliteration, rhyme, max_rhyme = phonetic_style(words_list)


100%|██████████| 200000/200000 [00:05<00:00, 36318.80it/s]


In [156]:
len(rhyme)


200000

In [157]:
# Get unique values using set()
unique_values = list(set(max_rhyme))

# Print the unique values
print("Unique values:", unique_values)


Unique values: [0, 1, 2, 3, 4, 5, 6, 11]


In [158]:
# Get unique values using set()
unique_values = list(set(rhyme))

# Print the unique values
print("Unique values:", unique_values)


Unique values: [0, 1, 2, 3, 4, 5]


In [159]:
# Get unique values using set()
unique_values = list(set(max_alliteration))

# Print the unique values
print("Unique values:", unique_values)


Unique values: [0, 1, 2, 3, 4]


In [160]:
# Get unique values using set()
unique_values = list(set(alliteration))

# Print the unique values
print("Unique values:", unique_values)


Unique values: [0, 1, 2, 3, 4]


In [161]:
# alliteration, max_alliteration, rhyme, max_rhyme
ps_df = pd.DataFrame({'alliteration': alliteration,
    'max_alliteration': max_alliteration,
    'rhyme': rhyme,
    'max_rhyme': max_rhyme
})
ps_df


Unnamed: 0,alliteration,max_alliteration,rhyme,max_rhyme
0,0,1,0,1
1,0,1,0,1
2,0,1,1,2
3,0,1,0,1
4,0,1,0,1
...,...,...,...,...
199995,0,1,0,1
199996,0,1,1,2
199997,1,2,1,2
199998,0,1,0,1


In [162]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


train_data_X, test_data_X, train_data_Y, test_data_Y = train_test_split(ps_df, labels, test_size=0.3)
model = DecisionTreeClassifier()
model.fit(train_data_X, train_data_Y)
model.score(train_data_X, train_data_Y)


0.5510333333333334

In [163]:
preds = model.predict(test_data_X)
print(classification_report(preds, test_data_Y))


              precision    recall  f1-score   support

       False       0.67      0.54      0.60     86933
        True       0.43      0.56      0.49     53067

    accuracy                           0.55    140000
   macro avg       0.55      0.55      0.54    140000
weighted avg       0.58      0.55      0.56    140000

