In [1]:
import pandas as pd
import numpy as np
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import wordnet as wn, cmudict

import gensim
from gensim.models import Word2Vec

import re
import math
from tqdm import tqdm

import warnings
warnings.filterwarnings(action = 'ignore')

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('cmudict')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [2]:
# Read the dataset
dataset = pd.read_csv(r"/content/drive/MyDrive/CSC791:NLP/project/dataset.csv")
# Separate the X(jokes) and the Y(is_humor or not)

jokes = list(dataset['text'])
labels = list(dataset['humor'])

In [3]:
def clean_data(jokes):
    """
    """
    PATTERN = r'[^A-Za-z0-9\s]'

    words_list = []
    for joke in jokes:
        joke = joke.lower()
        processed_joke = re.sub(PATTERN, '', joke)
        words = processed_joke.split(' ')
        words_list.append(words)

    return words_list

In [4]:
words_list = clean_data(jokes)
len(words_list)

200000

In [5]:
cbow = Word2Vec(words_list, min_count = 1,
                              vector_size = 10, window = 5)

In [None]:
skip_gram = Word2Vec(words_list, min_count = 1, vector_size = 100,
                                             window = 5, sg = 1)

### Incongruity

In [None]:
max_threshold = 1  # Set maximum threshold as 1 because distance of word from itself will be 1
disconnection_list = []
repetition_list = []
for sentence in tqdm(words_list, desc="Processing Sentences"):
    sentence_word_distances = []
    for i in range(len(sentence)):
        for j in range(i + 1, len(sentence)):
            distance = cbow.wv.similarity(sentence[i], sentence[j])
            if distance < max_threshold:
                sentence_word_distances.append(distance)
    # Check if the sentence_word_distances list is not empty before calculating disconnection and repetition
    if sentence_word_distances:
        disconnection = max(sentence_word_distances)
        repetition = min(sentence_word_distances)
    else:
        disconnection = None
        repetition = None
    disconnection_list.append(disconnection)
    repetition_list.append(repetition)

Processing Sentences: 100%|██████████| 200000/200000 [04:41<00:00, 710.13it/s]


In [None]:
def incongruity(word2vec_model, words_list):
  max_threshold = 1  # Set maximum threshold as 1 because distance of word from itself will be 1
  disconnection_list = []
  repetition_list = []
  for sentence in tqdm(words_list, desc="Processing Sentences"):
      sentence_word_distances = []
      for i in range(len(sentence)):
          for j in range(i + 1, len(sentence)):
              distance = word2vec_model.wv.similarity(sentence[i], sentence[j])
              if distance < max_threshold:
                  sentence_word_distances.append(distance)
      # Check if the sentence_word_distances list is not empty before calculating disconnection and repetition
      if sentence_word_distances:
          disconnection = max(sentence_word_distances)
          repetition = min(sentence_word_distances)
      else:
          disconnection = None
          repetition = None
      disconnection_list.append(disconnection)
      repetition_list.append(repetition)
  return disconnection_list, repetition_list

In [None]:
disconnection_list, repetition_list = incongruity(cbow, words_list)

Processing Sentences: 100%|██████████| 200000/200000 [04:53<00:00, 680.77it/s]


In [None]:
incongruity_df = pd.DataFrame({'Disconnection': disconnection_list,
    'Repetition': repetition_list
})

In [None]:
incongruity_df.to_csv('/content/drive/MyDrive/CSC791:NLP/project/incongruity_features.csv')

##### Normalize us option

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


train_data_X, test_data_X, train_data_Y, test_data_Y = train_test_split(incongruity_df, labels, test_size=0.7)
model = DecisionTreeClassifier()
model.fit(train_data_X, train_data_Y)
model.score(train_data_X, train_data_Y)

0.9933166666666666

In [None]:
preds = model.predict(test_data_X)
print(classification_report(preds, test_data_Y))


              precision    recall  f1-score   support

       False       0.63      0.64      0.64     68795
        True       0.65      0.64      0.65     71205

    accuracy                           0.64    140000
   macro avg       0.64      0.64      0.64    140000
weighted avg       0.64      0.64      0.64    140000



### Ambiguity
#### 3 Features: Sense combination, closest path similarity, farmos path similarity

In [None]:
# Get POS TAGS for all sentences in words_list
tagged_sentences = []
for sentence in tqdm(words_list):
  tagged_sentence = pos_tag(sentence)
  tagged_sentences.append(tagged_sentence)

100%|██████████| 200000/200000 [03:25<00:00, 974.38it/s]


In [None]:
# Separate lists of POS and append in respective lists
# Noun -> NN/NNS, Verb -> VB, Adjective -> JJ, Adverb -> RP/RB, Numeric -> CD, Determiner -> DT | however D and C do not work in WordNet
pos_tagged_sentences = []
for tagged_words in tqdm(tagged_sentences):
  pos_words = {'NOUN': [], 'VERB': [], 'ADJ': [], 'ADV': [], 'DET': [], 'NUM': []}
  for word, pos in tagged_words:
    if pos.startswith('N'):
        pos_words['NOUN'].append(word)
    elif pos.startswith('V'):
        pos_words['VERB'].append(word)
    elif pos.startswith('J'):
        pos_words['ADJ'].append(word)
    elif pos.startswith('R'):
        pos_words['ADV'].append(word)
  pos_tagged_sentences.append(pos_words)


100%|██████████| 200000/200000 [00:04<00:00, 42905.36it/s]


In [None]:
# Function to get Sentences along with POS Tags
def get_pos_tagged_sentences(words_list):
    """
    """
    tagged_sentences = []
    pos_tagged_sentences = []

    print(f"Getting POS Tags for each sentence")
    for sentence in tqdm(words_list):
        tagged_sentence = pos_tag(sentence)
        tagged_sentences.append(tagged_sentence)

    print(f"Getting POS Tags Lists each sentence")
    for tagged_words in tqdm(tagged_sentences):
        pos_words = {'NOUN': [], 'VERB': [], 'ADJ': [], 'ADV': [], 'DET': [], 'NUM': []}
        for word, pos in tagged_words:
            if pos.startswith('N'):
                pos_words['NOUN'].append(word)
            elif pos.startswith('V'):
                pos_words['VERB'].append(word)
            elif pos.startswith('J'):
                pos_words['ADJ'].append(word)
            elif pos.startswith('R'):
                pos_words['ADV'].append(word)
        pos_tagged_sentences.append(pos_words)

    return pos_tagged_sentences

In [None]:
pos_tagged_sentences = get_pos_tagged_sentences(words_list)

Getting POS Tags for each sentence


100%|██████████| 200000/200000 [03:51<00:00, 864.49it/s]


Getting POS Tags Lists each sentence


100%|██████████| 200000/200000 [00:06<00:00, 32802.23it/s]


In [None]:
sense_combination = 0
for pos, words in pos_tagged_sentences[2].items():
  for word in words:
    synsets = wn.synsets(word, pos=pos[0].lower())
    if synsets:
      num_senses = len(synsets)
      sense_combination += math.log(num_senses)
      # print(sense_combination)
sense_combination = math.exp(sense_combination)
print(sense_combination)

728.0


In [None]:
# Calculate closest and farmost sense of a given word from WordNet SYNSETS by finding all other senses
path_similarities = []
for words in pos_tagged_sentences[2].values():
  for word in words:
    synsets = wn.synsets(word)
    if synsets:
      # for each sense of same word, find similarity
      for synset in synsets:
        # Compare the similarity of our word sense with other word senses (of same word)
        similarities = [synset.path_similarity(other) for other in synsets if other != synset and other.path_similarity(synset)]
        if similarities:
          path_similarities.extend(similarities)

sense_farmost = max(path_similarities) if path_similarities else None
sense_closest = min(path_similarities) if path_similarities else None


0.3333333333333333
0.043478260869565216


In [None]:
# Get sense combiination score
def sense_comination(pos_tagged_sentences):
    sense_combination_list = []
    for sentence in tqdm(pos_tagged_sentences):
        sense_combination = 0
        for pos, words in sentence.items():
            for word in words:
                synsets = wn.synsets(word, pos=pos[0].lower())
                if synsets:
                    num_senses = len(synsets)
                    sense_combination += math.log(num_senses) ##################### CHECK FORMULA ###################
        sense_combination = math.exp(sense_combination)
        sense_combination_list.append(sense_combination)

    return sense_combination_list

sense_combination_score = sense_comination(pos_tagged_sentences)
len(sense_combination_score)

100%|██████████| 200000/200000 [00:30<00:00, 6559.97it/s] 


200000

In [None]:
ambiguity_features_scs_df = pd.DataFrame({'SenseCombScore': sense_combination_score})

ambiguity_features_scs_df.to_csv('/content/drive/MyDrive/CSC791:NLP/project/ambiguity_features_scs.csv')

In [None]:
# # Get Sense Farmost and Closest Path Similarity
# sense_farmost_list = []
# sense_closest_list = []

# for sentence in tqdm(pos_tagged_sentences):
#   path_similarities = []
#   for words in sentence.values():
#     for word in words:
#       synsets = wn.synsets(word)
#       if synsets:
#         # for each sense of same word, find similarity
#         for synset in synsets:
#           # Compare the similarity of our word sense with other word senses (of same word)
#           similarities = [synset.path_similarity(other) for other in synsets if other != synset and other.path_similarity(synset)]
#           if similarities:
#             path_similarities.extend(similarities)
#   sense_farmost = max(path_similarities) if path_similarities else None
#   sense_closest = min(path_similarities) if path_similarities else None
#   sense_farmost_list.append(sense_farmost)
#   sense_closest_list.append(sense_closest)

### Phonetic Style

### For all

In [7]:
import collections
d = cmudict.dict()

# Function to get Phonetic representations of word
def get_phonemes(word):
    """
    Get phonetic representation of a word from CMU Pronouncing Dictionary
    """
    return d[word][0] if word in d else None


def get_alliteration_rhyme_chains(words):
    """
    Returns
    For exploration
    ---------
    start_phenome : dictionary containing all the starting phonemes
    end_phenome : dictionary containing all the ending phonemes
    ---------

    The features
    -----
    len(start_phenome.values()) : the total number of starting phonemes -> number of alliteration chains
    len(end_phenome.values()) : the total number of ending phonemes -> number of rhyme chains
    max(start_phenome.values()) : The max values of the alliteration string -> maximum length of alliteration chains
    max(end_phenome.values()) : The max values of the alliteration string -> maximum length of rhyme chains
    ------
    """
    start_phenome = collections.defaultdict(int)
    end_phenome = collections.defaultdict(int)
    for word in words:
        phonemes = get_phonemes(word)
        if phonemes:
            start_phenome[phonemes[0]]+=1
            end_phenome[phonemes[-1]]+=1
    if start_phenome and end_phenome:
        return start_phenome, end_phenome, len(start_phenome.values()), len(end_phenome.values()), max(start_phenome.values()), max(end_phenome.values())
    elif start_phenome:
        return start_phenome, end_phenome, len(start_phenome.values()), len(end_phenome.values()), max(start_phenome.values()), 0
    elif start_phenome:
        return start_phenome, end_phenome, len(start_phenome.values()), len(end_phenome.values()), 0, max(end_phenome.values())
    else:
        return start_phenome, end_phenome, len(start_phenome.values()), len(end_phenome.values()), 0, 0



In [8]:
phonetic_style_features = []
sps = []
eps = []
for sentence in tqdm(jokes):
    sp, ep, num_alliteration_chains, num_rhyme_chains, max_all_chain, max_rhyme_chain = get_alliteration_rhyme_chains(sentence.split(' '))
    sps.append(sp)
    eps.append(ep)
    phonetic_style_features.append((num_alliteration_chains, num_rhyme_chains, max_all_chain, max_rhyme_chain))

100%|██████████| 200000/200000 [00:04<00:00, 49908.41it/s]


In [10]:
len(phonetic_style_features)

200000

In [11]:
# alliteration, max_alliteration, rhyme, max_rhyme
column_names = ['num_alliteration_chains', 'num_rhyme_chains', 'max_all_chain', 'max_rhyme_chain']
ps_df = pd.DataFrame(phonetic_style_features, columns=column_names)
ps_df

Unnamed: 0,num_alliteration_chains,num_rhyme_chains,max_all_chain,max_rhyme_chain
0,5,4,1,2
1,5,6,2,1
2,7,5,1,2
3,6,5,1,2
4,6,7,2,2
...,...,...,...,...
199995,5,4,1,2
199996,8,6,2,3
199997,9,9,2,2
199998,7,7,1,1


In [12]:
# Get unique values using set()
unique_values = list(set(ps_df['num_alliteration_chains']))

# Print the unique values
print("Unique values:", unique_values)

Unique values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [13]:
# Get unique values using set()
unique_values = list(set(ps_df['num_rhyme_chains']))

# Print the unique values
print("Unique values:", unique_values)

Unique values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [14]:
# Get unique values using set()
unique_values = list(set(ps_df['max_all_chain']))

# Print the unique values
print("Unique values:", unique_values)

Unique values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11]


In [15]:
# Get unique values using set()
unique_values = list(set(ps_df['max_rhyme_chain']))

# Print the unique values
print("Unique values:", unique_values)

Unique values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [16]:
ps_df.to_csv('/content/drive/MyDrive/CSC791:NLP/project/phonetic_features.csv')

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


train_data_X, test_data_X, train_data_Y, test_data_Y = train_test_split(ps_df, labels, test_size=0.7)
model = DecisionTreeClassifier()
model.fit(train_data_X, train_data_Y)
model.score(train_data_X, train_data_Y)

0.6426833333333334

In [18]:
preds = model.predict(test_data_X)
print(classification_report(preds, test_data_Y))

              precision    recall  f1-score   support

       False       0.77      0.60      0.68     89176
        True       0.50      0.68      0.57     50824

    accuracy                           0.63    140000
   macro avg       0.63      0.64      0.63    140000
weighted avg       0.67      0.63      0.64    140000



In [19]:
ambiguity_features_scs_df = pd.read_csv('/content/drive/MyDrive/CSC791:NLP/project/ambiguity_features_scs.csv')
ambiguity_features_pthsim_df = pd.read_csv('/content/drive/MyDrive/CSC791:NLP/project/ambiguity_features_pths.csv')
incngruity_features_df = pd.read_csv('/content/drive/MyDrive/CSC791:NLP/project/incongruity_features.csv')
phonetic_features_df = pd.read_csv('/content/drive/MyDrive/CSC791:NLP/project/phonetic_features.csv')

In [21]:
semantic_features = pd.DataFrame({
    'disconnection': incngruity_features_df['Disconnection'],
    'repetition': incngruity_features_df['Repetition'],
    'sense_combination_score': ambiguity_features_scs_df['SenseCombScore'],
    'farmost_path': ambiguity_features_pthsim_df['farmost_path'],
    'closest_path': ambiguity_features_pthsim_df['closest_path'],
    'alliteration': phonetic_features_df['num_alliteration_chains'],
    'max_alliteration': phonetic_features_df['max_all_chain'],
    'rhyme': phonetic_features_df['num_rhyme_chains'],
    'max_rhyme': phonetic_features_df['max_rhyme_chain']
})
semantic_features

Unnamed: 0,disconnection,repetition,sense_combination_score,farmost_path,closest_path,alliteration,max_alliteration,rhyme,max_rhyme
0,0.941129,-0.256623,59040.0,0.500000,0.050000,5,1,4,2
1,0.826847,-0.003797,31680.0,0.500000,0.045455,5,2,6,1
2,0.818388,-0.071782,123760.0,0.333333,0.043478,7,1,5,2
3,0.965455,0.019324,480.0,0.333333,0.062500,6,1,5,2
4,0.913695,-0.570869,880.0,0.500000,0.047619,6,2,7,2
...,...,...,...,...,...,...,...,...,...
199995,0.957222,-0.028747,378.0,0.500000,0.050000,5,1,4,2
199996,0.868705,-0.034454,8820.0,0.500000,0.050000,8,2,6,3
199997,0.947203,-0.125161,2730.0,0.500000,0.041667,9,2,9,2
199998,0.826061,-0.173908,40.0,0.333333,0.058824,7,1,7,1


In [22]:
semantic_features.to_csv('/content/drive/MyDrive/CSC791:NLP/project/semantic_features_scs.csv')

In [None]:
# Get unique values using set()
unique_values = set(semantic_features['max_rhyme'].to_list())

# Print the unique values
print("Unique values:", unique_values)

In [None]:
# Get unique values using set()
unique_values = set(semantic_features['max_alliteration'].to_list())

# Print the unique values
print("Unique values:", unique_values)

Unique values: {0, 1, 2, 3, 4}


In [None]:
# Get unique values using set()
unique_values = set(semantic_features['rhyme'].to_list())

# Print the unique values
print("Unique values:", unique_values)

Unique values: {0, 1, 2, 3, 4, 5}


In [None]:
# Get unique values using set()
unique_values = set(semantic_features['alliteration'].to_list())

# Print the unique values
print("Unique values:", unique_values)

Unique values: {0, 1, 2, 3, 4}


In [None]:
# Get unique values using set()
unique_values = set(semantic_features['closest_path'].to_list())

# Print the unique values
print("Unique values:", unique_values)

Unique values: {0.0625, 0.0666666666666666, 0.0714285714285714, 0.25, 0.125, 0.1111111111111111, 0.1428571428571428, 0.3333333333333333, 0.2, 0.0357142857142857, nan, 0.5, nan, nan, nan, nan, nan, 0.05, 0.0434782608695652, 0.0909090909090909, nan, nan, nan, nan, 0.037037037037037, 0.0384615384615384, 0.04, nan, nan, nan, nan, nan, 0.0769230769230769, 0.0555555555555555, 0.0416666666666666, 0.1, 0.1666666666666666, nan, nan, nan, 0.0588235294117647, 0.0526315789473684, nan, nan, nan, nan, nan, nan, 0.0476190476190476, 0.0454545454545454, 0.0344827586206896, 0.0833333333333333, nan, nan}
