In [2]:
from gensim.models import KeyedVectors
import statistics

# Load the pre-trained Word2Vec model (about 3.6 GB in size).
# The path has no directory component, so the file is looked up relative to
# the current working directory; binary=True selects the binary word2vec format.
# The scoring functions in the following cells read this module-level `model`.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
# Built once when the cell runs instead of on every call, and stored as a
# frozenset so each membership test is a constant-time hash lookup.
# (The previous list also contained 'with' twice.)
_STOP_WORDS = frozenset([
    'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
    'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
    'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only', 'same', 'so', 'than', 'too',
    'very', 's', 't', 'can', 'will', 'just', 'should', 'now', 'without',
])


def is_stop_word(word):
    """Tell whether ``word`` is a stop word that the scoring functions skip.

    The comparison is exact and case-sensitive; the stop words are stored in
    lower case, so callers are expected to pass lower-cased tokens.

    Args:
        word (str): A single token.

    Returns:
        bool: True if ``word`` is in the stop-word set, False otherwise.
    """
    return word in _STOP_WORDS

In [4]:
def cleanup_word(word):
    """Remove every '.' and '-' from ``word``, then trim surrounding whitespace.

    Args:
        word (str): A raw token taken from a poem.

    Returns:
        str: The token without periods or hyphens and without leading or
        trailing whitespace. Other punctuation is left untouched.
    """
    # A single translation table deletes both characters in one pass.
    without_dots_and_hyphens = word.translate(str.maketrans("", "", ".-"))
    return without_dots_and_hyphens.strip()

In [5]:
def get_seasonality_score(poem):
    """Return the strongest similarity between any poem word and a season name.

    The poem is split on whitespace and each token is normalised with
    ``cleanup_word``. Tokens that are stop words or that are missing from the
    embedding ``model`` are ignored, as is any season word missing from it.

    Args:
        poem (str): The poem text.

    Returns:
        The largest value returned by ``model.similarity`` over all
        (poem word, season word) pairs, or -1 if no pair could be compared.
    """
    # https://mainichi.jp/english/articles/20210531/p2a/00m/0su/032000c
    season_words = ['winter', 'spring', 'summer','autumn']
    best_score = -1
    for token in poem.split():
        candidate = cleanup_word(token)
        for season in season_words:
            # Skip pairs that cannot be scored.
            if candidate not in model:
                continue
            if season not in model:
                continue
            if is_stop_word(candidate):
                continue
            best_score = max(model.similarity(candidate, season), best_score)
    return best_score

In [6]:
def get_human_reference_score(poem):
    """Score how strongly a poem refers to people, using pronoun similarity.

    Every whitespace-separated token is normalised with ``cleanup_word`` and
    given the highest ``model.similarity`` it reaches against a fixed list of
    personal pronouns. Tokens that are stop words or missing from ``model``
    contribute 0. The result is the mean of these per-token values.

    Args:
        poem (str): The poem text.

    Returns:
        The mean per-token score, or 0.0 when the poem contains no tokens
        (previously this case raised ``statistics.StatisticsError``).
    """
    poem_words = poem.split()
    if not poem_words:
        # Nothing to average; statistics.mean() would raise on an empty list.
        return 0.0

    me_words = ['i', 'me', 'my','mine', 'he','she',
                'they','our','his','her','him','their','we']
    # Whether a pronoun is in the vocabulary does not depend on the poem
    # word, so resolve it once instead of once per (word, pronoun) pair.
    known_me_words = [me_word for me_word in me_words if me_word in model]

    human_reference_scores = []
    for raw_word in poem_words:
        word = cleanup_word(raw_word)
        max_similarity = 0
        if word in model and not is_stop_word(word):
            for me_word in known_me_words:
                max_similarity = max(max_similarity, model.similarity(word, me_word))
        # Skipped tokens are still appended (as 0) so they count in the mean.
        human_reference_scores.append(max_similarity)
    return statistics.mean(human_reference_scores)

In [7]:
def get_natureness_score(poem):
    """Score how nature-related a poem is, using similarity to nature words.

    Every whitespace-separated token is normalised with ``cleanup_word``.
    Each token that is in ``model`` and is not a stop word is compared with
    every nature reference word that is in ``model``; the result is the mean
    of all those ``model.similarity`` values.

    Args:
        poem (str): The poem text.

    Returns:
        The mean similarity over all scored (poem word, nature word) pairs,
        or 0.0 when no pair could be scored (previously this case raised
        ``statistics.StatisticsError`` and aborted the whole batch).
    """
    poem_words = poem.split()
    if not poem_words:
        # An empty poem has nothing to compare.
        return 0.0

    nature_words = ['nature', 'wild', 'bird', 'tree', 'river', 'leaf',
                    'mountain', 'flower','plant', 'wind', 'rain', 'forest']
    # Loop-invariant: resolve vocabulary membership of the reference words once.
    known_nature_words = [nature_word for nature_word in nature_words if nature_word in model]

    natureness_scores = []
    for raw_word in poem_words:
        word = cleanup_word(raw_word)
        if word not in model or is_stop_word(word):
            continue
        for nature_word in known_nature_words:
            natureness_scores.append(model.similarity(word, nature_word))

    if not natureness_scores:
        # Every token was a stop word or out of vocabulary.
        return 0.0
    return statistics.mean(natureness_scores)

In [8]:
def get_average(word_vectors):
    """Compute the component-wise mean of a non-empty sequence of vectors.

    Args:
        word_vectors: A non-empty sequence of equal-length numeric sequences.
            The length of the first vector fixes the length of the result.

    Returns:
        list: One averaged value per component.

    Raises:
        IndexError: If ``word_vectors`` is empty.
    """
    dimension = len(word_vectors[0])
    vector_count = len(word_vectors)

    totals = [0] * dimension
    for vector in word_vectors:
        for position, component in enumerate(vector):
            totals[position] += component

    return [total / vector_count for total in totals]

In [9]:
def split_and_get_average_vector(poem):
    """Return the mean embedding of a poem's content words as a list.

    The poem is split on whitespace and each token is normalised with
    ``cleanup_word``. Only tokens that are in ``model`` and are not stop
    words contribute to the average.

    Args:
        poem (str): The poem text.

    Returns:
        list: The component-wise mean of the selected word vectors. When no
        token qualifies, a zero vector of length ``model.vector_size`` is
        returned so every poem yields a feature vector of the same width
        (previously this case raised ``IndexError`` inside ``get_average``).
    """
    word_vectors = []
    for word in poem.split():
        processed_word = cleanup_word(word)
        if processed_word in model and not is_stop_word(processed_word):
            word_vectors.append(model[processed_word])

    if not word_vectors:
        # Keep the feature width constant for the downstream classifier.
        return [0.0] * model.vector_size

    return get_average(word_vectors)

In [10]:
# Humor detection
from transformers import pipeline

# Load a text-classification pipeline (requested via the "sentiment-analysis"
# task name), not a text-generation model. Model weights come from
# "shivapbhusal/haiku_humor" and the tokenizer from "distilbert-base-uncased".
# Calling detect_humor(text) is used below as returning a list whose first
# item is a dict with "label" and "score" keys.
detect_humor = pipeline(model="shivapbhusal/haiku_humor",task="sentiment-analysis",tokenizer="distilbert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [11]:
def get_attributes_for_all_poems(data_file, encoding="utf-8"):
    """Compute the feature attributes for every poem stored in a text file.

    Poems in the file are separated by blank lines. Each poem is joined onto
    a single line and lower-cased before scoring.

    Args:
        data_file: Path of the text file containing the poems.
        encoding (str): Text encoding used to read the file. Defaults to
            UTF-8 so that non-ASCII characters are decoded the same way on
            every platform instead of depending on the locale default.

    Returns:
        list[dict]: One dictionary per poem with the keys ``poem``,
        ``humor_score``, ``natureness_score``, ``seasonality_score``,
        ``human_reference_score`` and ``average_vector``. Empty when the
        file contains no poems.
    """
    with open(data_file, 'r', encoding=encoding) as file:
        data = file.read()

    poems = [poem.strip() for poem in data.split('\n\n') if poem.strip()]
    attributes = []  # List of dictionaries, one per poem.
    for poem in poems:
        poem_one_liner = " ".join(poem.splitlines()).lower()

        # Get Humor Score
        humor_content = detect_humor(poem_one_liner)
        # NOTE(review): the label is only tested for truthiness, so whenever it
        # is a non-empty string the `else` branch never runs and the raw score
        # is used as-is. Confirm which labels the model emits before relying on
        # the `1 - score` branch.
        humor_score = humor_content[0]["score"] if humor_content[0]["label"] else 1 - humor_content[0]["score"]

        attributes.append({"poem": poem_one_liner,
                           "humor_score": humor_score,
                           "natureness_score": get_natureness_score(poem_one_liner),
                           "seasonality_score": get_seasonality_score(poem_one_liner),
                           "human_reference_score": get_human_reference_score(poem_one_liner),
                           "average_vector": split_and_get_average_vector(poem_one_liner)})
    return attributes

In [12]:
# Build the per-poem attribute dictionaries for both corpora. Each call reads
# the whole file and runs the humor pipeline and the embedding-based scores on
# every poem in it. The paths are relative to the current working directory.
senryu_data = get_attributes_for_all_poems('data/senryu.txt')
haiku_data = get_attributes_for_all_poems('data/haiku.txt')

In [13]:
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.exceptions import FitFailedWarning
import warnings

# To suppress the warning and show only relevant messages
warnings.simplefilter('ignore', FitFailedWarning)

# One seed shared by the train/test split and the classifier so that a fresh
# "Restart & Run All" reproduces the same split and the same probabilities.
RANDOM_SEED = 43


def build_feature_rows(poem_attributes):
    """Turn attribute dictionaries into flat feature rows for the classifier.

    Each row holds the four scalar scores (humor, natureness, seasonality,
    human reference) followed by the components of the averaged embedding.

    Args:
        poem_attributes: Iterable of dictionaries as produced by
            ``get_attributes_for_all_poems``.

    Returns:
        list[list]: One feature row per poem, in input order.
    """
    return [[attribute["humor_score"],
             attribute["natureness_score"],
             attribute["seasonality_score"],
             attribute["human_reference_score"]] + attribute["average_vector"]
            for attribute in poem_attributes]


# Prepare your data
senryu_metrics = build_feature_rows(senryu_data)
haiku_metrics = build_feature_rows(haiku_data)

training_set = senryu_metrics + haiku_metrics
labels = [0] * len(senryu_metrics) + [1] * len(haiku_metrics)  # 0 for senryu, 1 for haiku

# Split the data into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(training_set, labels, test_size=0.2, random_state=RANDOM_SEED)

# Define the parameter grid with conditional parameters
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']},
    {'kernel': ['poly'], 'C': [0.1, 1, 10, 100], 'degree': [2, 3, 4], 'gamma': ['scale', 'auto']},
    {'kernel': ['sigmoid'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']}
]

# Set up the SVM model.
# probability=True enables predict_proba; its calibration shuffles the data
# internally, so a fixed random_state is needed for repeatable probabilities.
# The class predictions themselves do not depend on it.
svc = svm.SVC(probability=True, random_state=RANDOM_SEED)

# Apply GridSearchCV
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy')

# Train the model using Grid Search
grid_search.fit(X_train, y_train)

# Get the best parameters and classifier
best_params = grid_search.best_params_
best_clf = grid_search.best_estimator_

# Predict on the test data
y_pred = best_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Best parameters: {best_params}")
print(f"Accuracy: {accuracy * 100}%")

# Detailed classification report
print(classification_report(y_test, y_pred))

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 75.98425196850394%
              precision    recall  f1-score   support

           0       0.75      0.77      0.76       252
           1       0.77      0.75      0.76       256

    accuracy                           0.76       508
   macro avg       0.76      0.76      0.76       508
weighted avg       0.76      0.76      0.76       508



In [44]:
# Four sample poems, each already written on a single line, to classify with
# the trained model.
poem_kerouac = "birds singing in the dark -- rainy dawn"
poem_rotella = "She running for office — for the first time my neighbor waves"
poem_virgilio = "autumn twilight:the wreath on the door lifts in the wind"
poem_swede = "alone at last i wonder where everyone is"

poems = [poem_kerouac, poem_rotella, poem_virgilio, poem_swede]
attributes = []

# Build one feature row per poem. The feature order must match the rows the
# classifier was fitted on: humor, natureness, seasonality, human reference,
# then the averaged embedding components.
for poem in poems:
    # Lower-case the text, as is done for the training poems.
    poem_one_liner = poem.lower()
    
    # Get Humor Score
    humor_content = detect_humor(poem_one_liner)
    # NOTE(review): the label is only tested for truthiness; if it is always a
    # non-empty string the `else` branch never runs. Confirm the model's labels.
    humor_score = humor_content[0]["score"] if humor_content[0]["label"] else 1 - humor_content[0]["score"]
        
    # Get Natureness Score
    natureness_score = get_natureness_score(poem_one_liner)
        
    # Get Seasonality Score
    seasonality_score = get_seasonality_score(poem_one_liner)
        
    # Get Human Reference Score
    human_reference_score = get_human_reference_score(poem_one_liner)
        
    # Get General Score
    average_vector = split_and_get_average_vector(poem_one_liner)
        
    attributes.append([humor_score, natureness_score, seasonality_score, human_reference_score] + average_vector)

# Predict class labels (the training cell labels senryu as 0 and haiku as 1)
prediction = best_clf.predict(attributes)

# Predict probabilities (likelihood for each class, one row per poem;
# presumably column 0 is senryu and column 1 is haiku — verify with best_clf.classes_)
probabilities = best_clf.predict_proba(attributes)

# Print predictions and probabilities
print("Predictions:", prediction)
print("Probabilities:", probabilities)
# Show only the four scalar scores of each row; the embedding part is omitted.
print([attribute[0:4] for attribute in attributes])

Predictions: [1 0 1 0]
Probabilities: [[0.19117294 0.80882706]
 [0.88964181 0.11035819]
 [0.26385553 0.73614447]
 [0.84589984 0.15410016]]
[[0.022747457027435303, 0.16689484, 0.46266505, 0.10701379], [0.9091053009033203, 0.09049404, 0.36718187, 0.27351004], [0.022456109523773193, 0.14482306, 1.0, 0.08248165], [0.22774267196655273, 0.069834016, 0.3275469, 0.32577872]]
