Data cleaning For Haikugen project.

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
# Text preprocessing and modelling
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors

In [4]:
df = pd.read_csv('dataset/all_haiku.csv')
print(df.head())

   Unnamed: 0                0                      1                 2  \
0           0    fishing boats              colors of       the rainbow   
1           1  ash wednesday--    trying to remember           my dream   
2           2     snowy morn--    pouring another cup   of black coffee   
3           3     shortest day           flames dance       in the oven   
4           4             haze  half the horse hidden  behind the house   

        source                                     hash  
0  tempslibres           FISHINGBOATSCOLORSOFTHERAINBOW  
1  tempslibres      ASHWEDNESDAYTRYINGTOREMEMBERMYDREAM  
2  tempslibres  SNOWYMORNPOURINGANOTHERCUPOFBLACKCOFFEE  
3  tempslibres          SHORTESTDAYFLAMESDANCEINTHEOVEN  
4  tempslibres     HAZEHALFTHEHORSEHIDDENBEHINDTHEHOUSE  


In [5]:

def preprocess_text(text):
  # To ensure text is a string and removing Special Characters
    if isinstance(text, str):
        # Convert text to lowercase
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    else:
        text = np.nan
    
    return text

df['combined_haiku'] = df['0'].astype(str) + ' ' + df['1'].astype(str) + ' ' + df['2'].astype(str)

#preprocessing to each combined haiku
df['processed_haiku'] = df['combined_haiku'].apply(preprocess_text)

# Dropping rows with NaN 
df.dropna(subset=['processed_haiku'], inplace=True)

print(df[['combined_haiku', 'processed_haiku']].head())


                                     combined_haiku  \
0               fishing boats colors of the rainbow   
1      ash wednesday-- trying to remember  my dream   
2  snowy morn-- pouring another cup of black coffee   
3             shortest day flames dance in the oven   
4       haze half the horse hidden behind the house   

                                  processed_haiku  
0             fishing boats colors of the rainbow  
1      ash wednesday trying to remember  my dream  
2  snowy morn pouring another cup of black coffee  
3           shortest day flames dance in the oven  
4     haze half the horse hidden behind the house  


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.85, max_features=5000, stop_words='english')

# Fit and transform the processed haikus into vectors
tfidf_matrix = vectorizer.fit_transform(df['processed_haiku'])

print(tfidf_matrix.shape)



(144123, 5000)


The tfidf matrix is huge, so here i have created a represention of haikus using Word Embeddings. I have used the Word2Vec model from the gensim library.

In [20]:

vectorizer = CountVectorizer(max_features=5000, stop_words='english')
# Fit and transform the processed haikus
word_count_matrix = vectorizer.fit_transform(df['processed_haiku'])
# Convert the word count matrix to a DataFrame
word_count_df = pd.DataFrame(word_count_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Saving to Csv
word_count_df.to_csv('dataset/word_count_matrix.csv', index=False)


In [6]:
# Tokenizing
df['tokenized_haiku'] = df['processed_haiku'].apply(lambda x: x.split())


In [9]:
# I have loaded the Word2vec pretrained model from google
word2vec_model = KeyedVectors.load_word2vec_format('genmi/GoogleNews-vectors-negative300.bin', binary=True)


In [10]:

def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list) < 1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

df['word2vec_vector'] = df['tokenized_haiku'].apply(lambda x: get_average_word2vec(x, word2vec_model))


In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,source,hash,combined_haiku,processed_haiku,tokenized_haiku,word2vec_vector
0,0,fishing boats,colors of,the rainbow,tempslibres,FISHINGBOATSCOLORSOFTHERAINBOW,fishing boats colors of the rainbow,fishing boats colors of the rainbow,"[fishing, boats, colors, of, the, rainbow]","[0.06935628255208333, 0.11063639322916667, 0.0..."
1,1,ash wednesday--,trying to remember,my dream,tempslibres,ASHWEDNESDAYTRYINGTOREMEMBERMYDREAM,ash wednesday-- trying to remember my dream,ash wednesday trying to remember my dream,"[ash, wednesday, trying, to, remember, my, dream]","[0.09874834333147321, 0.00020926339285714285, ..."
2,2,snowy morn--,pouring another cup,of black coffee,tempslibres,SNOWYMORNPOURINGANOTHERCUPOFBLACKCOFFEE,snowy morn-- pouring another cup of black coffee,snowy morn pouring another cup of black coffee,"[snowy, morn, pouring, another, cup, of, black...","[-0.02301025390625, 0.065765380859375, -0.0730..."
3,3,shortest day,flames dance,in the oven,tempslibres,SHORTESTDAYFLAMESDANCEINTHEOVEN,shortest day flames dance in the oven,shortest day flames dance in the oven,"[shortest, day, flames, dance, in, the, oven]","[0.04017857, -0.027413504, 0.06512451, 0.06622..."
4,4,haze,half the horse hidden,behind the house,tempslibres,HAZEHALFTHEHORSEHIDDENBEHINDTHEHOUSE,haze half the horse hidden behind the house,haze half the horse hidden behind the house,"[haze, half, the, horse, hidden, behind, the, ...","[0.11157799, 0.04814911, 0.011920929, 0.004920..."
