In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv("PERC_mendelly.csv")

#df = pd.read_csv("final_trial.csv")
df = df.dropna()

# Preprocess the text
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

df['clean_text'] = df['Poem'].apply(preprocess)

# Split the poem column into individual stanzas
df['stanzas'] = df['Poem'].apply(sent_tokenize)

# Convert emotions column to numerical labels
emotions = ['love', 'sad', 'anger', 'hate', 'fear', 'surprise', 'courage', 'joy', 'peace',"hope",'care']
df['emotion_label'] = df['Emotion'].apply(emotions.index)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['stanzas'], df['emotion_label'], test_size=0.2, random_state=42)

# Extract features from the stanzas using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform([' '.join(stanza) for stanza in X_train])
X_test_tfidf = vectorizer.transform([' '.join(stanza) for stanza in X_test])

# Train a Passive Aggressive Classifier
model = PassiveAggressiveClassifier(C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# Save the trained model
# joblib.dump(model, 'pac_perc.joblib')

# Evaluate the performance of the model
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100,"%")

# Use the trained model to predict the emotion of each stanza in a new poem
new_poem = """
I have found the secret of loving you.
"""
new_stanzas = sent_tokenize(new_poem)

for stanza in new_stanzas:
    lines = stanza.strip().split('\n')
    line_emotions = []
    for line in lines:
        line_tfidf = vectorizer.transform([preprocess(line)])
        line_emotion = emotions[model.predict(line_tfidf)[0]]
        print("Emotion lable: ", model.predict(line_tfidf)[0])
        line_emotions.append(line_emotion)
    print('Stanza:')
    print(stanza)
    print('Emotions:')
    print(line_emotions)
    print("\n")
from sklearn.metrics import confusion_matrix

# ...

# Evaluate the performance of the model
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100,"%")

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accuracy: 45.20547945205479 %
Emotion lable:  0
Stanza:

I have found the secret of loving you.
Emotions:
['love']


Accuracy: 45.20547945205479 %
Confusion Matrix:
[[20  6  1  0  0  0  0  2  3]
 [ 9 13  1  1  0  0  2  8  1]
 [ 2  0  6  0  0  0  1  0  0]
 [ 0  0  0  2  0  0  0  0  0]
 [ 2  1  0  0  1  0  3  1  0]
 [ 0  0  0  0  0  2  0  1  0]
 [ 2  0  1  0  1  0  8  2  2]
 [ 7  6  0  0  0  0  2 12  2]
 [ 6  1  0  0  0  0  0  3  2]]


In [None]:
impo