In [39]:
import pandas as pd
#for better viewing of tables: lift every pandas display limit
pd.options.display.max_columns = None   # never hide columns
pd.options.display.max_rows = None      # never hide rows
pd.options.display.max_colwidth = None  # never truncate cell contents

In [40]:
#loading friends_dev (the development split) from the working directory
dev_data = pd.read_csv('friends_dev.csv')

In [41]:
#previewing the first rows of friends_dev
dev_data.head()

Unnamed: 0,speaker,utterance,emotion,annotation
0,Phoebe,"Oh my God, hes lost it. Hes totally lost it.",non-neutral,2120
1,Monica,What?,surprise,1000130
2,Ross,"Or! Or, we could go to the bank, close our accounts and cut them off at the source.",neutral,3000200
3,Chandler,Youre a genius!,joy,500000
4,Joey,"Aww, man, now we wont be bank buddies!",sadness,40100


In [42]:
#loading friends_test (the held-out test split) from the working directory
test_data = pd.read_csv('friends_test.csv')

In [43]:
#previewing the first rows of friends_test
test_data.head()

Unnamed: 0,speaker,utterance,emotion,annotation
0,Mark,Why do all youre coffee mugs have numbers on the bottom?,surprise,2000030
1,Rachel,"Oh. Thats so Monica can keep track. That way if one on them is missing, she can be like, Wheres number 27?!",non-neutral,2100011
2,Rachel,Y'know what?,neutral,3000020
3,Ross,It didnt.,neutral,5000000
4,Frank,"Okay, so what you used to have with Rachel, is what Ive got with Alice.",joy,1300010


In [44]:
#loading friends_train; index_col=0 uses the first CSV column as the row index
#NOTE(review): the friends_dev/friends_test loads do not pass index_col - confirm all three files share one layout
train_data = pd.read_csv('friends_train.csv', index_col=0)

In [45]:
#previewing the first rows of friends_train
train_data.head()

Unnamed: 0,speaker,utterance,emotion,annotation
0,Chandler,also I was the point person on my companys transition from the KL-5 to GR-6 system.,neutral,4100000
1,The Interviewer,You mustve had your hands full.,neutral,5000000
2,Chandler,That I did. That I did.,neutral,5000000
3,The Interviewer,So lets talk a little bit about your duties.,neutral,5000000
4,Chandler,My duties? All right.,surprise,2000030


In [46]:
#(rows, columns) of the training split before preprocessing
train_data.shape

(10561, 4)

In [47]:
#libraries needed for preprocessing of data
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

In [48]:
#NLTK resources needed by the preprocessing: tokenizer models, stopword list, WordNet
nltk.download('punkt')
#recent NLTK releases (3.9+) make word_tokenize look up 'punkt_tab' instead of the
#pickled 'punkt' models; without it tokenizing raises LookupError on a fresh machine.
#On older releases that do not know the package this only logs an error.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Student.DESKTOP-
[nltk_data]     6DE327B\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Student.DESKTOP-
[nltk_data]     6DE327B\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Student.DESKTOP-
[nltk_data]     6DE327B\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [49]:
#shared NLTK helpers, built once and reused for every utterance
stop_words = {*stopwords.words('english')}  # set for fast membership checks
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [50]:
#preprocessing
def preprocess_dataframe(data):
    """Return a cleaned copy of ``data`` with an added ``preprocessed_text`` column.

    Each value of the ``utterance`` column is tokenized, reduced to its purely
    alphabetic tokens, lowercased, stripped of English stopwords, and every
    remaining token is stemmed and then lemmatized. Rows whose cleaned text is
    empty are dropped.

    The input dataframe is not modified: the result is a new, independent
    dataframe, so re-running the cell gives the same result.

    Parameters:
        data: pandas DataFrame with an ``utterance`` column.

    Returns:
        A new DataFrame holding the surviving rows (original index labels kept)
        plus the ``preprocessed_text`` column.
    """
    def preprocess_text(text):
        #missing utterances (e.g. NaN) cannot be tokenized; treat them as empty
        #so the row is dropped below instead of crashing word_tokenize
        if not isinstance(text, str):
            return ''
        #tokenizing, keeping only alphabetic tokens, lowercasing
        tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
        #removing stopwords, then stemming and lemmatizing what is left
        return ' '.join(
            lemmatizer.lemmatize(stemmer.stem(word))
            for word in tokens
            if word not in stop_words
        )

    #assign() builds a new frame, so the caller's dataframe is left untouched
    preprocessed = data.assign(
        preprocessed_text=data['utterance'].apply(preprocess_text)
    )

    #removing rows with empty preprocessed text
    has_text = (
        preprocessed['preprocessed_text']
        .map(lambda cleaned: bool(cleaned.strip()))
        .astype(bool)
    )

    #copy() detaches the result from the intermediate frame, which avoids
    #SettingWithCopyWarning if the caller later adds columns
    return preprocessed[has_text].copy()

In [51]:
#preprocessing all the three dataframes (train, test, dev - in that order)
train_data_preprocessed, test_data_preprocessed, dev_data_preprocessed = map(
    preprocess_dataframe,
    (train_data, test_data, dev_data),
)

In [52]:
#previewing the cleaned training rows, including the new preprocessed_text column
train_data_preprocessed.head()

Unnamed: 0,speaker,utterance,emotion,annotation,preprocessed_text
0,Chandler,also I was the point person on my companys transition from the KL-5 to GR-6 system.,neutral,4100000,also point person transit system
1,The Interviewer,You mustve had your hands full.,neutral,5000000,hand full
3,The Interviewer,So lets talk a little bit about your duties.,neutral,5000000,talk littl bit duti
4,Chandler,My duties? All right.,surprise,2000030,duti right
5,The Interviewer,"Now youll be heading a whole division, so youll have a lot of duties.",neutral,5000000,head whole divis lot duti


In [53]:
#previewing the cleaned test rows, including the new preprocessed_text column
test_data_preprocessed.head()

Unnamed: 0,speaker,utterance,emotion,annotation,preprocessed_text
0,Mark,Why do all youre coffee mugs have numbers on the bottom?,surprise,2000030,coffe mug number bottom
1,Rachel,"Oh. Thats so Monica can keep track. That way if one on them is missing, she can be like, Wheres number 27?!",non-neutral,2100011,oh monica keep track way one miss like number
4,Frank,"Okay, so what you used to have with Rachel, is what Ive got with Alice.",joy,1300010,okay use rachel got alic
5,Joey,"Now, wh-what, what is that like?",surprise,1000040,like
6,Frank,"Its so cool man, its so, its just cause being with her is so much better than like not being with her.",joy,2300000,cool man much better like


In [54]:
#previewing the cleaned dev rows, including the new preprocessed_text column
dev_data_preprocessed.head()

Unnamed: 0,speaker,utterance,emotion,annotation,preprocessed_text
0,Phoebe,"Oh my God, hes lost it. Hes totally lost it.",non-neutral,2120,oh god lost total lost
2,Ross,"Or! Or, we could go to the bank, close our accounts and cut them off at the source.",neutral,3000200,could go bank close account cut sourc
3,Chandler,Youre a genius!,joy,500000,geniu
4,Joey,"Aww, man, now we wont be bank buddies!",sadness,40100,aww man bank buddi
5,Chandler,"Now, theres two reasons.",neutral,4000010,two reason


In [55]:
#vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
#vectorization
#TF-IDF vectorizer created with scikit-learn's default settings
tfidf_vectorizer = TfidfVectorizer()

In [57]:
#vectorization
#the vocabulary and IDF weights are fitted on the training text only ...
train_tfidf_matrix = tfidf_vectorizer.fit_transform(
    train_data_preprocessed['preprocessed_text']
)
#... and then reused unchanged for the test and dev text
test_tfidf_matrix, dev_tfidf_matrix = (
    tfidf_vectorizer.transform(split['preprocessed_text'])
    for split in (test_data_preprocessed, dev_data_preprocessed)
)

In [58]:
#vectorization
#dense array copies of the three TF-IDF matrices
train_tfidf_array, test_tfidf_array, dev_tfidf_array = (
    tfidf_matrix.toarray()
    for tfidf_matrix in (train_tfidf_matrix, test_tfidf_matrix, dev_tfidf_matrix)
)

In [93]:
#(documents, vocabulary size) of each vectorized split
print(f"training data: {train_tfidf_matrix.shape}")
print(f"testing data: {test_tfidf_matrix.shape}")
print(f"development data: {dev_tfidf_matrix.shape}")

training data: (9881, 3993)
testing data: (2603, 3993)
development data: (1092, 3993)


In [62]:
#SVM Classifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

#default hyperparameters
svm_classifier_default = SVC()

#training with friends_train
svm_classifier_default.fit(train_tfidf_matrix, train_data_preprocessed['emotion'])

#predicting on friends_test
predictions_default = svm_classifier_default.predict(test_tfidf_matrix)

#accuracy and report
#zero_division=0 reports 0.0 for classes that are never predicted (the same number
#the default setting produces) without emitting UndefinedMetricWarning
accuracy_default = accuracy_score(test_data_preprocessed['emotion'], predictions_default)
classification_report_default = classification_report(
    test_data_preprocessed['emotion'],
    predictions_default,
    zero_division=0,
)

print("SVM Classifier without Optimization:")
print("Accuracy:", accuracy_default)
print("Classification Report:")
print(classification_report_default)


SVM Classifier without Optimization:
Accuracy: 0.501728774490972
Classification Report:
              precision    recall  f1-score   support

       anger       0.20      0.01      0.01       154
     disgust       0.67      0.06      0.11        66
        fear       0.00      0.00      0.00        30
         joy       0.54      0.19      0.28       295
     neutral       0.51      0.93      0.66      1214
 non-neutral       0.27      0.09      0.13       515
     sadness       0.64      0.11      0.18        84
    surprise       0.67      0.23      0.34       245

    accuracy                           0.50      2603
   macro avg       0.44      0.20      0.22      2603
weighted avg       0.47      0.50      0.41      2603



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [63]:
#SVM Classifier with optimization
from sklearn.model_selection import GridSearchCV

#custom hyperparameter for optimization
#gamma is ignored by the linear kernel, so it is only searched for poly/rbf;
#crossing it with 'linear' would fit every linear candidate twice
parameters = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['poly', 'rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

svm_classifier_optimized = SVC()

#grid search to find best hyperparameters (5-fold CV, the classifier's default accuracy score)
#the candidate fits are independent, so n_jobs=-1 runs them on all available cores
grid_search = GridSearchCV(svm_classifier_optimized, parameters, cv=5, n_jobs=-1)
grid_search.fit(train_tfidf_matrix, train_data_preprocessed['emotion'])
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

#with refit=True (the default) the search has already retrained the winning
#configuration on all of friends_train, so no second manual fit is needed
best_svm_classifier = grid_search.best_estimator_

#predicting on friends_test
predictions_optimized = best_svm_classifier.predict(test_tfidf_matrix)

#accuracy and report
#zero_division=0 reports 0.0 for never-predicted classes without UndefinedMetricWarning
accuracy_optimized = accuracy_score(test_data_preprocessed['emotion'], predictions_optimized)
classification_report_optimized = classification_report(
    test_data_preprocessed['emotion'],
    predictions_optimized,
    zero_division=0,
)

print("\nSVM Classifier with Optimization:")
print("Accuracy:", accuracy_optimized)
print("Classification Report:")
print(classification_report_optimized)

Best hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

SVM Classifier with Optimization:
Accuracy: 0.501728774490972
Classification Report:
              precision    recall  f1-score   support

       anger       0.20      0.01      0.01       154
     disgust       0.67      0.06      0.11        66
        fear       0.00      0.00      0.00        30
         joy       0.54      0.19      0.28       295
     neutral       0.51      0.93      0.66      1214
 non-neutral       0.27      0.09      0.13       515
     sadness       0.64      0.11      0.18        84
    surprise       0.67      0.23      0.34       245

    accuracy                           0.50      2603
   macro avg       0.44      0.20      0.22      2603
weighted avg       0.47      0.50      0.41      2603



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [66]:
#cross-validation report
import numpy as np
#cross_val_score is no longer called here; its import is kept in case other cells rely on it
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict

train_labels = train_data_preprocessed['emotion']

#cv=5 on a classifier means unshuffled stratified 5-fold; building that splitter
#explicitly lets the fold scores and the report share one round of model fits
#instead of running the whole cross-validation twice
cv_splitter = StratifiedKFold(n_splits=5)

#cross-validated predictions (each row is predicted by the model that did not see it)
cv_predictions = cross_val_predict(
    best_svm_classifier, train_tfidf_matrix, train_labels, cv=cv_splitter
)

#assessing the classifier's performance: accuracy on each held-out fold
cv_scores = np.array([
    accuracy_score(train_labels.iloc[held_out], cv_predictions[held_out])
    for _, held_out in cv_splitter.split(train_tfidf_matrix, train_labels)
])
print("Cross-validation scores:", cv_scores)

#zero_division=0 reports 0.0 for never-predicted classes without UndefinedMetricWarning
cv_classification_report = classification_report(
    train_labels, cv_predictions, zero_division=0
)

print("\nClassification Report (Cross-Validation):")
print(cv_classification_report)

Cross-validation scores: [0.49570056 0.48279352 0.49240891 0.48836032 0.49797571]

Classification Report (Cross-Validation):
              precision    recall  f1-score   support

       anger       0.24      0.01      0.02       481
     disgust       0.41      0.03      0.06       232
        fear       0.00      0.00      0.00       171
         joy       0.55      0.17      0.26      1250
     neutral       0.49      0.94      0.65      4431
 non-neutral       0.34      0.08      0.13      1928
     sadness       0.50      0.11      0.19       343
    surprise       0.64      0.24      0.35      1045

    accuracy                           0.49      9881
   macro avg       0.40      0.20      0.21      9881
weighted avg       0.46      0.49      0.39      9881

