In [1]:
import numpy as np
import pandas as pd
from senticnet5 import senticnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re
from scipy import sparse
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

In [2]:
df = pd.read_excel('hand8_k_random.xlsx')
print(len(df))

8501


In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multioutput import ClassifierChain
from skmultilearn.ensemble import RakelD

In [4]:
col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
X = df['Text']
Y = df[['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']]

In [5]:
vectorizer = TfidfVectorizer()
vectorizer.fit(df['Text'])
print(len(vectorizer.vocabulary_))

17402


In [6]:
kf = KFold(n_splits = 10)
kf.get_n_splits(X)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


In [7]:
def evaluation(score_list,predict_score_list):
    filter_corr = []
    exmatch = 0
    atleast1 = 0
    md1 = 0
    one_f = 0
    more_f = 0
    zero_f = 0
    sm = 0
    sdensity = 0
    hammval = 0
    test_len = len(predict_score_list[0])
    for j in range(test_len):
        cnt=0
        for i in range(8):
            hammval+=(score_list[i][j] ^ int(predict_score_list[i][j]))
            if(score_list[i][j]==1):
                cnt+=1
                sm+=1
        sdensity+=cnt/8
        if(cnt==0):
            zero_f+=1
        if(cnt==1):
            one_f+=1
        if(cnt>1):
            more_f+=1
        for i in range(8):
            mf = True
            if(int(predict_score_list[i][j])!=score_list[i][j]):
                mf=False
                break
        if(mf==True):
            exmatch+=1
            filter_corr.append(j)
        for i in range(8):
            if(int(predict_score_list[i][j])==score_list[i][j] and score_list[i][j]==1):
                atleast1+=1
                break
        mf = False
        for i in range(8):
            if(int(predict_score_list[i][j])==score_list[i][j] and score_list[i][j]==1):
                if(mf==True):
                    md1+=1
                    filter_corr.append(j)
                    break
                mf=True
    #print("Label Cardinality: "+ str(sm/test_len))
    #print("Label Density: "+ str(sdensity/test_len))
    print("Hamming Loss: "+str(hammval/(test_len*8)))
    hamlos = hammval/(test_len*8)
    print("Exact Prediction: "+str(exmatch/test_len))
    sub_accu = exmatch/test_len
    #print("At least one label predicted: "+str(atleast1/(test_len-zero_f)))
    #print("More than one label predicted: "+str(md1/more_f))
    tp_sum = 0
    fp_sum = 0
    fn_sum = 0
    macro_preci = 0
    macro_recall = 0
    macro_f1 = 0
    for i in range(len(score_list)):
        tmp = confusion_matrix(score_list[i],predict_score_list[i])
        tp_sum+=tmp[0][0]
        fp_sum+=tmp[0][1]
        fn_sum+=tmp[1][0]
        macro_preci_tmp=tmp[0][0]/(tmp[0][0]+tmp[0][1])
        macro_recall_tmp=tmp[0][0]/(tmp[0][0]+tmp[1][0])
        macro_f1 += ((2*macro_preci_tmp*macro_recall_tmp)/(macro_preci_tmp+macro_recall_tmp))
        macro_preci+=macro_preci_tmp
        macro_recall+=macro_recall_tmp
        #print(macro_f1)
    micro_preci = tp_sum/(tp_sum+fp_sum)
    micro_recall = tp_sum/(tp_sum+fn_sum)
    micro_f1 = (2*micro_preci*micro_recall)/(micro_preci+micro_recall)
    macro_preci/=8
    macro_recall/=8
    macro_f1/=8
    #print(micro_preci,micro_recall,micro_f1)
    #print(macro_preci,macro_recall,macro_f1)
    print("Macro F-Score: "+str(macro_f1))
    print("Micro F-Score: "+str(micro_f1))
    col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
    tmp = 0
    for i in range(len(score_list)):
        score = accuracy_score(score_list[i],predict_score_list[i]) 
        #print(col_names[i]+" accuracy: "+str(score))
        tmp += score
    print("Average Accuracy: " + str(tmp/8))
    avg_accu = tmp/8
    return (hamlos,sub_accu,macro_f1,micro_f1,avg_accu)

In [8]:
col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
hamm_score = []
subset_accu = []
macro_f1 = []
micro_f1 = []
avg_accu = []
cnt = 1
for train_index,test_index in kf.split(X):
    x_train,x_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = np.array(Y.iloc[train_index].values.tolist()),np.array(Y.iloc[test_index].values.tolist())
    print("k_fold validation: " + str(cnt))
    cnt+=1
    
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    
    print(x_train.shape,x_test.shape)
    print(y_train.shape,y_test.shape)
    
    base = RandomForestClassifier()
    chain_rfc = ClassifierChain(base,order = 'random',random_state=0)
    chain_rfc.fit(x_train,y_train)
    y_pred = chain_rfc.predict(x_test)
    
    y_pred_val = y_pred
    y_test_val = np.array(y_test)
    
    score_list = y_test_val.T.tolist()
    predict_score_list = y_pred_val.T.tolist()
    
    ret = evaluation(score_list,predict_score_list)
    hamm_score.append(ret[0])
    subset_accu.append(ret[1])
    macro_f1.append(ret[2])
    micro_f1.append(ret[3])
    avg_accu.append(ret[4])
    print('\n')
print('Final Result: ')
print('Average Hamming Loss: '+str(sum(hamm_score)/len(hamm_score)))
print('Average Subset Accuracy: '+str(sum(subset_accu)/len(subset_accu)))
print('Average Macro F-score: '+str(sum(macro_f1)/len(macro_f1)))
print('Average Micro F-score: '+str(sum(micro_f1)/len(micro_f1)))
print('Average of Average Accuracy: '+str(sum(avg_accu)/len(avg_accu)))

k_fold validation: 1
(7650, 17402) (851, 17402)
(7650, 8) (851, 8)
Hamming Loss: 0.15804935370152762
Exact Prediction: 0.23266745005875442
Macro F-Score: 0.8905738777158825
Micro F-Score: 0.9083475298126065
Average Accuracy: 0.8419506462984724


k_fold validation: 2
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.16970588235294118
Exact Prediction: 0.2376470588235294
Macro F-Score: 0.885050084861993
Micro F-Score: 0.9013000342114267
Average Accuracy: 0.8302941176470588


k_fold validation: 3
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.16661764705882354
Exact Prediction: 0.20588235294117646
Macro F-Score: 0.8876694207992332
Micro F-Score: 0.9029550321199143
Average Accuracy: 0.8333823529411765


k_fold validation: 4
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.1625
Exact Prediction: 0.2376470588235294
Macro F-Score: 0.8886385325435959
Micro F-Score: 0.9053046533550432
Average Accuracy: 0.8374999999999999


k_fold validation: 5
(7651, 1740

In [9]:
labels = [i+1for i in range(10)]
labels.append('average')
hamm_score.append(sum(hamm_score)/len(hamm_score))
subset_accu.append(sum(subset_accu)/len(subset_accu))
macro_f1.append(sum(macro_f1)/len(macro_f1))
micro_f1.append(sum(micro_f1)/len(micro_f1))
avg_accu.append(sum(avg_accu)/len(avg_accu))

In [10]:
df_rfc = pd.DataFrame(list(zip(labels,hamm_score,subset_accu,macro_f1,micro_f1,avg_accu)),
              columns = ['k-fold','Hamming loss','Subset accuracy','Macro F-score','Micro F-score','Average Accuracy'])
df_rfc

Unnamed: 0,k-fold,Hamming loss,Subset accuracy,Macro F-score,Micro F-score,Average Accuracy
0,1,0.158049,0.232667,0.890574,0.908348,0.841951
1,2,0.169706,0.237647,0.88505,0.9013,0.830294
2,3,0.166618,0.205882,0.887669,0.902955,0.833382
3,4,0.1625,0.237647,0.888639,0.905305,0.8375
4,5,0.164559,0.231765,0.886207,0.904383,0.835441
5,6,0.162353,0.235294,0.889336,0.905818,0.837647
6,7,0.158235,0.221176,0.892537,0.907892,0.841765
7,8,0.165147,0.248235,0.886219,0.904074,0.834853
8,9,0.161618,0.250588,0.888992,0.905722,0.838382
9,10,0.168529,0.207059,0.884879,0.902068,0.831471


In [12]:
col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
hamm_score = []
subset_accu = []
macro_f1 = []
micro_f1 = []
avg_accu = []
cnt = 1
for train_index,test_index in kf.split(X):
    x_train,x_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = np.array(Y.iloc[train_index].values.tolist()),np.array(Y.iloc[test_index].values.tolist())
    print("k_fold validation: " + str(cnt))
    cnt+=1
    
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    
    print(x_train.shape,x_test.shape)
    print(y_train.shape,y_test.shape)
    
    base = SVC()
    chain_rfc = ClassifierChain(base,order = 'random',random_state=0)
    chain_rfc.fit(x_train,y_train)
    y_pred = chain_rfc.predict(x_test)
    
    y_pred_val = y_pred
    y_test_val = np.array(y_test)
    
    score_list = y_test_val.T.tolist()
    predict_score_list = y_pred_val.T.tolist()
    
    ret = evaluation(score_list,predict_score_list)
    hamm_score.append(ret[0])
    subset_accu.append(ret[1])
    macro_f1.append(ret[2])
    micro_f1.append(ret[3])
    avg_accu.append(ret[4])
    print('\n')
print('Final Result: ')
print('Average Hamming Loss: '+str(sum(hamm_score)/len(hamm_score)))
print('Average Subset Accuracy: '+str(sum(subset_accu)/len(subset_accu)))
print('Average Macro F-score: '+str(sum(macro_f1)/len(macro_f1)))
print('Average Micro F-score: '+str(sum(micro_f1)/len(micro_f1)))
print('Average of Average Accuracy: '+str(sum(avg_accu)/len(avg_accu)))

k_fold validation: 1
(7650, 17402) (851, 17402)
(7650, 8) (851, 8)
Hamming Loss: 0.17332549941245592
Exact Prediction: 0.2573443008225617
Macro F-Score: 0.8646406009952573
Micro F-Score: 0.8972304476572026
Average Accuracy: 0.8266745005875441


k_fold validation: 2
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.17852941176470588
Exact Prediction: 0.2635294117647059
Macro F-Score: 0.8651785082012032
Micro F-Score: 0.8939923157527069
Average Accuracy: 0.8214705882352942


k_fold validation: 3
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.17720588235294119
Exact Prediction: 0.24352941176470588
Macro F-Score: 0.8669359807890754
Micro F-Score: 0.8942518648530058
Average Accuracy: 0.8227941176470588


k_fold validation: 4
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.17676470588235293
Exact Prediction: 0.2611764705882353
Macro F-Score: 0.867705061576679
Micro F-Score: 0.8948013302993174
Average Accuracy: 0.8232352941176471


k_fold validation: 5

In [13]:
labels = [i+1for i in range(10)]
labels.append('average')
hamm_score.append(sum(hamm_score)/len(hamm_score))
subset_accu.append(sum(subset_accu)/len(subset_accu))
macro_f1.append(sum(macro_f1)/len(macro_f1))
micro_f1.append(sum(micro_f1)/len(micro_f1))
avg_accu.append(sum(avg_accu)/len(avg_accu))

In [14]:
df_svc = pd.DataFrame(list(zip(labels,hamm_score,subset_accu,macro_f1,micro_f1,avg_accu)),
              columns = ['k-fold','Hamming loss','Subset accuracy','Macro F-score','Micro F-score','Average Accuracy'])
df_svc

Unnamed: 0,k-fold,Hamming loss,Subset accuracy,Macro F-score,Micro F-score,Average Accuracy
0,1,0.173325,0.257344,0.864641,0.89723,0.826675
1,2,0.178529,0.263529,0.865179,0.893992,0.821471
2,3,0.177206,0.243529,0.866936,0.894252,0.822794
3,4,0.176765,0.261176,0.867705,0.894801,0.823235
4,5,0.181912,0.242353,0.86195,0.891993,0.818088
5,6,0.172353,0.265882,0.869264,0.897606,0.827647
6,7,0.174412,0.245882,0.868205,0.896111,0.825588
7,8,0.169118,0.298824,0.873163,0.899633,0.830882
8,9,0.176324,0.264706,0.865742,0.894908,0.823676
9,10,0.176471,0.267059,0.866081,0.894995,0.823529
