In [1]:
import numpy as np
import pandas as pd
from senticnet5 import senticnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re
from scipy import sparse
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

In [2]:
df = pd.read_excel('hand8_k_random.xlsx')
print(len(df))

8501


In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multioutput import ClassifierChain
from skmultilearn.ensemble import RakelD
from skmultilearn.adapt import MLkNN

In [4]:
col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
X = df['Text']
Y = df[['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']]

In [5]:
vectorizer = TfidfVectorizer()
vectorizer.fit(df['Text'])
print(len(vectorizer.vocabulary_))

17402


In [6]:
kf = KFold(n_splits = 10)
kf.get_n_splits(X)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


In [7]:
def evaluation(score_list,predict_score_list):
    filter_corr = []
    exmatch = 0
    atleast1 = 0
    md1 = 0
    one_f = 0
    more_f = 0
    zero_f = 0
    sm = 0
    sdensity = 0
    hammval = 0
    test_len = len(predict_score_list[0])
    for j in range(test_len):
        cnt=0
        for i in range(8):
            hammval+=(score_list[i][j] ^ int(predict_score_list[i][j]))
            if(score_list[i][j]==1):
                cnt+=1
                sm+=1
        sdensity+=cnt/8
        if(cnt==0):
            zero_f+=1
        if(cnt==1):
            one_f+=1
        if(cnt>1):
            more_f+=1
        for i in range(8):
            mf = True
            if(int(predict_score_list[i][j])!=score_list[i][j]):
                mf=False
                break
        if(mf==True):
            exmatch+=1
            filter_corr.append(j)
        for i in range(8):
            if(int(predict_score_list[i][j])==score_list[i][j] and score_list[i][j]==1):
                atleast1+=1
                break
        mf = False
        for i in range(8):
            if(int(predict_score_list[i][j])==score_list[i][j] and score_list[i][j]==1):
                if(mf==True):
                    md1+=1
                    filter_corr.append(j)
                    break
                mf=True
    #print("Label Cardinality: "+ str(sm/test_len))
    #print("Label Density: "+ str(sdensity/test_len))
    print("Hamming Loss: "+str(hammval/(test_len*8)))
    hamlos = hammval/(test_len*8)
    print("Exact Prediction: "+str(exmatch/test_len))
    sub_accu = exmatch/test_len
    #print("At least one label predicted: "+str(atleast1/(test_len-zero_f)))
    #print("More than one label predicted: "+str(md1/more_f))
    tp_sum = 0
    fp_sum = 0
    fn_sum = 0
    macro_preci = 0
    macro_recall = 0
    macro_f1 = 0
    for i in range(len(score_list)):
        tmp = confusion_matrix(score_list[i],predict_score_list[i])
        tp_sum+=tmp[0][0]
        fp_sum+=tmp[0][1]
        fn_sum+=tmp[1][0]
        macro_preci_tmp=tmp[0][0]/(tmp[0][0]+tmp[0][1])
        macro_recall_tmp=tmp[0][0]/(tmp[0][0]+tmp[1][0])
        macro_f1 += ((2*macro_preci_tmp*macro_recall_tmp)/(macro_preci_tmp+macro_recall_tmp))
        macro_preci+=macro_preci_tmp
        macro_recall+=macro_recall_tmp
        #print(macro_f1)
    micro_preci = tp_sum/(tp_sum+fp_sum)
    micro_recall = tp_sum/(tp_sum+fn_sum)
    micro_f1 = (2*micro_preci*micro_recall)/(micro_preci+micro_recall)
    macro_preci/=8
    macro_recall/=8
    macro_f1/=8
    #print(micro_preci,micro_recall,micro_f1)
    #print(macro_preci,macro_recall,macro_f1)
    print("Macro F-Score: "+str(macro_f1))
    print("Micro F-Score: "+str(micro_f1))
    col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
    tmp = 0
    for i in range(len(score_list)):
        score = accuracy_score(score_list[i],predict_score_list[i]) 
        #print(col_names[i]+" accuracy: "+str(score))
        tmp += score
    print("Average Accuracy: " + str(tmp/8))
    avg_accu = tmp/8
    return (hamlos,sub_accu,macro_f1,micro_f1,avg_accu)

In [8]:
col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
hamm_score = []
subset_accu = []
macro_f1 = []
micro_f1 = []
avg_accu = []
cnt = 1
for train_index,test_index in kf.split(X):
    x_train,x_test = X.iloc[train_index],X.iloc[test_index]
    y_train,y_test = np.array(Y.iloc[train_index].values.tolist()),np.array(Y.iloc[test_index].values.tolist())
    print("k_fold validation: " + str(cnt))
    cnt+=1
    
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    
    print(x_train.shape,x_test.shape)
    print(y_train.shape,y_test.shape)
    
    classifier = MLkNN(k=100)
    classifier.fit(x_train,y_train)
    y_pred = classifier.predict(x_test)
    
    y_pred_val = y_pred.toarray()
    y_test_val = np.array(y_test)
    
    score_list = y_test_val.T.tolist()
    predict_score_list = y_pred_val.T.tolist()
    
    ret = evaluation(score_list,predict_score_list)
    hamm_score.append(ret[0])
    subset_accu.append(ret[1])
    macro_f1.append(ret[2])
    micro_f1.append(ret[3])
    avg_accu.append(ret[4])
    print('\n')
print('Final Result: ')
print('Average Hamming Loss: '+str(sum(hamm_score)/len(hamm_score)))
print('Average Subset Accuracy: '+str(sum(subset_accu)/len(subset_accu)))
print('Average Macro F-score: '+str(sum(macro_f1)/len(macro_f1)))
print('Average Micro F-score: '+str(sum(micro_f1)/len(micro_f1)))
print('Average of Average Accuracy: '+str(sum(avg_accu)/len(avg_accu)))

k_fold validation: 1
(7650, 17402) (851, 17402)
(7650, 8) (851, 8)




Hamming Loss: 0.1649529964747356
Exact Prediction: 0.19858989424206816
Macro F-Score: 0.8878776319856899
Micro F-Score: 0.9040744853506448
Average Accuracy: 0.8350470035252644


k_fold validation: 2
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.1698529411764706
Exact Prediction: 0.21529411764705883
Macro F-Score: 0.8860076788586857
Micro F-Score: 0.9008498583569406
Average Accuracy: 0.8301470588235295


k_fold validation: 3
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.1711764705882353
Exact Prediction: 0.2011764705882353
Macro F-Score: 0.8859350828572398
Micro F-Score: 0.8997416020671835
Average Accuracy: 0.8288235294117646


k_fold validation: 4
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.16426470588235295
Exact Prediction: 0.21058823529411766
Macro F-Score: 0.8895947652068781
Micro F-Score: 0.9041941847499785
Average Accuracy: 0.835735294117647


k_fold validation: 5
(7651, 17402) (850, 17402)
(7651, 8) (850, 8)
Hamming Loss: 0.16691

In [9]:
labels = [i+1for i in range(10)]
labels.append('average')
hamm_score.append(sum(hamm_score)/len(hamm_score))
subset_accu.append(sum(subset_accu)/len(subset_accu))
macro_f1.append(sum(macro_f1)/len(macro_f1))
micro_f1.append(sum(micro_f1)/len(micro_f1))
avg_accu.append(sum(avg_accu)/len(avg_accu))

In [10]:
df_res = pd.DataFrame(list(zip(labels,hamm_score,subset_accu,macro_f1,micro_f1,avg_accu)),
              columns = ['k-fold','Hamming loss','Subset accuracy','Macro F-score','Micro F-score','Average Accuracy'])
df_res

Unnamed: 0,k-fold,Hamming loss,Subset accuracy,Macro F-score,Micro F-score,Average Accuracy
0,1,0.164953,0.19859,0.887878,0.904074,0.835047
1,2,0.169853,0.215294,0.886008,0.90085,0.830147
2,3,0.171176,0.201176,0.885935,0.899742,0.828824
3,4,0.164265,0.210588,0.889595,0.904194,0.835735
4,5,0.166912,0.202353,0.888327,0.903083,0.833088
5,6,0.165294,0.202353,0.891722,0.90403,0.834706
6,7,0.165441,0.204706,0.888966,0.903242,0.834559
7,8,0.165882,0.216471,0.889947,0.903491,0.834118
8,9,0.1675,0.223529,0.887789,0.902307,0.8325
9,10,0.165294,0.2,0.888944,0.903569,0.834706
