In [1]:
import numpy as np
import pandas as pd
from senticnet5 import senticnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re
from scipy import sparse
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

In [2]:
df = pd.read_excel('hand8_k_random.xlsx')
print(len(df))

8501


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multioutput import ClassifierChain

In [4]:
X = df['Text']
Y = df[['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']]
Y = Y.values.tolist()

In [5]:
vectorizer = TfidfVectorizer()
vectorizer.fit(df['Text'])
print(len(vectorizer.vocabulary_))

17402


In [6]:
col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.25,random_state = 42)

x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)

y_train = np.array(y_train)
y_test = np.array(y_test)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

(6375, 17402)
(6375, 8)
(2126, 17402)


In [7]:
def evaluation(score_list,predict_score_list):
    filter_corr = []
    exmatch = 0
    atleast1 = 0
    md1 = 0
    one_f = 0
    more_f = 0
    zero_f = 0
    sm = 0
    sdensity = 0
    hammval = 0
    test_len = len(predict_score_list[0])
    for j in range(test_len):
        cnt=0
        for i in range(8):
            hammval+=(score_list[i][j] ^ int(predict_score_list[i][j]))
            if(score_list[i][j]==1):
                cnt+=1
                sm+=1
        sdensity+=cnt/8
        if(cnt==0):
            zero_f+=1
        if(cnt==1):
            one_f+=1
        if(cnt>1):
            more_f+=1
        for i in range(8):
            mf = True
            if(int(predict_score_list[i][j])!=score_list[i][j]):
                mf=False
                break
        if(mf==True):
            exmatch+=1
            filter_corr.append(j)
        for i in range(8):
            if(int(predict_score_list[i][j])==score_list[i][j] and score_list[i][j]==1):
                atleast1+=1
                break
        mf = False
        for i in range(8):
            if(int(predict_score_list[i][j])==score_list[i][j] and score_list[i][j]==1):
                if(mf==True):
                    md1+=1
                    filter_corr.append(j)
                    break
                mf=True
    #print("Label Cardinality: "+ str(sm/test_len))
    #print("Label Density: "+ str(sdensity/test_len))
    print("Hamming Loss: "+str(hammval/(test_len*8)))
    print("Exact Prediction: "+str(exmatch/test_len))
    #print("At least one label predicted: "+str(atleast1/(test_len-zero_f)))
    #print("More than one label predicted: "+str(md1/more_f))
    tp_sum = 0
    fp_sum = 0
    fn_sum = 0
    macro_preci = 0
    macro_recall = 0
    macro_f1 = 0
    for i in range(len(score_list)):
        tmp = confusion_matrix(score_list[i],predict_score_list[i])
        tp_sum+=tmp[0][0]
        fp_sum+=tmp[0][1]
        fn_sum+=tmp[1][0]
        macro_preci_tmp=tmp[0][0]/(tmp[0][0]+tmp[0][1])
        macro_recall_tmp=tmp[0][0]/(tmp[0][0]+tmp[1][0])
        macro_f1 += ((2*macro_preci_tmp*macro_recall_tmp)/(macro_preci_tmp+macro_recall_tmp))
        macro_preci+=macro_preci_tmp
        macro_recall+=macro_recall_tmp
        print(macro_f1)
    micro_preci = tp_sum/(tp_sum+fp_sum)
    micro_recall = tp_sum/(tp_sum+fn_sum)
    micro_f1 = (2*micro_preci*micro_recall)/(micro_preci+micro_recall)
    macro_preci/=8
    macro_recall/=8
    macro_f1/=8
    #print(micro_preci,micro_recall,micro_f1)
    #print(macro_preci,macro_recall,macro_f1)
    print("Macro F-Score: "+str(macro_f1))
    print("Micro F-Score: "+str(micro_f1))
    col_names = ['Joy','Sadness','Anger','Disgust','Admiration','Surprise','Interest','Fear']
    tmp = 0
    for i in range(len(score_list)):
        score = accuracy_score(score_list[i],predict_score_list[i]) 
        print(col_names[i]+" accuracy: "+str(score))
        tmp += score
    print("Average Accuracy: " + str(tmp/8))

In [8]:
base = RandomForestClassifier()

chains = [ClassifierChain(base,order = 'random',random_state=i) for i in range(10)]
cnt=1
y_pred_list = []
for chain in chains:
    print("Training Chain "+ str(cnt))
    cnt+=1
    chain.fit(x_train,y_train)
    y_pred = chain.predict(x_test)
    y_pred_list.append(y_pred)

Training Chain 1
Training Chain 2
Training Chain 3
Training Chain 4
Training Chain 5
Training Chain 6
Training Chain 7
Training Chain 8
Training Chain 9
Training Chain 10


In [9]:
y_pred = []
threshold = 5
for i in range(len(y_pred_list[0])):
    tmp = []
    for j in range(len(y_pred_list[0][0])):
        tmp.append(0)
    y_pred.append(tmp)
for k in range(len(y_pred_list)):
    for i in range(len(y_pred)):
        for j in range(len(y_pred[0])):
            y_pred[i][j]+=y_pred_list[k][i][j]
for i in range(len(y_pred)):
    for j in range(len(y_pred[0])):
        if(y_pred[i][j]>=threshold):
            y_pred[i][j]=1
        else:
            y_pred[i][j]=0
y_pred_np = np.array(y_pred)
score_list = y_test.T.tolist()
predict_score_list = y_pred_np.T.tolist()
evaluation(score_list,predict_score_list)

Hamming Loss: 0.16521636876763876
Exact Prediction: 0.22718720602069614
0.8499413833528722
1.4899813234427375
2.433537766999181
3.3502472785673043
4.313932387024531
5.274558456968283
6.125529198012854
7.0831646659931495
Macro F-Score: 0.8853955832491437
Micro F-Score: 0.9038264083783968
Joy accuracy: 0.7591721542803387
Sadness accuracy: 0.660865475070555
Anger accuracy: 0.8936970837253058
Disgust accuracy: 0.8476011288805269
Admiration accuracy: 0.9299153339604892
Surprise accuracy: 0.9242709313264346
Interest accuracy: 0.7436500470366886
Fear accuracy: 0.9190968955785512
Average Accuracy: 0.8347836312323613


In [10]:
base = SVC()

chains = [ClassifierChain(base,order = 'random',random_state=i) for i in range(10)]
cnt=1
y_pred_list = []
for chain in chains:
    print("Training Chain "+ str(cnt))
    cnt+=1
    chain.fit(x_train,y_train)
    y_pred = chain.predict(x_test)
    y_pred_list.append(y_pred)

Training Chain 1
Training Chain 2
Training Chain 3
Training Chain 4
Training Chain 5
Training Chain 6
Training Chain 7
Training Chain 8
Training Chain 9
Training Chain 10


In [11]:
y_pred = []
threshold = 5
for i in range(len(y_pred_list[0])):
    tmp = []
    for j in range(len(y_pred_list[0][0])):
        tmp.append(0)
    y_pred.append(tmp)
for k in range(len(y_pred_list)):
    for i in range(len(y_pred)):
        for j in range(len(y_pred[0])):
            y_pred[i][j]+=y_pred_list[k][i][j]
for i in range(len(y_pred)):
    for j in range(len(y_pred[0])):
        if(y_pred[i][j]>=threshold):
            y_pred[i][j]=1
        else:
            y_pred[i][j]=0
y_pred_np = np.array(y_pred)
score_list = y_test.T.tolist()
predict_score_list = y_pred_np.T.tolist()
evaluation(score_list,predict_score_list)

Hamming Loss: 0.17421213546566322
Exact Prediction: 0.24788334901222953
0.8365650969529086
1.2962476366354483
2.2385074673970213
3.1553234387305253
4.119008547187752
5.079653866180664
5.928494324409219
6.885935898824964
Macro F-Score: 0.8607419873531205
Micro F-Score: 0.8963514884388007
Joy accuracy: 0.7502351834430856
Sadness accuracy: 0.5997177798682972
Anger accuracy: 0.8908748824082785
Disgust accuracy: 0.8471307619943556
Admiration accuracy: 0.9299153339604892
Surprise accuracy: 0.9242709313264346
Interest accuracy: 0.7455315145813735
Fear accuracy: 0.91862652869238
Average Accuracy: 0.8257878645343368
