In [1]:
#import libraries
import pandas as pd
import numpy as np
import dataPreparation as dp
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.svm import SVC
from math import log
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score
from scipy.spatial import distance
from sklearn.mixture import GaussianMixture
#from nltk.tokenize import word_tokenize
#from nltk import pos_tag
import nltk
from utils import *

Using TensorFlow backend.


In [2]:
#load dataset
#for this experiment we use Founta Dataset
Corpus=pd.read_csv('final_dataset.csv')
#shuffle the Corpus for better generalization
Corpus = shuffle(Corpus)
#remove stopwords before preprocess
stop = stopwords.words('english')


In [3]:
Corpus['text'] = Corpus['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
#preprocess the text
processed_text=[]
for t in Corpus['text']:
    tokens = dp.DataPreparation.preprocess(t)
    processed_text.append(tokens)
Corpus['text']=processed_text

#Split the dataset into Train (10%), Test(20%) and Unlabel (70%)
train=Corpus[:1186]
test=Corpus[1186:2372]
gold_standard=Corpus[2372:]
unlabel=gold_standard.copy()
unlabel.drop(['label'],axis=1,inplace=True)
#save the gold standard
gold_standard.to_csv("result/gold_standard.csv",index=False)

In [4]:
#GMM without balancing
gmean=list()
scoring=list()
auc_score=list()
#we take 8 iteration as it simulize full 8 hours of working days of annotation
for iteration in range(0,8):
    print("Processing Annotation Iteration : ",iteration)
    train_texts = [textTokenizer(text) for text in list(train['text'])]
    train_labels = list(train['label'])
    test_texts = [textTokenizer(text) for text in list(test['text'])]
    test_labels = list(test['label'])
    unlabel_texts = [textTokenizer(text) for text in list(unlabel['text'])]

    #encode the label
    Encoder = LabelEncoder()
    y_train = Encoder.fit_transform(train['label'])
    y_test=Encoder.fit_transform(test['label'])

    #TFIDF vectorizer
    Tfidf_vect = TfidfVectorizer(max_features=5000)
    Tfidf_vect.fit(Corpus['text'])
    Train_X_Tfidf = Tfidf_vect.transform(train_texts)
    Test_X_Tfidf = Tfidf_vect.transform(test_texts)
    Unlabel_X_Tfidf=Tfidf_vect.transform(unlabel_texts)

    
    unlabel_length=round(len(unlabel)*1)
    #Training and predict the label for unlabel dataset, save the low confidence label generated from the entropy
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',probability=True)
    SVM.fit(Train_X_Tfidf,y_train)
    #predict the probability of class
    probs=SVM.predict_proba(Unlabel_X_Tfidf)
    #get the data and then calculate the uncertainty of data points
    unlabel_pred_data = list()
    for j, text, p in zip(unlabel['index'], list(unlabel['text']), probs):
        unlabel_pred_data.append([j, text, p[0], p[1]])
    unlabel_prob_df = pd.DataFrame(unlabel_pred_data, columns=["index", "text", "class_0", "class_1"])
    entropy=list()
    for p in probs:
        ent=0
        ent1=-p[0] * log(p[0],2)
        ent2=-p[1] * log(p[1],2)
        ent=ent1+ent2
        entropy.append(ent)
    unlabel_prob_df['entropy']=entropy
   
    #get 10% data to be clustered
    unlabel_prob_df=unlabel_prob_df.sort_values(by='entropy',ascending=False)
    #rank the lower uncertainty
   
    lower_uncertainty=unlabel_prob_df
    low_conf=lower_uncertainty[0:unlabel_length]
    

    #save the low conf for checking later
    #embedd the low conf text 
    
    #clustering based on uncertainty
    df=low_conf['entropy']
    uncertain=df.to_numpy()

    #initialize the GMM
    
    gmm = GaussianMixture(n_components=60)
    gmm.fit(uncertain.reshape(-1,1))
    labels=gmm.predict(uncertain.reshape(-1,1))
    similarity_to_center=list()
    for i, instance in enumerate(uncertain.reshape(-1,1)):
        cluster_label = labels[i]
        centroid = gmm.means_[cluster_label] # cluster center of the cluster of that instance

        similarity=distance.euclidean(instance,centroid)
        similarity_to_center.append(similarity)
    cluster_labels=pd.DataFrame({"index":low_conf['index'],"text":low_conf['text'],'cluster':labels,
                                'uncertainty':low_conf['entropy'],'similarity':similarity_to_center})
    #creating cluster_dict
    cluster_dict=dict()
    #creating the dataframe for each cluster
    for item in range(0,60):
        cluster_dict['cluster_{0}'.format(item)]=cluster_labels[cluster_labels['cluster']==item] 
    #Create the dataframe for each cluster
    for i in range(0,60):
            globals()['cluster_{}'.format(i)] = pd.DataFrame(cluster_dict['cluster_{}'.format(i)])
    for i in range(0,60):
        globals()['cluster_{}'.format(i)]=globals()['cluster_{}'.format(i)].sort_values(by="similarity",ascending=True)
    center_data=pd.DataFrame()
    
    #Tale of Tails
    for i in range(0,60):
        data1=globals()['cluster_{}'.format(i)]
        mn=data1['similarity'].median()
        first=round(len(data1)*0.05)
        first_data=data1.iloc[first,4]
        second=round(len(data1)*0.95)-1
        second_data=data1.iloc[second,4]
        cntr1=data1[data1['similarity']<=mn]
        ot1=data1[data1['similarity']<=first_data]
        ot2=data1[data1['similarity']<=second_data]
        cntr1=cntr1.head(1)
        ot1=ot1.head(1)
        ot2=ot2.head(1)
        frame=[cntr1,ot1,ot2]
        center_data=center_data.append(frame)
    

    #We imitate the labeling by having an oracle from the true label data
    #get the labeling
    true_label=pd.read_csv("result/gold_standard.csv")
    #get the label
    new_train=true_label.merge(center_data,on='index',how='left')
    #Drop NaN
    new_train=new_train.dropna()
    
    #sort by cluster
    new_train.sort_values(by=['cluster'],inplace=True)
   
    #Drop duplicate columns
    new_train.drop(['cluster','text_y','similarity'],axis=1,inplace=True)
    new_train.rename(columns={"text_x": "text"},inplace=True)
    
    #Get 50 of majority and minority class
    training_data=pd.DataFrame({'index':new_train['index'],
                               'label':new_train['label'],'text':new_train['text']})
    
    #calculating performance
    train=train.append(training_data)
    train.to_csv("training/training.csv",index=False)
    test.to_csv("training/test.csv",index=False)

    #Drop sample from unlabel pool
    unlabel=unlabel[~unlabel['index'].isin(new_train['index'])]
        

    train=pd.read_csv("training/training.csv")
    test=pd.read_csv("training/test.csv")
    #calculate the class weight 
    
    minority_len=len(train[train['label']==1])+len(new_train[new_train['label']==1])
    majority_len=len(train[train['label']==0])+len(new_train[new_train['label']==0])
    
    minority_weight=majority_len/minority_len
    majority_weight=minority_len/minority_len
    
    train.dropna(inplace=True)
    test.dropna(inplace=True)

    Encoder = LabelEncoder()
    y_train = Encoder.fit_transform(train['label'])
    y_test = Encoder.fit_transform(test['label'])
    X_train=train['text']
    X_test=test['text']

    Tfidf_vect = TfidfVectorizer(max_features=5000)
    Tfidf_vect.fit(train['text'])
    Train_X_Tfidf = Tfidf_vect.transform(X_train)
    Test_X_Tfidf = Tfidf_vect.transform(X_test)
    SVM = svm.SVC(C=1, kernel='linear', gamma='auto',probability=True,class_weight={0:majority_weight,1:minority_weight})
    SVM.fit(Train_X_Tfidf,y_train)
    # predict the labels on validation dataset and calculate the GMean Score
    predictions_SVM = SVM.predict(Test_X_Tfidf)
    score=geometric_mean_score(y_test, predictions_SVM)
    auc=roc_auc_score(y_test,predictions_SVM)
    scoring.append(score)
    auc_score.append(auc)

print("Finished")

Processing Annotation Iteration :  0
Processing Annotation Iteration :  1
Processing Annotation Iteration :  2
Processing Annotation Iteration :  3
Processing Annotation Iteration :  4
Processing Annotation Iteration :  5
Processing Annotation Iteration :  6
Processing Annotation Iteration :  7
Finished


In [5]:
#print the Gmean Result From the experiment
scoring

[0.6103832317821816,
 0.6209022998890171,
 0.6335432003761389,
 0.662644425178078,
 0.6537233233193875,
 0.6635556945866374,
 0.6217569490201733,
 0.6326755306692329]