In [30]:
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import os
import json
import re
import csv
from scipy.stats import chi2_contingency
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
import math
np.random.seed(0)

In [31]:
def load_propaganda_task2():
    '''
    output:[[sentence, label]]
    '''
    global_sentence_label = list()
    
    tesk_2_3_path = '../other_dict/tasks-2-3/train/'
    target_data_2_dirs = os.listdir(tesk_2_3_path)
    article_name_set = set()
    for file_name in target_data_2_dirs:
        article_name_set.add(file_name.split(".")[0])
    for index in list(article_name_set):
        text_name = tesk_2_3_path + index + '.txt'
        try:
            text = open(text_name, encoding='utf-8', mode='r')
        except:
            print('No such file or directory: ', text_name)
            continue
        label_name = tesk_2_3_path + index + '.task2.labels'
        try:
            label = open(label_name, encoding='utf-8', mode='r')
        except:
            print('No such file or directory: ', label_name)
            continue
        text = text.readlines()
        sentence_list = []
        for sentence in text:
            sentence_list.append(sentence)
        label = label.readlines()
        label_list = []
        for line in label:
            label_list.append(line)
        assert len(label_list)==len(sentence_list),"text name %s, len of text %d, \
                len of label %d"%(text_name, len(sentence_list), len(label_list))
        for index,sentence in enumerate(sentence_list):
            if "\tnon-propaganda" in label_list[index] and sentence.strip()!="":
                global_sentence_label.append([sentence.strip(),0])
            elif "\tpropaganda" in label_list[index] and sentence.strip()!="":
                global_sentence_label.append([sentence.strip(),1])
            else:
                continue
    return global_sentence_label
# load data
global_sentence_label = load_propaganda_task2()

In [6]:

pos_word_dict = dict()
neg_word_dict = dict()
pos_num = 0
neg_num = 0
for ele in global_sentence_label:
    if ele[1]==1:
        pos_num += 1
        word_list = set(ele[0].lower().split(" "))
        for word in word_list:
            if word in pos_word_dict:
                pos_word_dict[word] += 1
            else:
                pos_word_dict[word] = 1
    else:
        neg_num += 1
        word_list = set(ele[0].split(" "))
        for word in word_list:
            if word in neg_word_dict:
                neg_word_dict[word] += 1
            else:
                neg_word_dict[word] = 1
                

In [7]:
pos_word_set = set(pos_word_dict.keys())
neg_word_set = set(neg_word_dict.keys())
all_word_set = pos_word_set.union(neg_word_set)


In [10]:

pos_significant_word_set = set()
neg_significant_word_set = set()
for word in all_word_set:
    if word in pos_word_dict.keys():
        word_in_pos = pos_word_dict[word]
    else:
        word_in_pos = 0
    if word in neg_word_dict.keys():
        word_in_neg = neg_word_dict[word]
    else:
        word_in_neg = 0
        
    if word_in_pos + word_in_neg > 10:
        word_not_in_pos = pos_num - word_in_pos
        word_not_in_neg = neg_num - word_in_neg
        g, p, dof, expctd = chi2_contingency(np.array([[word_in_pos, word_not_in_pos],[word_in_neg, word_not_in_neg]]))
        if p < 0.05:
            if word_in_pos > word_in_neg:
                pos_significant_word_set.add(word)
            else:
                neg_significant_word_set.add(word)

In [11]:
print("num of pos significant words %d, num of neg significant word %d"%(len(pos_significant_word_set), len(neg_significant_word_set)))

num of pos significant words 491, num of neg significant word 792


In [12]:

target_word_name='nrc_emotion'
print("Get pos_"+target_word_name+" Data Begining ......")
nrc_word_dict = dict()
nrc_word_set=set()
if target_word_name == 'nrc_emotion':
        target_data = open('../other_dict/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', encoding='utf-8', mode='r')
        target_data = target_data.readlines()
        for line in target_data:
            line = line.strip('\n\r').lower().split('\t')
            if len(line)==3 and line[2]== "1" and line[0] in pos_significant_word_set:
                nrc_word_set.add(line[0])
        for word in nrc_word_set:
            nrc_word_dict[word] = 1            
print("len of pos nrm word dict is ", len(nrc_word_dict))
new_target_words_path = './result_fake_true_after_reduce_stopwords/true_' + target_word_name + '_words.csv'
with open(new_target_words_path, "w") as new_key_words:
    writer = csv.writer(new_key_words)
    for key in nrc_word_dict.keys():
        word = key
        try:
            word_weight = nrc_word_dict[word]
        except:
            word_weight = 'None'
        writer.writerow([word, word_weight])

target_word_name='nrc_emotion'
print("Get neg_"+target_word_name+" Data Begining ......")
nrc_word_dict = dict()
nrc_word_set=set()
if target_word_name == 'nrc_emotion':
        target_data = open('../other_dict/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', encoding='utf-8', mode='r')
        target_data = target_data.readlines()
        for line in target_data:
            line = line.strip('\n\r').lower().split('\t')
            if len(line)==3 and line[2]== "1" and line[0] in neg_significant_word_set:
                nrc_word_set.add(line[0])
        for word in nrc_word_set:
            nrc_word_dict[word] = 1            
print("len of neg nrm word dict is ", len(nrc_word_dict))
new_target_words_path = './result_fake_true_after_reduce_stopwords/fake_' + target_word_name + '_words.csv'
with open(new_target_words_path, "w") as new_key_words:
    writer = csv.writer(new_key_words)
    for key in nrc_word_dict.keys():
        word = key
        try:
            word_weight = nrc_word_dict[word]
        except:
            word_weight = 'None'
        writer.writerow([word, word_weight])

Get pos_nrc_emotion Data Begining ......
len of pos nrm word dict is  113
Get neg_nrc_emotion Data Begining ......
len of neg nrm word dict is  62


In [32]:
global_test_sentence_label = list()
sentence_test_list = []
label_test_list = []
with open('../other_dict/new_propaganda_test/task2test_rs.csv','r') as csvfile_test_test:
    reader_test = csv.reader(csvfile_test_test)
    for i,rows in enumerate(reader_test):
        row_test = rows
        text_test = row_test[3]
        label =row_test[2]
        if label=="non-propaganda":
            global_test_sentence_label.append([text_test,0])
        elif label=="propaganda":
            global_test_sentence_label.append([text_test,1])
        else:
            continue
# partition to positive/negative
global_test_sentence_label_positive = [pair for pair in global_test_sentence_label if pair[1]==1]
global_test_sentence_label_negative = [pair for pair in global_test_sentence_label if pair[1]==0]
print("num of sentence is %d"%(len(global_test_sentence_label)))
print("num of pos sentence is %d"%(len(global_test_sentence_label_positive)))
print("num of neg sentence is %d"%(len(global_test_sentence_label_negative)))
test_dataset = global_test_sentence_label_negative[:]
test_dataset.extend(global_test_sentence_label_positive[:])

num of sentence is 4032
num of pos sentence is 1239
num of neg sentence is 2793


In [33]:
def load_propaganda_task2():
    '''
    output:[[sentence, label]]
    '''
    global_sentence_label = list()
    
    tesk_2_3_path = '../other_dict/tasks-2-3/train/'
    target_data_2_dirs = os.listdir(tesk_2_3_path)
    article_name_set = set()
    for file_name in target_data_2_dirs:
        article_name_set.add(file_name.split(".")[0])
    for index in list(article_name_set):
        text_name = tesk_2_3_path + index + '.txt'
        try:
            text = open(text_name, encoding='utf-8', mode='r')
        except:
            print('No such file or directory: ', text_name)
            continue
        label_name = tesk_2_3_path + index + '.task2.labels'
        try:
            label = open(label_name, encoding='utf-8', mode='r')
        except:
            print('No such file or directory: ', label_name)
            continue
        text = text.readlines()
        sentence_list = []
        for sentence in text:
            sentence_list.append(sentence)
        label = label.readlines()
        label_list = []
        for line in label:
            label_list.append(line)
        assert len(label_list)==len(sentence_list),"text name %s, len of text %d, \
                len of label %d"%(text_name, len(sentence_list), len(label_list))
        for index,sentence in enumerate(sentence_list):
            if "\tnon-propaganda" in label_list[index] and sentence.strip()!="":
                global_sentence_label.append([sentence,0])
            elif "\tpropaganda" in label_list[index] and sentence.strip()!="":
                global_sentence_label.append([sentence,1])
            else:
                continue
    return global_sentence_label
# load data
global_sentence_label = load_propaganda_task2()
np.random.shuffle(global_sentence_label)
# partition to positive/negative
global_sentence_label_positive = [pair for pair in global_sentence_label if pair[1]==1]
global_sentence_label_negative = [pair for pair in global_sentence_label if pair[1]==0][:3938]
print("num of sentence is %d"%(len(global_sentence_label)))
print("num of pos sentence is %d"%(len(global_sentence_label_positive)))
print("num of neg sentence is %d"%(len(global_sentence_label_negative)))
train_dataset = global_sentence_label_negative[:]
train_dataset.extend(global_sentence_label_positive[:])

num of sentence is 14263
num of pos sentence is 3938
num of neg sentence is 3938


In [41]:
def load_dict(dict_path):
    word_set = set()
    dict_text = open(dict_path, encoding='utf-8', mode='r')
    for line in dict_text:
        if line.strip()!="":
            word =line.split(",")[0]
            word_set.add(word)
    return word_set
def word_num_in_dict(sentence, word_set):
    try:
        word_list = sentence.strip().split(" ")
    except:
        print(sentence)
    word_num = 0
    for word in word_list:
        if word in word_set:
            word_num += 1
    return word_num
def build_feature_matrix_and_label(dataset,dict_names,dict_path):
    num_sample = len(dataset)
    num_feature = len(dict_names)
    feature_matrix = np.zeros((num_sample, num_feature))
    label = np.zeros((num_sample,))
    for i in range(num_sample):
        for j in range(num_feature):
            if 'propaganda' in dict_names[j]:
                dict_path='./result_fake_true_week5/'
            else:
                dict_path="./result_fake_true_after_reduce_stopwords/"
            feature_matrix[i,j] = word_num_in_dict(dataset[i][0], load_dict(dict_path + dict_names[j] + "_words.csv"))
            
        label[i] = dataset[i][1]
    return feature_matrix,label
def build_feature_matrix_new_one_dict(dataset,dict_name,dict_path):
    if 'propaganda' in dict_name:
        dict_path='./result_fake_true_week5/'
    else:
        dict_path="./result_fake_true_after_reduce_stopwords/"
    word_list = list(load_dict(dict_path+dict_name+ "_words.csv"))
    num_sample = len(dataset)
    num_feature = len(word_list)
    feature_matrix = np.zeros((num_sample, num_feature))
    for i in range(len(dataset)):
        for j,word in enumerate(word_list):
            if word in dataset[i][0]:
                feature_matrix[i][j]=1
    return(feature_matrix)

dict_names = ['fake_persuasive','fake_sentiment','fake_subjectivity','fake_technical','fake_all',
              'true_persuasive','true_sentiment','true_subjectivity','true_technical','true_all']
dict_path = "./result_fake_true_after_reduce_stopwords/"
train_feature_matrix, train_label = build_feature_matrix_and_label(train_dataset, dict_names, dict_path)
test_feature_matrix, test_label = build_feature_matrix_and_label(test_dataset, dict_names, dict_path)
print('train_feature_matrix:',len(train_feature_matrix))
print('train_label:',len(train_label))

train_feature_matrix: 7876
train_label: 7876


In [42]:
# one-hot
for dict_name in dict_names:
    train_feature_matrix=np.hstack((build_feature_matrix_new_one_dict(train_dataset,dict_name,dict_path),train_feature_matrix))
    test_feature_matrix=np.hstack((build_feature_matrix_new_one_dict(test_dataset,dict_name,dict_path),test_feature_matrix))
    
print('train_feature_matrix:',train_feature_matrix)

train_feature_matrix: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [43]:
print("shape",train_feature_matrix.shape, train_label.shape, test_feature_matrix.shape, test_label.shape)

shape (7876, 1260) (7876,) (4032, 1260) (4032,)


In [29]:
#gbdt, train
gbdt = GradientBoostingClassifier(max_depth=4,
                                  random_state=0,
                                  min_samples_split=5,
                                  learning_rate=0.01,
                                  n_estimators=30,
                                  subsample=0.8)

sample_weight = np.zeros(train_label.shape,np.float)
sample_weight[train_label==1] = 1.0 / np.sum(train_label==1)
sample_weight[train_label==0] = 1.0 / np.sum(train_label==0)
rf = gbdt.fit(train_feature_matrix, train_label, sample_weight)

print("train================================")

val_score_rbf = gbdt.score(train_feature_matrix, train_label)#val_score_rbf = gbdt.score(x, y)
print("The train accuracy score of rf is : %f" % val_score_rbf)
predict_label = gbdt.predict(train_feature_matrix)
prob_y = gbdt.predict_proba(train_feature_matrix)[:,1]
cm = confusion_matrix(predict_label,train_label)
print("confusion matrix is ", cm)
#plot_confusion_matrix(cm, "GBDT Confusion Matrix")
print(" positive f1:",f1_score(train_label,predict_label,pos_label=True))
print(" negative f1:",f1_score(train_label,predict_label,pos_label=False))
#print('test_feature_matrix:',test_feature_matrix)
gbdt_auc = roc_auc_score(train_label, prob_y)
print('The AUC of GBDT: %.5f' % gbdt_auc)

print("test================================")
test_score_rbf = gbdt.score(test_feature_matrix, test_label)#test_score_rbf = gbdt.score(test_x, test_y)
print("The test accuracy score of rf is : %f" % test_score_rbf)
predict_label = gbdt.predict(test_feature_matrix)
prob_y = gbdt.predict_proba(test_feature_matrix)[:,1]
cm = confusion_matrix(predict_label,test_label)
print("confusion matrix is ", cm)
#plot_confusion_matrix(cm, "GBDT Confusion Matrix")


print(" positive f1:",f1_score(test_label,predict_label,pos_label=True))
print(" negative f1:",f1_score(test_label,predict_label,pos_label=False))

#print('test_feature_matrix:',test_feature_matrix)
gbdt_auc = roc_auc_score(test_label, prob_y)
print('The AUC of GBDT: %.5f' % gbdt_auc)

The train accuracy score of rf is : 0.607796
confusion matrix is  [[2927 2078]
 [1011 1860]]
 positive f1: 0.5463357321192539
 negative f1: 0.6545901822654591
The AUC of GBDT: 0.65401
The test accuracy score of rf is : 0.649554
confusion matrix is  [[2108  728]
 [ 685  511]]
 positive f1: 0.4197125256673511
 negative f1: 0.7489785041748092
The AUC of GBDT: 0.62034
