In [1]:
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import os
import json
import re
import csv

from scipy.stats import chi2_contingency
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
import math
np.random.seed(0)

  from numpy.core.umath_tests import inner1d


In [2]:
# generate NELA feature for task1, see NELA project
# load task1 seperated data
def load_NELA_feature():
        feature_path = "../other_dict/tasks-2-3/train_harvest_tool_out/processed_data.csv"
        fileName_feature_dict = dict()
        with open(feature_path,'r') as f:
            for line in f.readlines():
                line_split = line.strip().split(",")
                fileName_feature_dict[line_split[0]] = [float(i) for i in line_split[3:]]
                #print('fileName_feature_dict:',fileName_feature_dict[line_split[0]])
        return fileName_feature_dict 
task1_NELA_dict = load_NELA_feature()

feature_label_list = []
for fileName,feature in task1_NELA_dict.items():
    feature_label = feature.append(fileName.split("_")[1]=="propaganda")
    feature_label_list.append(feature)
np.random.shuffle(feature_label_list)
# partition to positive/negative
global_sentence_label_positive = [pair for pair in feature_label_list if pair[-1]==True]
global_sentence_label_negative = [pair for pair in feature_label_list if pair[-1]==False][:3934]
print(len(feature_label_list),len(global_sentence_label_positive),len(global_sentence_label_negative))

train_dataset = global_sentence_label_negative[:int(len(global_sentence_label_negative)*0.9)]
train_dataset.extend(global_sentence_label_positive[:int(len(global_sentence_label_positive)*0.9)])
#print('global_sentence_label_negative:',global_sentence_label_negative[:2])
#print('global_sentence_label_positive:',global_sentence_label_positive[:2])
validation_dataset = global_sentence_label_negative[int(len(global_sentence_label_negative)*0.9):]
validation_dataset.extend(global_sentence_label_positive[int(len(global_sentence_label_negative)*0.9):])

#test_dataset = global_sentence_label_negative[int(len(global_sentence_label_negative)*0.9):]
#test_dataset.extend(global_sentence_label_positive[int(len(global_sentence_label_positive)*0.9):])



def convert_to_array(dataset):
    feature = np.array([line[:-1] for line in dataset])
    label = np.array([line[-1] for line in dataset])
    return feature,label
train_feature_matrix, train_label = convert_to_array(train_dataset)
validation_feature_matrix, validation_label = convert_to_array(validation_dataset)
#test_feature_matrix, test_label = convert_to_array(train_dataset)

14153 3934 3934


In [3]:

# generate NELA feature for task1, see NELA project
# load task1 seperated data
def load_NELA_test_feature():
        feature_test_path = "../other_dict/new_propaganda_test/train_harvest_tool_out/processed_data.csv"
        fileName_feature_test_dict = dict()
        with open(feature_test_path,'r') as f_test:
            for line in f_test.readlines():
                line_test_split = line.strip().split(",")
                fileName_feature_test_dict[line_test_split[0]] = [float(i) for i in line_test_split[3:]]
        return fileName_feature_test_dict 
task1_NELA_test_dict = load_NELA_test_feature()
feature_test_label_list = []
for fileName_test,feature_test in task1_NELA_test_dict.items():
    #print('fileName.split("_")[1]:',fileName.split("_")[2])
    feature_test_label = feature_test.append(fileName_test.split("_")[2]=="propaganda")
    feature_test_label_list.append(feature_test)
#print('feature_test_label_list:',feature_test_label_list[:5])

# partition to positive/negative
global_sentence_test_label_positive = [pair for pair in feature_test_label_list if pair[-1]==True]
global_sentence_test_label_negative = [pair for pair in feature_test_label_list if pair[-1]==False]
print(len(feature_test_label_list),len(global_sentence_test_label_positive),len(global_sentence_test_label_negative))
#print('global_sentence_test_label_positive:',global_sentence_test_label_positive[:2])
test_dataset = global_sentence_test_label_negative
test_dataset.extend(global_sentence_test_label_positive)

def convert_to_array(dataset):
    feature = np.array([line[:-1] for line in dataset])
    label = np.array([line[-1] for line in dataset])
    return feature,label

test_feature_matrix, test_label = convert_to_array(test_dataset)


4014 1235 2779


In [4]:
#gbdt, train
gbdt = GradientBoostingClassifier(max_depth=4,
                                  random_state=0,
                                  min_samples_split=5,
                                  learning_rate=0.01,
                                  n_estimators=30,
                                  subsample=0.8)
sample_weight = np.zeros(train_label.shape,np.float)
sample_weight[train_label==1] = 1.0 / np.sum(train_label==1)
sample_weight[train_label==0] = 1.0 / np.sum(train_label==0)
rf = gbdt.fit(train_feature_matrix, train_label, sample_weight)

print("train================================")

val_score_rbf = gbdt.score(train_feature_matrix, train_label)#val_score_rbf = gbdt.score(x, y)
print("The train accuracy score of rf is : %f" % val_score_rbf)
predict_label = gbdt.predict(train_feature_matrix)
prob_y = gbdt.predict_proba(train_feature_matrix)[:,1]
cm = confusion_matrix(predict_label,train_label)
print("confusion matrix is ", cm)
#plot_confusion_matrix(cm, "GBDT Confusion Matrix")
print(" positive f1:",f1_score(train_label,predict_label,pos_label=True))
print(" negative f1:",f1_score(train_label,predict_label,pos_label=False))
#print('test_feature_matrix:',test_feature_matrix)
gbdt_auc = roc_auc_score(train_label, prob_y)
print('The AUC of GBDT: %.5f' % gbdt_auc)


print("test================================")
test_score_rbf = gbdt.score(test_feature_matrix, test_label)#test_score_rbf = gbdt.score(test_x, test_y)
print("The test accuracy score of rf is : %f" % test_score_rbf)
predict_label = gbdt.predict(test_feature_matrix)
prob_y = gbdt.predict_proba(test_feature_matrix)[:,1]
#cm = confusion_matrix(predict_label,test_label)
#print("confusion matrix is ", cm)
#plot_confusion_matrix(cm, "GBDT Confusion Matrix")

from sklearn.metrics import precision_score, recall_score, f1_score


print(" positive f1:",f1_score(test_label,predict_label,pos_label=True))
print(" negative f1:",f1_score(test_label,predict_label,pos_label=False))
f1 = f1_score(test_label, predict_label, average='binary')
print("f1:", f1 )
p = precision_score(test_label, predict_label, average='binary')
print("precision:", p)
r = recall_score(test_label, predict_label,average='binary')
print("recall:", r)
#print('test_feature_matrix:',test_feature_matrix)
gbdt_auc = roc_auc_score(test_label, prob_y)
print('The AUC of GBDT: %.5f' % gbdt_auc)



The train accuracy score of rf is : 0.686582
confusion matrix is  [[2378 1057]
 [1162 2483]]
 positive f1: 0.6911621433542101
 negative f1: 0.6818637992831542
The AUC of GBDT: 0.74818
The test accuracy score of rf is : 0.667663
 positive f1: 0.5526492287055667
 negative f1: 0.735632183908046
f1: 0.5526492287055667
precision: 0.4716657126502576
recall: 0.6672064777327935
The AUC of GBDT: 0.72363
