<a href="https://colab.research.google.com/github/shaina-ashraf/Author-Profiling-Human-Bot-Gender-classification-/blob/main/cc_Arguments_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
from __future__ import absolute_import
from __future__ import print_function

from copy import copy
import sklearn
import bz2
import pickle
import unicodedata
import seaborn as sns

import os
import sys
import keras
import numpy as np
import keras.backend as keras_backend

from os import listdir
from xml.etree import ElementTree
from keras import Input
from keras.layers import merge, Lambda, Activation, Convolution1D
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing import sequence as sequence_module
from keras.layers import LSTM
from keras.layers import Activation
from keras.layers import BatchNormalization

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.metrics.confusionmatrix import ConfusionMatrix
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from nltk import RegexpParser

from theano.scalar import float32




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Tokenization and Word Embeddings


In [None]:
def sentence_tokenizer(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    :param text: raw text to split
    :return: the value returned by ``sent_tokenize(text)``
    """
    sentences = sent_tokenize(text)
    return sentences

def generate_pos_tags(text, patterns=""):
    """Part-of-speech tag the whitespace-separated tokens of ``text``.

    :param text: raw text; it is split on whitespace before tagging
    :param patterns: not used by this function; kept in the signature
    :return: the value returned by NLTK's ``pos_tag`` for the token list
    """
    whitespace_tokens = text.split()
    return pos_tag(whitespace_tokens)

def tokenize(s):
    """Tokenize ``s`` with NLTK's ``TweetTokenizer`` and NFKD-normalize each token.

    :param s: raw text
    :return: list of normalized token strings, in the tokenizer's order
    """
    tweet_tokenizer = TweetTokenizer()
    return [unicodedata.normalize('NFKD', token) for token in tweet_tokenizer.tokenize(s)]


def load_embeddings(saved_embeddings):
    """Load word frequencies and word embeddings from a bz2-compressed pickle.

    The pickle is expected to contain a 2-tuple
    ``(frequencies, word_embedding_map)``.

    :param saved_embeddings: path of the ``.pkl.bz2`` file
    :return: tuple ``(frequencies, word_embedding_map)`` as stored in the file
    """
    # NOTE(security): unpickling runs arbitrary code embedded in the file;
    # only load embedding files that come from a trusted source.
    # The context manager closes the compressed stream even if unpickling fails.
    with bz2.BZ2File(saved_embeddings, 'r') as compressed_stream:
        frequencies, word_embedding_map = pickle.load(compressed_stream, encoding='latin1')
    return frequencies, word_embedding_map


def dictionary_and_embeddings_to_indices(word_frequencies, embeddings):
    """
    Sort words by frequency, add an offset of 3 to their ranks, map the
    resulting indices to embeddings and generate embeddings for padding,
    start of sequence, and OOV.

    Reserved indices: 0 = padding (all zeros), 1 = start of sequence
    (random vector), 2 = OOV (random vector). Words whose embedding is
    missing or ``None`` share the OOV vector.

    :param word_frequencies: dict (word: frequency)
    :param embeddings: dict (word: embeddings array); values may be None
    :return: word_to_indices_map (word: index), word_index_to_embeddings_map (index: embeddings)
    :raises ValueError: if ``embeddings`` holds no non-None vector, so the
        embedding dimension cannot be determined
    """
    # most frequent words first, so frequent words receive small indices
    words_by_frequency = sorted(word_frequencies, key=word_frequencies.get, reverse=True)

    # indices 0, 1 and 2 are reserved (padding, start of sequence, OOV)
    offset = 3

    # The dimension is taken from the first *usable* vector: entries may be
    # None (they are replaced by the OOV vector below), so the very first
    # value is not guaranteed to have a length.
    reference_vector = next(
        (vector for vector in embeddings.values() if vector is not None), None)
    if reference_vector is None:
        raise ValueError("embeddings must contain at least one non-None vector")
    embedding_dimension = len(reference_vector)

    # padding is all zeros; start of sequence and OOV are random in [-0.1, 0.1)
    # (the two draws stay in this order so a seeded run is reproducible)
    vector_padding = [0.0] * embedding_dimension
    vector_start_of_sequence = 2 * 0.1 * np.random.rand(embedding_dimension) - 0.1
    vector_oov = 2 * 0.1 * np.random.rand(embedding_dimension) - 0.1

    word_to_indices_map = dict()
    # inserted in index order; callers build the embedding matrix from .values()
    word_index_to_embeddings_map = {
        0: vector_padding,
        1: vector_start_of_sequence,
        2: vector_oov,
    }

    for new_index, word in enumerate(words_by_frequency, start=offset):
        word_to_indices_map[word] = new_index
        vector = embeddings.get(word)
        # missing / None embeddings fall back to the shared OOV vector
        word_index_to_embeddings_map[new_index] = vector_oov if vector is None else vector

    return word_to_indices_map, word_index_to_embeddings_map


def load_wordembedding_file(serialized_file='/content/gdrive/MyDrive/WMCA/data/vocabulary.embeddings.all.pkl.bz2'):
    """Load the serialized vocabulary/embeddings file and index it.

    :param serialized_file: path of the bz2-compressed pickle to load
    :return: tuple ``(word_to_indices_map, word_index_to_embeddings_map)`` as
        produced by ``dictionary_and_embeddings_to_indices``
    """
    print("Load ALL.. ****")
    print(serialized_file)
    frequencies, vectors_by_word = load_embeddings(serialized_file)
    return dictionary_and_embeddings_to_indices(frequencies, vectors_by_word)

# Loading Data from CSV Files

In [None]:
def load_csv(directory, file_name, word_to_indices_map, max_words=None, reduced_label_set=False):
    """Read one tab-separated data file and turn it into index and label vectors.

    The first line of the file is treated as a header and skipped. Every other
    line must have at least four tab-separated fields: instance id, a
    comma-separated label list, argument 1 and argument 2. Lines with fewer
    fields are skipped; fields beyond the fourth are ignored.

    Each instance becomes ``[1] + indices(arg 1) + [1] + indices(arg 2)``
    where 1 is the start-of-sequence index and unknown words map to 2 (OOV).
    The label vector has one slot per known label and is set to 1 for every
    label listed on the line.

    :param directory: directory containing the file
    :param file_name: name of the file inside ``directory``
    :param word_to_indices_map: dict (word: index)
    :param max_words: if truthy, every word index >= max_words is replaced by 2 (OOV)
    :param reduced_label_set: if True, use the coarse labels "o5", "o6", "o7"
        and match each label by its prefix before the first underscore;
        labels outside that set are ignored
    :return: tuple ``(instances, labels, ids)`` of equally long lists
    """
    # the label inventory does not depend on the line, so build it once
    if reduced_label_set:
        all_labels = ["o5", "o6", "o7"]
    else:
        all_labels = ["o5_1", "o5_2", "o5_3", "o6_1", "o6_2", "o6_3", "o7_1", "o7_2", "o7_3", "o7_4", "o8_1", "o8_4",
                      "o8_5", "o9_1", "o9_2", "o9_3", "o9_4"]

    x_vectors = []
    y_labels = []
    id_vector = []

    with open(os.path.join(directory, file_name), 'r') as data_file:
        # skip the header line; tolerate a completely empty file
        next(data_file, None)

        for line in data_file:
            print("********* START ***********")
            line_split_list = line.split('\t')
            if len(line_split_list) < 4:
                continue
            # only the first four fields are meaningful
            arg_id, label, arg_1, arg_2 = line_split_list[:4]

            id_vector.append(arg_id)

            # unknown words map to index 2 (OOV)
            arg_1_indices = [word_to_indices_map.get(word, 2) for word in tokenize(arg_1)]
            arg_2_indices = [word_to_indices_map.get(word, 2) for word in tokenize(arg_2)]

            # join them into one vector, start with 1 for start_of_sequence, add also 1 in between
            x = [1] + arg_1_indices + [1] + arg_2_indices

            # multi-hot label vector
            y = np.zeros(len(all_labels))
            for l in label.split(','):
                print("Label " + l)
                sup_label = l.split('_')[0]
                print("SupLabel " + sup_label)
                # the reduced set only knows the coarse prefix ("o5_1" -> "o5")
                lookup_label = sup_label if reduced_label_set else l
                if lookup_label in all_labels:
                    y[all_labels.index(lookup_label)] = 1

            print('Y vector: ', y, 'for class', label)

            x_vectors.append(x)
            y_labels.append(y)
            print("********* END ***********")

    # replace all word indices larger than nb_words with OOV
    if max_words:
        x_vectors = [[2 if word_index >= max_words else word_index for word_index in x] for x in x_vectors]

    return x_vectors, y_labels, id_vector


def load_csv_data(directory, test_split=0.2, max_words=None, reduced_label_set=False):
    """Build leave-one-file-out folds from every file in ``directory``.

    Each file is parsed once with ``load_csv``. For every file a fold is
    created in which that file is the test set and the concatenation of all
    other files is the training set.

    :param directory: directory whose entries are all loaded as data files
    :param test_split: not used by this function
    :param max_words: forwarded to ``load_csv``
    :param reduced_label_set: forwarded to ``load_csv``
    :return: tuple ``(folds, word_index_to_embeddings_map)`` where ``folds``
        maps file name -> {"test": (x, y, ids), "training": (x, y, ids)}
    """
    fold_names = listdir(directory)

    word_to_indices_map, word_index_to_embeddings_map = load_wordembedding_file()

    # parse every file exactly once; folds below only recombine these results
    parsed_files = dict()
    for fold_name in fold_names:
        parsed_files[fold_name] = load_csv(directory, fold_name, word_to_indices_map, max_words,
                                           reduced_label_set)
    print("Loaded", len(parsed_files), "files")

    folds = dict()
    for held_out in fold_names:
        # the held-out file is skipped once; everything else is training data
        other_names = copy(fold_names)
        other_names.remove(held_out)

        train_x, train_y, train_ids = [], [], []
        for other in other_names:
            part_x, part_y, part_ids = parsed_files[other]
            train_x.extend(part_x)
            train_y.extend(part_y)
            train_ids.extend(part_ids)

        folds[held_out] = {
            "test": parsed_files[held_out],
            "training": (train_x, train_y, train_ids),
        }

    return folds, word_index_to_embeddings_map




In [None]:
# Mount Google Drive at /content/gdrive; the data and embedding paths used in
# this notebook live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from sklearn.metrics import confusion_matrix
def get_label_from_vector(vector):
    """Return the label name at the position of the largest entry of ``vector``.

    :param vector: sequence of scores, one per label of the 17-label set
    :return: label string; on ties the first maximal position wins
    """
    label_names = (
        "o5_1", "o5_2", "o5_3",
        "o6_1", "o6_2", "o6_3",
        "o7_1", "o7_2", "o7_3", "o7_4",
        "o8_1", "o8_4", "o8_5",
        "o9_1", "o9_2", "o9_3", "o9_4",
    )
    winner = int(np.asarray(vector).argmax())
    return label_names[winner]

def get_model(X_train, y_train, embeddings, batch_size, epochs, max_len, max_features, output_vals=17):
    """Build, compile and train the feed-forward multi-label classifier.

    Architecture: Dense(256, relu) -> Dropout -> BatchNorm -> Dense(128, relu)
    -> Dropout -> BatchNorm -> Dense(output_vals) -> sigmoid.

    :param X_train: training inputs with ``max_len`` features per row
    :param y_train: multi-hot target vectors with ``output_vals`` entries
    :param embeddings: not used by this model; kept in the signature
    :param batch_size: mini-batch size for training
    :param epochs: number of training epochs
    :param max_len: input width of the first layer
    :param max_features: not used by this model; kept in the signature
    :param output_vals: number of output units (labels)
    :return: the trained Keras model
    """
    print("___ Deveoloping Nueral Network Architecture__")
    model = Sequential()
    model.add(Dense(256, activation='relu', input_shape=(max_len,)))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(output_vals))
    # Targets are multi-hot and the loss is per-label binary cross-entropy, so
    # every output needs an independent probability; softmax would force the
    # outputs to sum to one and allow at most one label above 0.5.
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_crossentropy'])
    # Keyword arguments on purpose: positionally, the 5th and 6th arguments of
    # fit() are `verbose` and `callbacks`, not `validation_split` and `verbose`.
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1, verbose=0)
    return model


def run__main__():
    """Run the 17-label experiment as leave-one-file-out cross-validation.

    For every fold returned by ``load_csv_data`` the model from ``get_model``
    is trained on the training part and evaluated on the held-out file.
    Per fold, the Hamming loss, the "OneError" value and the element-wise
    accuracy are printed and stored. At the end, a confusion matrix over all
    folds and the per-fold result dictionary are printed.
    """
    np.random.seed(1000)  # for reproducibility
    max_features = 20000
    max_len = 300  # cut texts after this number of words (among top max_features most common words)
    batch_size = 20
    epochs = 5  # 5 epochs are meaningful to prevent over-fitting...

    # The folder is read with listdir(), so it has to be a local directory.
    # The data originates from
    # https://github.com/UKPLab/emnlp2016-empirical-convincingness/tree/master/data/CSV-format
    # and must be copied to this location first.
    input_folder = '/content/gdrive/MyDrive/WMCA/data/CSV-format'

    folders, word_index_to_embeddings_map = load_csv_data(input_folder, max_words=max_features)

    # identical for every fold, so build the matrix once
    embeddings = np.asarray([np.array(x, dtype=np.float32) for x in word_index_to_embeddings_map.values()])

    all_folders_org = []
    all_folders_predicted = []

    final_results = {}
    for folder, fold_data in folders.items():
        x_matrix_train, y_matrix_train, _ = fold_data["training"]
        x_matrix_test, y_matrix_test, ids_test = fold_data["test"]

        # pad / truncate every index sequence to max_len
        x_matrix_train = sequence_module.pad_sequences(x_matrix_train, maxlen=max_len)
        x_matrix_test = sequence_module.pad_sequences(x_matrix_test, maxlen=max_len)
        y_matrix_train = np.array(y_matrix_train)
        y_matrix_test = np.array(y_matrix_test)

        print('**** GETTING MODEL ***')
        model = get_model(x_matrix_train, y_matrix_train, embeddings, batch_size, epochs, max_len,
                          max_features)

        print('**** Prediction ****')
        model_predict = np.array(model.predict(x_matrix_test, batch_size=batch_size))
        # threshold every output at 0.5 to obtain a multi-hot prediction
        predicted_labels = np.round(model_predict)

        # hamming loss
        hamming_loss = sklearn.metrics.hamming_loss(y_matrix_test, predicted_labels)

        # NOTE(review): the counter below is incremented when the prediction
        # agrees with the gold vector at the gold arg-max position, so the
        # value stored as 'OneError' is a hit rate - confirm the intended metric.
        one_error_raw = 0.0
        file_resutlts = {}
        for a, b in zip(y_matrix_test, predicted_labels):
            max_value_index = np.argmax(a)
            if b[max_value_index] == a[max_value_index]:
                one_error_raw += 1.0
        one_error = one_error_raw / y_matrix_test.shape[0]
        file_resutlts['OneError'] = one_error
        file_resutlts['HummingLoss'] = hamming_loss

        print("One error:", one_error, folder)
        print("Hamming loss:", hamming_loss, folder)

        # collect wrong predictions
        wrong_predictions_ids = []
        for i, (a, b) in enumerate(zip(y_matrix_test, predicted_labels)):
            all_folders_org.append(get_label_from_vector(a))
            # take the arg-max of the raw scores: after rounding, a row can be
            # all zeros and would always resolve to the first label
            all_folders_predicted.append(get_label_from_vector(model_predict[i]))

            # an instance is wrong when the two multi-hot vectors differ anywhere
            if not np.array_equal(a, b):
                wrong_predictions_ids.append(ids_test[i])

        accuracy = keras.metrics.Accuracy()
        acc = accuracy(y_matrix_test, predicted_labels)
        file_resutlts['Accuracy'] = acc.numpy()
        print('Test accuracy:', acc.numpy())
        print('Wrong predictions:', wrong_predictions_ids)

        final_results[folder] = file_resutlts

    cm = ConfusionMatrix(all_folders_org, all_folders_predicted)
    print(cm)

    print(final_results)


# Run Main

In [None]:
# Run the cross-validation experiment defined in run__main__().
run__main__()

FileNotFoundError: ignored

# Flaw Detection


In [None]:
def get_label_from_vector(vector):
    """Return the label name at the position of the largest entry of ``vector``.

    A 3-element vector is interpreted over the reduced label set
    ("o5", "o6", "o7"), which is what the flaw-detection run produces with
    ``reduced_label_set=True`` and three output classes. Any other vector is
    interpreted over the ten fine-grained o5-o7 labels.

    :param vector: sequence of scores or a one-hot / multi-hot vector
    :return: label string; on ties the first maximal position wins
    """
    reduced_labels = ["o5", "o6", "o7"]
    fine_grained_labels = ["o5_1", "o5_2", "o5_3", "o6_1", "o6_2", "o6_3", "o7_1", "o7_2", "o7_3", "o7_4"]

    scores = np.asarray(vector)
    # pick the label inventory that matches the vector width, so reduced-set
    # vectors are not reported with fine-grained "o5_*" names
    all_labels = reduced_labels if scores.size == len(reduced_labels) else fine_grained_labels
    return all_labels[int(np.argmax(scores))]


def get_model(X_train, y_train, embeddings, batch_size, nb_epoch, max_len, max_features, nb_classes):
    """Build, compile and train the single hidden-layer softmax classifier.

    Architecture: Dense(512) -> relu -> Dropout(0.5) -> Dense(nb_classes) -> softmax,
    trained with categorical cross-entropy and the Adam optimizer.

    :param X_train: training inputs with ``max_len`` features per row
    :param y_train: target vectors with ``nb_classes`` entries
    :param embeddings: not used by this model; kept in the signature
    :param batch_size: mini-batch size for training
    :param nb_epoch: number of training epochs
    :param max_len: input width of the first layer
    :param max_features: not used by this model; kept in the signature
    :param nb_classes: number of output units
    :return: the trained Keras model
    """
    network = Sequential([
        Dense(512, input_shape=(max_len,)),
        Activation('relu'),
        Dropout(0.5),
        Dense(nb_classes),
        Activation('softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # 10% of the training data is held out for validation; training is silent
    network.fit(X_train, y_train, epochs=nb_epoch, batch_size=batch_size, validation_split=0.1, verbose=0)

    return network

In [None]:
def __main__():
    """Run the reduced-label (3 class) flaw-detection experiment.

    Data is loaded with ``reduced_label_set=True`` and evaluated as
    leave-one-file-out cross-validation. Per fold, the Hamming loss, the
    "OneError" value and the element-wise accuracy are stored and printed.
    At the end, a confusion matrix over all folds and the per-fold results
    are printed, and one ``id<TAB>gold<TAB>predicted`` line per test instance
    is written to the results file.
    """
    np.random.seed(1337)  # for reproducibility
    max_features = 20000
    max_len = 300  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32
    nb_epoch = 5  # 5 epochs are meaningful to prevent over-fitting...
    nb_classes = 3

    # path of the results file (the name is historical; it is a file, not a folder)
    output_folder = '/content/gdrive/MyDrive/WMCA/data/sdata/results.txt'
    input_folder = '/content/gdrive/MyDrive/WMCA/data/CSV-format'

    folders, word_index_to_embeddings_map = load_csv_data(input_folder, max_words=max_features, reduced_label_set=True)

    # converting embeddings to numpy 2d array; identical for every fold, so built once
    embeddings = np.asarray([np.array(x, dtype=np.float32) for x in word_index_to_embeddings_map.values()])

    all_folders_org = []
    all_folders_predicted = []
    all_output_id_pred_lines = []

    final_results = {}
    for folder, fold_data in folders.items():
        print("Fold name ", folder)
        x_matrix_train, y_matrix_train, _ = fold_data["training"]
        x_matrix_test, y_matrix_test, ids_test = fold_data["test"]

        print(len(x_matrix_train), 'train sequences')
        print(len(x_matrix_test), 'test sequences')

        print("Pad sequences (samples x time)")
        x_matrix_train = sequence_module.pad_sequences(x_matrix_train, maxlen=max_len)
        x_matrix_test = sequence_module.pad_sequences(x_matrix_test, maxlen=max_len)
        print('x_matrix_train shape:', x_matrix_train.shape)
        print('x_matrix_test shape:', x_matrix_test.shape)

        y_matrix_test = np.array(y_matrix_test)
        y_matrix_train = np.array(y_matrix_train)

        model = get_model(x_matrix_train, y_matrix_train, embeddings, batch_size, nb_epoch, max_len,
                          max_features, nb_classes)

        print('Prediction')
        model_predict = np.array(model.predict(x_matrix_test, batch_size=batch_size))
        # threshold every output at 0.5
        predicted_labels = np.round(model_predict)

        hamming_loss = sklearn.metrics.hamming_loss(y_matrix_test, predicted_labels)

        # collect wrong predictions
        wrong_predictions_ids = []

        # NOTE(review): the counter below is incremented when the prediction
        # agrees with the gold vector at the gold arg-max position, so the
        # value stored as 'OneError' is a hit rate - confirm the intended metric.
        one_error_raw = 0.0
        # set once per fold so the key exists even for an empty test set
        file_resutlts = {'HummingLoss': hamming_loss}
        for i, (a, b) in enumerate(zip(y_matrix_test, predicted_labels)):
            label_org = get_label_from_vector(a)
            # take the arg-max of the raw scores: after rounding, a row can be
            # all zeros and would always resolve to the first label
            label_predicted = get_label_from_vector(model_predict[i])
            all_folders_org.append(label_org)
            all_folders_predicted.append(label_predicted)

            max_value_index = np.argmax(a)
            if b[max_value_index] == a[max_value_index]:
                one_error_raw += 1.0

            instance_id = ids_test[i]
            if label_org != label_predicted:
                wrong_predictions_ids.append(instance_id)

            all_output_id_pred_lines.append(str(instance_id) + '\t' + label_org + '\t' + label_predicted)

        one_error = one_error_raw / y_matrix_test.shape[0]
        file_resutlts['OneError'] = one_error

        accuracy = keras.metrics.Accuracy()
        acc = accuracy(y_matrix_test, predicted_labels)
        file_resutlts['Accuracy'] = acc.numpy()
        print('Test accuracy:', acc.numpy())

        print('Wrong predictions:', wrong_predictions_ids)
        final_results[folder] = file_resutlts

    cm = ConfusionMatrix(all_folders_org, all_folders_predicted)
    print(cm.pretty_format())
    print(final_results)

    # the context manager flushes and closes the results file
    with open(output_folder, 'w') as results_file:
        results_file.writelines("%s\n" % item for item in all_output_id_pred_lines)


In [None]:
# Run the reduced-label experiment defined in __main__().
__main__()