# Find the unfair clauses

In [2]:
import pandas as pd
from numpy.random import RandomState
import numpy as np

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
from keras.callbacks import ModelCheckpoint
from keras.models import model_from_json

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors
from keras.layers import Embedding

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import itertools

Using TensorFlow backend.
unable to import 'smart_open.gcs', disabling that module


In [68]:
class BaseClassifier(object):
    
    def __init__(self):
        self.df = pd.read_csv('Dataset.csv')

        self.train = self.df.sample(frac=0.8, random_state = RandomState())
        self.train = pd.read_csv('Train.csv')
        self.test = self.df.loc[~self.df.index.isin(self.train.index)]
        self.test = pd.read_csv('Test.csv')
        
        cat_label = self.df.Category.unique()
        dic_train = {}
        for i, c in enumerate(cat_label):
            dic_train[c] = i
        self.labels_train = self.train.Category.apply(lambda x:dic_train[x])
        self.labels_test = self.test.Category.apply(lambda x:dic_train[x])
        

        self.NUM_WORDS = 5000
        self.tokenizer = Tokenizer(num_words=self.NUM_WORDS,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)

        self.EMBEDDING_DIM = 300
        self.maxlen=150
    
    def text_to_sequence(self):
        self.tokenizer.fit_on_texts(self.train.Clause)
        sequences_train = self.tokenizer.texts_to_sequences(self.train.Clause)
        self.tokenizer.fit_on_texts(self.test.Clause)
        sequences_test = self.tokenizer.texts_to_sequences(self.test.Clause)
        word_index = self.tokenizer.word_index
        
        print('Found %s unique tokens.' % len(word_index))

        X_train = pad_sequences(sequences_train, padding='post', maxlen=self.maxlen)          
        y_train = to_categorical(np.asarray(self.labels_train[self.train.index]))
        print(X_train)
        
        X_test = pad_sequences(sequences_test, padding='post', maxlen=self.maxlen)
        y_test = to_categorical(np.asarray(self.labels_test[self.test.index]))  
        
        print('Shape of X train tensor:', X_train.shape)
        print('Shape of label train tensor:', y_train.shape)
        return X_train, y_train, X_test, y_test, word_index
    
    def sampling(self, X_train, y_train):
        oversampler = RandomOverSampler(random_state=42, sampling_strategy="minority")
        X_train_sampler, y_train_sampler = oversampler.fit_resample(X_train, y_train)
#         y_train_sampler = to_categorical(y_train_sampler, num_classes=10)
        print(y_train_sampler)
        
#         undersampler = RandomUnderSampler(sampling_strategy=0.5)
#         X_train_sampler, y_train_sampler = undersampler.fit_resample(X_train, y_train)
#         y_train_sampler = to_categorical(y_train_sampler)

#         smote = SMOTE('minority')
#         X_train_sampler, y_train_sampler = smote.fit_resample(X_train, y_train)
#         y_train_sampler = to_categorical(y_train_sampler)
        print(X_train_sampler.shape, y_train_sampler.shape)
                
        return X_train_sampler, y_train_sampler

    def word_to_vec(self, word_index):
        word_vectors = KeyedVectors.load_word2vec_format('w2v_model.bin', binary=True)

        vocab_size = min(len(word_index)+1, self.NUM_WORDS)
        embedding_matrix = np.zeros((vocab_size, self.EMBEDDING_DIM))

        for word, i in word_index.items():
            if i >= self.NUM_WORDS:
                continue
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25), self.EMBEDDING_DIM)

        del(word_vectors)

        embedding_layer = Embedding(vocab_size, self.EMBEDDING_DIM, weights=[embedding_matrix], trainable=True)
        return embedding_layer

    def build_model(self, embedding_layer):
        filter_sizes = [3, 3, 3, 3]
        num_filters = 64
        drop = 0.5

        inputs = Input(shape=(self.maxlen,))
        embedding = embedding_layer(inputs)
        reshape = Reshape((self.maxlen, self.EMBEDDING_DIM, 1))(embedding)

        conv_0 = Conv2D(num_filters, (filter_sizes[0], self.EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
        conv_1 = Conv2D(num_filters, (filter_sizes[1], self.EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
        conv_2 = Conv2D(num_filters, (filter_sizes[2], self.EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)
        conv_3 = Conv2D(num_filters, (filter_sizes[3], self.EMBEDDING_DIM),activation='relu',kernel_regularizer=regularizers.l2(0.01))(reshape)

        maxpool_0 = MaxPooling2D((self.maxlen - filter_sizes[0] + 1, 1), strides=(1,1))(conv_0)
        maxpool_1 = MaxPooling2D((self.maxlen - filter_sizes[1] + 1, 1), strides=(1,1))(conv_1)
        maxpool_2 = MaxPooling2D((self.maxlen - filter_sizes[2] + 1, 1), strides=(1,1))(conv_2)
        maxpool_3 = MaxPooling2D((self.maxlen - filter_sizes[3] + 1, 1), strides=(1,1))(conv_3)

        merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2, maxpool_3], axis=1)
        flatten = Flatten()(merged_tensor)
        reshape = Reshape((4*num_filters,))(flatten)
        dropout = Dropout(drop)(flatten)
        output = Dense(units=10, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
        return inputs, output

    def train_model(self, inputs, output, X_train, y_train):
        class_weight = {
            0 : 1.,
            1 : 1.,
            2 : 10.,
            3 : 3.,
            4 : 5.,
            5 : 16.,
            6 : 20.,
            7 : 11.,
            8 : 12.,
            9 : 100.
        }
        model = Model(inputs, output)
        adam = Adam(lr=1e-3)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
        callback_checkpoint = ModelCheckpoint("weights.h5", monitor='val_acc', verbose=1, save_best_only=True)
        callbacks = [
            EarlyStopping(patience=5, monitor='val_loss'),
            callback_checkpoint
        ]
        history = model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.25, callbacks=callbacks, class_weight=class_weight) 
        return history, model
    
    def evaluate_model(self, history, model, X_test, y_test):
        loss, acc = model.evaluate(X_test, y_test, verbose = 1, batch_size=1024)
        print("Test Accuracy: %f" % (acc*100))
        
        y_pred = model.predict(X_test)
        y_pred = y_pred.round()
        
        print(y_pred)
        print("\n")
    
        target_names = ['First Party Collection/Use', 'Third Party Sharing/Collection', 'Data Security', 'Other', 'User Choice/Control', 'Policy Change', 'Data Retention', 'User Access, Edit and Deletion', 'International and Specific Audiences', 'Do Not Track']
        
        print("Accuracy Score: ", (accuracy_score(y_test, y_pred))*100)
        print(classification_report(y_test, y_pred, target_names = target_names, zero_division='warn'))
        
        cnf_matrix = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1), labels=[0, 1])
        np.set_printoptions(precision=2)
        plt.figure()
        self.plot_confusion_matrix(cnf_matrix, classes=target_names, title='Confusion matrix, without normalization')

        plt.figure(figsize=(20,10))

        plt.subplot(1, 2, 1)
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train','test'], loc='upper left')

        plt.subplot(1, 2, 2)
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train','test'], loc='upper left')
        plt.show()
        
    def plot_confusion_matrix(self, cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        print(cm)

        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        
    def predict(self, model, X_test, y_test):
        model.load_weights("weights.h5")
        predictions = model.predict(np.expand_dims(X_test[50], 0))
        print(self.tokenizer.sequences_to_texts([X_test[50]]))
        print(y_test[50])
        print(predictions)
   
    def getmaxlen(self): 
        return self.maxlen

In [69]:
def main():
    base_classifier = BaseClassifier()
    X_train, y_train, X_test, y_test, word_index = base_classifier.text_to_sequence()
    X_train, y_train = base_classifier.sampling(X_train, y_train)
    embedding_layer = base_classifier.word_to_vec(word_index)
    inputs, output = base_classifier.build_model(embedding_layer)
    history, model = base_classifier.train_model(inputs, output, X_train, y_train)
    print(model.summary())
    base_classifier.evaluate_model(history, model, X_test, y_test)
    base_classifier.predict(model, X_test, y_test)
    with open('Base Classifier.json', "w") as json_file:
        json_file.write(model.to_json())
        
#     

In [70]:
if __name__=="__main__": main()

Found 6206 unique tokens.
[[2115    2  808 ...    0    0    0]
 [4109 2961 3772 ...    4   36   63]
 [ 275    7   10 ...    1    8  745]
 ...
 [   8  331   37 ...    0    0    0]
 [   4    5   33 ...    0    0    0]
 [   5   11  124 ...    0    0    0]]
Shape of X train tensor: (17116, 150)
Shape of label train tensor: (17116, 10)
[[0 0 0 ... 0 1 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
(23985, 150) (23985, 10)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 17988 samples, validate on 5997 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 1.00000, saving model to weights.h5
Epoch 2/10
 3200/17988 [====>.........................] - ETA: 1:14 - loss: 4.8708 - acc: 0.5041

KeyboardInterrupt: 

NameError: name 'fairness_classifier' is not defined