In [1]:

import numpy as np
import pandas as pd
import pickle

from Models.functions.plot import plot_history, full_multiclass_report, plot_confusion_matrix
from Models.functions.preprocessing import clean, labelEncoder
from Models.functions.datasets import loadTrainTest
from Models.functions.utils import checkFolder, listProblems
from Models.functions.transform import tokenizer_pad_sequence
from Models.functions.vectors import create_embeddings, train_vectors
from Models.functions.cnn_model import build_cnn1


from keras.layers import Activation, Input, Dense, Flatten, Dropout, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras import regularizers
from keras.models import Model, Sequential
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report, confusion_matrix

import itertools
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE, ADASYN
import collections, numpy
import gc
from time import time, sleep

results_dataframe = "/reports_grid/results.csv"
try:
    results = pd.read_csv(results_dataframe)
except:
    results = pd.DataFrame()

# In[4]:

import numpy as np
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score



Using TensorFlow backend.


In [2]:

def create_model(filters = [10], kernel_size = [5], strides = [100], 
                 dropout_rate = 0.5, pool_size = [5], dense_units = 512, max_len = 1000, n_classes = 2, optimizer = 'rmsprop'):

    model = Sequential()

    # conv 1
    model.add(Conv1D(filters = filters[0], 
                     kernel_size = kernel_size[0],
                     strides = strides[0], 
                     activation = 'relu', 
                     input_shape = (max_len, 100)))
                     #activity_regularizer = regularizers.l2(0.2)))

    # pooling layer 1
    
    model.add(MaxPooling1D(pool_size = pool_size[0], strides = 1))
    model.add(Activation('relu'))
    """
    model.add(Conv1D(filters = filters[1], 
                     kernel_size = kernel_size[1],
                     strides = strides[0], 
                     activation = 'relu',
                     activity_regularizer = regularizers.l2(0.2)))
    
    model.add(MaxPooling1D(pool_size = pool_size[1], strides = 1))
    model.add(Activation('relu'))

    model.add(Conv1D(filters = filters[2], 
                     kernel_size = kernel_size[2],
                     strides = strides[0], 
                     activation = 'relu',
                     activity_regularizer = regularizers.l2(0.2)))
    
    model.add(MaxPooling1D(pool_size = pool_size[2], strides = 1))
    model.add(Activation('relu'))
    """
    model.add(Flatten())
    
    if dropout_rate is not None:
        model.add(Dropout(dropout_rate))
        
    model.add(Dense(units = dense_units, activation = 'relu'))
    model.add(Dense(units = n_classes, activation = 'softmax'))

    #TODO: test others foss functions: https://keras.io/losses/
    model.compile(optimizer = optimizer, loss='categorical_crossentropy', metrics = ['accuracy'])
    return model

In [3]:
def garbage_collection(): 
    gc.collect()
    print("gargabe colletion...")
    sleep(3)

# In[6]:

task = "gender"
dataset_name = "brmoral"
lang = "pt"
root = "/home/rafael/Dataframe/"

# Synthetic Minority Oversampling Technique (SMOTE)
def oversampling(X, y):
    try:
        X_resampled, y_resampled = SMOTE().fit_resample(X, y)
    except:
        X_resampled, y_resampled = X, y
        
    return X_resampled, y_resampled
    # return X, y

def train_val_metrics(histories):
    print('Training: \t%0.4f loss / %0.4f acc' % (get_avg(histories, 'loss'), get_avg(histories, 'acc')))
    print('Validation: \t%0.4f loss / %0.4f acc' % (get_avg(histories, 'val_loss'), get_avg(histories, 'val_acc')))

def get_avg(histories, his_key):
    tmp = []
    for history in histories:
        tmp.append(history[his_key][np.argmin(history['val_loss'])])
    return np.mean(tmp)

In [4]:
import sys

run_all = True

g_root              = '/home/rafael/Dataframe/'

filter_dataset_name = 'brblogset'

filter_task         = 'education'

report_version = 'Reports_emb'

if run_all == True:
    args = []
    problems = listProblems(filter_dataset_name, filter_task)
    print("############################################")
    print(" RUNNING {0} PROBLEMS".format(len(problems)))

    # create a list of tasks
    for task, dataset_name, lang in problems:

        #args.append([task, dataset_name, g_root, lang])
        print(" Dataset: ",dataset_name," / Task:",task," / Lang:",lang)
        #run(task, dataset_name, g_root, lang)    
print(task, dataset_name, g_root, lang)

############################################
 RUNNING 1 PROBLEMS
 Dataset:  brblogset  / Task: education  / Lang: pt
education brblogset /home/rafael/Dataframe/ pt


In [5]:
#def run(task, dataset_name, root, lang, report_version = 'Reports'):

histories = []
test_loss = []
test_accs = []

predicted_y = []
predicted_y_proba = []
expected_y = []

directory='./'+ report_version +'/'+task+'/'+dataset_name+'_'+lang+'/'

checkFolder(directory)

X, _, y, _ = loadTrainTest(task, dataset_name, root, lang)

X = X.apply(clean, lang=lang)
X = X.values # mandatory for pan13

y, n_classes, classes_names = labelEncoder(y)    

max_length = np.max([len(x.split(" ")) for x in X])
mean_length = np.mean([len(x.split(" ")) for x in X])
median_length = np.median([len(x.split(" ")) for x in X])

print("max: ", max_length, " / mean: ", mean_length, " / median: ", median_length)

max:  142632  / mean:  3224.6881463802706  / median:  1595.0


In [6]:
params = dict(
    filters = [50, 50, 50],
    kernel_size = [3],
    strides = [1, 1, 1],
    dropout_rate = 0.15,
    pool_size = [2],
    epochs = 100,
    batch_size = 12,
    embedding_dim = 100,
    dense_units = [512],
    max_num_words = None,
    max_seq_length = None
)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

text = X
length = [len(x) for x in text]

tokenizer = Tokenizer(num_words=None, char_level=True)

tokenizer.fit_on_texts(text)

sequences = tokenizer.texts_to_sequences(text)

word_index = tokenizer.word_index

# MAX_SEQ_LENGTH = np.max(arr_length)
params['max_seq_length'] = int(np.mean(length))


# Padding all sequences to same length of `max_seq_length`
X2 = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
max_seq_length

In [None]:
X2.shape

In [None]:
#tokenizer

In [None]:
tfidf = TfidfVectorizer(ngram_range=(3,3), analyzer='char')
tfidf.fit(X)

In [7]:
#vect.index_word[648], 
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors


#vectorName = r'/home/rafael/GDrive/Embeddings/fasttext/'+ dataset_name +'_sg_'+ str(params['embedding_dim']) +'dim.model'        
vectorName = r'/home/rafael/GDrive/Embeddings/nilc/fasttext_pt_skip_s100.txt'
embedding_type = 1
#model_ug_sg = Word2Vec.load(vectorName)
model_ug_sg = KeyedVectors.load_word2vec_format(vectorName, binary=False, unicode_errors="ignore")

embeddings_index = {}
for w in model_ug_sg.wv.vocab.keys():
    #embeddings_index[w] = np.append(model_ug_cbow.wv[w],model_ug_sg.wv[w])
    embeddings_index[w] = model_ug_sg.wv[w]

print('Found %s word vectors.' % len(embeddings_index))


  del sys.path[0]
  from ipykernel import kernelapp as app


Found 929605 word vectors.


In [8]:
num_words = int(mean_length)
embedding_dim = 100
print("num words {0} / emb_dim {1}".format(num_words, embedding_dim))

new_X = []
for texts in X:        
    instance = np.zeros((num_words, embedding_dim))
    i = 0
    for word in texts.split(" "):
        new_word = []
        for w in word:
            if w in 'abcdefghijklmnopqrstuvxyz':
                new_word.append(w)
        word = "".join(new_word)
        
        
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word.lower())
        if embedding_vector is not None:            
            instance[i] = embedding_vector
        else:
            instance[i] = np.zeros(embedding_dim)
            
        i += 1
    new_X.append(instance)

new_X = np.array(new_X)

print("new_X", new_X.shape)

num words 3224 / emb_dim 100
new_X (1257, 3224, 100)


In [9]:
# words not found
count_zeros = lambda x: 1 if sum(x) == 0 else 0
c2 = 0
c1 = 0
for x in new_X:
    s = sum(list(map(count_zeros, x)))
    c1 += s
    c2 += len(x)
    
print("zeros {0} / total {1}".format(c1, c2))
print( c1 / c2 )

zeros 2251884 / total 4052568
0.555668405810834


In [None]:
X1 = []
m = 0
for x in X:
    i = []
    for index in range(3, len(x), 1):
        ch = str(x[index-3:index])
        if ch in tfidf.vocabulary_:
            index = tfidf.vocabulary_[ch]
            try:
                emb = embedding_matrix[index]
            except:
                emb = np.zeros(100)
            idf = 1            
            emb = emb * idf
        else:
            m += 1
            emb = np.zeros(100)
            
        i.append(emb)
        pass
    X1.append(np.mean(i, axis=1))
    pass
X1 = np.array(X1)


print(m)

In [10]:
y1 = to_categorical(y, n_classes)

X_train, X_test, y_train, y_test = train_test_split(new_X, y1, test_size = 0.2)

In [11]:
X_train.shape, y_train.shape, X_test.shape, X_train.shape[2]

((1005, 3224, 100), (1005, 4), (252, 3224, 100), 100)

In [12]:
model = None
model = create_model(
                        max_len=X_train.shape[1],
                        n_classes=n_classes,
                        filters=params['filters'],
                        kernel_size=params['kernel_size'],
                        strides=params['strides'],                        
                        dropout_rate=params['dropout_rate'],
                        pool_size=params['pool_size']
                    )
"""
model = build_cnn1(
                embedding_layer=embedding_layer,
                num_words=params['max_num_words'],
                embedding_dim=params['embedding_dim'],
                filter_sizes=[3],
                feature_maps=[50],
                max_seq_length=100,
                dropout_rate=params['dropout_rate'] or None,
                dense_units=params['dense_units'] or 512,
                n_classes=n_classes,
                pool_size=[2,2],
                strides=[1,2]
        )
"""

model.compile(
                loss='categorical_crossentropy',
                #loss='mean_squared_error',
                optimizer='rmsprop',
                metrics=['accuracy','mae']
        )

model.summary()
        
## Then train it and display the results
history = model.fit(X_train,
                    y_train,                            
                    validation_split=0.2,   
                    verbose = 0,
                    batch_size=params['batch_size'],                                
                    epochs=500,
                    callbacks=[
                        #ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, min_lr=0.01),
                        EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=0)
                ])        

y_pred_proba = model.predict(X_test, batch_size=params['batch_size'])
predicted_y_proba.extend(y_pred_proba)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 3222, 50)          15050     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 3221, 50)          0         
_________________________________________________________________
activation_1 (Activation)    (None, 3221, 50)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 161050)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 161050)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               82458112  
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 2052      
Total para

In [13]:
print()
y_test = np.argmax(y_test,axis=1)
y_pred = y_pred_proba.argmax(axis=1)

# 3. Print accuracy score
print("Accuracy : "+ str(accuracy_score(y_test,y_pred)))    
print("F1-Score : "+ str(f1_score(y_test,y_pred,average="macro")))
print("")

# 4. Print classification report
print("Classification Report")
print(classification_report(y_test,y_pred, digits=3, target_names=classes_names, output_dict=False))

print(confusion_matrix(y_test,y_pred))

plot_confusion_matrix(confusion_matrix(y_test,y_pred), classes=classes_names, directory='/tmp/', normalize=True)



Accuracy : 0.3412698412698413
F1-Score : 0.3016862804131136

Classification Report
                     precision    recall  f1-score   support

             Básico      0.450     0.188     0.265        48
       Pós-graduado      0.369     0.488     0.421        84
  Superior Completo      0.312     0.417     0.357        72
Superior Incompleto      0.240     0.125     0.164        48

          micro avg      0.341     0.341     0.341       252
          macro avg      0.343     0.304     0.302       252
       weighted avg      0.344     0.341     0.324       252

[[ 9 18 17  4]
 [ 5 41 30  8]
 [ 6 29 30  7]
 [ 0 23 19  6]]
Normalized confusion matrix
[[0.1875     0.375      0.35416667 0.08333333]
 [0.05952381 0.48809524 0.35714286 0.0952381 ]
 [0.08333333 0.40277778 0.41666667 0.09722222]
 [0.         0.47916667 0.39583333 0.125     ]]


<matplotlib.figure.Figure at 0x7f9b7c69bcc0>

In [None]:

K = StratifiedKFold(n_splits=3)
idx = 0
for train_index, test_index in K.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) 
    vect = None
    vect = TfidfVectorizer(max_features=None)        
    X_train = vect.fit_transform(X_train).toarray()
    X_test = vect.transform(X_test).toarray()

    X_train, y_train = oversampling(X_train, y_train)
    X_test,  y_test  = oversampling(X_test, y_test)

    y_train = to_categorical(y_train, n_classes)
    y_test  = to_categorical(y_test, n_classes)

    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    # validation
    validation_split = 0.1
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = validation_split) 



    print("TFIDF")      

    print("Word embedding")

    _, _, _, vect  = tokenizer_pad_sequence(X[train_index], params['max_num_words'],  params['max_seq_length'])    

    vectors_filename = r'/home/rafael/GDrive/Embeddings/word2vec/'+ dataset_name +'_sg_'+ str(params['embedding_dim']) +'dim.model'        
    embedding_type = 1
    embedding_matrix = create_embeddings(vect, params['max_num_words'], params['max_seq_length'], name=dataset_name, embedding_dim=params['embedding_dim'], filename=vectors_filename, type=embedding_type, return_matrix=True)        
    print(embedding_matrix.shape)


    ## create the model with the best params found
    #model = KerasClassifier(build_fn=create_model,
    model = None
    model = create_model(
                            max_len=X_train.shape[1],
                            n_classes=n_classes,
                            filters=params['filters'],
                            kernel_size=params['kernel_size'],
                            strides=params['strides'],                        
                            dropout_rate=params['dropout_rate'],
                            pool_size=params['pool_size']
                        )

    ## Then train it and display the results
    history = model.fit(X_train,
                        y_train,                            
                        validation_data=(X_val, y_val),                            
                        verbose = 1,
                        batch_size=params['batch_size'],                                
                        epochs=params['epochs'],
                        callbacks=[
                            #ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, min_lr=0.01),
                            EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=0)
                    ])        

    y_pred_proba = model.predict(X_test, batch_size=params['batch_size'])
    predicted_y_proba.extend(y_pred_proba)

    binary = False # True if len(classes) < 3 else False
    # 1. Transform one-hot encoded y_test into their class number
    if not binary:
        y_test = np.argmax(y_test,axis=1)

    # 2. Predict classes and stores 
    #y_pred = model.predict(X_test, batch_size=params['batch_size'])        
    y_pred = y_pred_proba.argmax(axis=1)

    predicted_y.extend(y_pred)
    expected_y.extend(y_test)
    histories.append(history.history)
    garbage_collection()

del X, y, model, vect

expected_y = np.array(expected_y)
predicted_y = np.array(predicted_y)
predicted_y_proba = np.array(predicted_y)

np.save(directory + '/expected.numpy', expected_y)
np.save(directory + '/predicted.numpy', predicted_y)
np.save(directory + '/predicted_proba.numpy', predicted_y_proba)
with open(directory + '/histories.pkl', 'wb') as f:
    pickle.dump(histories, f)

# metrics    
train_val_metrics(histories)

# plot_history(histories, directory)

# y_pred = model.predict(x, batch_size=batch_size)

# 3. Print accuracy score
print("Accuracy : "+ str(accuracy_score(expected_y,predicted_y)))    
print("F1-Score : "+ str(f1_score(expected_y,predicted_y,average="macro")))    
print("")

# 4. Print classification report
print("Classification Report")
report = pd.DataFrame(
    classification_report(expected_y, predicted_y, digits=3, target_names=classes_names, output_dict=True)
)
report = report.transpose()
accuracy = accuracy_score(expected_y, predicted_y)
report['accuracy'] = [accuracy] * (n_classes + 3)    
report.to_csv(directory + '/report.csv')
print(report)

# 5. Plot confusion matrix
cnf_matrix = confusion_matrix(expected_y,predicted_y)    
np.save(directory + "/confusion_matrix", np.array(cnf_matrix))    
plot_confusion_matrix(cnf_matrix, classes=classes_names, directory=directory, normalize=True)

# 6. Clean memory
garbage_collection()
gc.collect()

print("+"+"-"*50+"+")
print()