In [1]:
%matplotlib inline

from Models.functions.plot import ROC, plot_confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from Models.functions.datasets import getDatasets
import pandas as pd
from nltk.corpus import stopwords
import re, string
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#from Models.functions.plot import ROC, plot_confusion_matrix

In [3]:
from bs4 import BeautifulSoup

def labelEncoder(y):
    le = LabelEncoder()
    le.fit(y)

    return (le.transform(y), len(le.classes_), list(le.classes_))

def clean(doc):
    """
    Cleaning a document by several methods:
        - Lowercase
        - Removing whitespaces
        - Removing numbers
        - Removing stopwords
        - Removing punctuations
        - Removing short words
    """
    stop_words = set(stopwords.words('portuguese'))
    
    # Lowercase
    # doc = doc.lower()    
    # Remove HTML codes
    doc = BeautifulSoup(doc).get_text()    
    # Remove numbers
    # doc = re.sub(r"[0-9]+", "", doc)
    # remove HTML space code
    # tokens = tokens.replace('&nbsp', string.whitespace)
    # Split in tokens
    tokens = doc.split()
    # Remove Stopwords
    tokens = [w for w in tokens if not w in stop_words]
    # Remove punctuation
    tokens = [w.translate(str.maketrans('', '', string.punctuation)) for w in tokens]
    # Tokens with less then two characters will be ignored
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

In [4]:
task = 'age'
dataset_name = 'b5post'

datasets = getDatasets(task,'df', dataset_name)
for i in datasets.iterrows():

    name = i[1]['dataset_name']
    label = task
    ds_path = i[1]['path']

    # load training and test dataframes
    training_path = ds_path + '/' + i[1]['training']        
    #test_path = ds_path + '/' + i[1]['test']      

    df_training = pd.read_csv(training_path)#, usecols=cols)        
    #df_test = pd.read_csv(test_path)#, usecols=cols)        

    df_training['text'] = df_training['text'].apply(clean)
    #df_test['text'] = df_test['text'].apply(clean)
    X = df_training['text'].values
    y, n_classes, classes_name = labelEncoder(df_training[label].values)


In [5]:
len(df_training)

413

In [6]:
def max_length(lines):
    """
    Calculate the maximum document length
    """
    return max([len(s.split()) for s in lines])

In [7]:
max_length = max_length(X)

result = [len(x.split()) for x in X]
print('Text informations:')
print('max length: %i / min length: %i / mean length: %i ' % (np.max(result),
                                                                np.min(result),
                                                                np.mean(result)))

Text informations:
max length: 13257 / min length: 5 / mean length: 1356 


In [8]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) -> 
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = TfidfVectorizer().fit(corpus)
    # vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [9]:
print(df_training.groupby('age').size())
c = df_training.groupby('age').size().index

age
a18-20    151
a23-25    149
a28-61    113
dtype: int64


In [10]:
for i in range(n_classes):
    c1 = df_training[df_training[task] == c[i]]
    print(classes_name[i])
    text = [line.replace("\n", "") for line in c1.text]
    common_words = get_top_n_words(text, 10)
    for word, freq in common_words:
        print(word, freq)
    print("--")

a18-20
kkk 15.979267484563353
pra 10.78384720354864
dia 6.866689025860669
vida 6.42851646423661
amo 6.121366382072447
amor 5.917618962370609
tudo 5.689905327180669
eu 5.453057261057598
ser 5.372193072377578
sempre 5.330079930652694
--
a23-25
kkk 13.633582905704925
pra 10.842666305988573
dia 7.382414047442628
the 6.332044976063848
ser 6.31510427109672
vai 5.265019628541888
vida 5.088711823430247
hoje 4.827417837207707
não 4.7311203010830765
tudo 4.68773141459932
--
a28-61
kkk 8.91021131151838
dia 6.6435723365180595
pra 6.578095272590237
ser 4.448005020599045
vida 3.8935273526569536
deus 3.7615819916111364
hoje 3.728788939320164
não 3.6258170792201794
bom 3.5654900783899084
todos 3.466517565373286
--


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
vec = TfidfVectorizer().fit(X_train)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)

In [12]:
from sklearn.linear_model import LogisticRegressionCV


clf = LogisticRegressionCV(cv=5, max_iter=500, multi_class='auto')
clf = clf.fit(X_train, y_train)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=classes_name))
print()
cm = confusion_matrix(y_test, y_pred)
print(cm)
#plot_confusion_matrix(cm, classes_name)

              precision    recall  f1-score   support

      a18-20       0.47      0.61      0.53        28
      a23-25       0.57      0.47      0.52        34
      a28-61       0.63      0.57      0.60        21

   micro avg       0.54      0.54      0.54        83
   macro avg       0.56      0.55      0.55        83
weighted avg       0.55      0.54      0.54        83


[[17 10  1]
 [12 16  6]
 [ 7  2 12]]


In [14]:
X_train.shape, X_test.shape

((330, 47102), (83, 47102))

In [15]:
import cnn_model

# EMBEDDING
MAX_NUM_WORDS  = 50000 #15000
EMBEDDING_DIM  = 300
MAX_SEQ_LENGTH = X_train.shape[1] or 3200 #200
USE_GLOVE      = False

# MODEL
FILTER_SIZES   = [3,4,5]
FEATURE_MAPS   = [10,10,10]
DROPOUT_RATE   = 0.5

# LEARNING
BATCH_SIZE     = 20
NB_EPOCHS      = 40
RUNS           = 5
VAL_SIZE       = 0.2

def create_model():
    
    model = cnn_model.build_cnn(
            embedding_layer=emb_layer,
            num_words=MAX_NUM_WORDS,
            embedding_dim=EMBEDDING_DIM,
            filter_sizes=FILTER_SIZES,
            feature_maps=FEATURE_MAPS,
            max_seq_length=MAX_SEQ_LENGTH,
            dropout_rate=DROPOUT_RATE
    )
    
    model.compile(
            loss='binary_crossentropy',
            optimizer=Adadelta(clipvalue=3),
            metrics=['accuracy']
    )
    return model

Using TensorFlow backend.


In [17]:
#import cnn_model
#from sklearn.model_selection import StratifiedKFold
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier

histories = []
test_loss = []
test_accs = []

predicted_y = []
expected_y = []

#K = StratifiedKFold(n_splits=2)

X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

emb_layer = 'non-static'
#if USE_GLOVE:
    #emb_layer = create_glove_embeddings()


cnn = create_model()

model = KerasClassifier(build_fn=cnn)

history = model.fit(
    X_train, y_train,
    epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=0,
    validation_data=(X_val, y_val),
    callbacks=[#ModelCheckpoint('model-%i.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
               ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, min_lr=0.01),
               EarlyStopping(monitor='val_loss', min_delta=0.1, patience=4, verbose=1)
              ]
)

y_pred = model.predict(X_test, verbose=1)

Creating CNN 0.0.1
#############################################
Embedding:    using pre-trained embedding
Vocabulary size: 50000
Embedding dim: 300
Filter sizes: [3, 4, 5]
Feature maps: [10, 10, 10]
Max sequence: 47102
#############################################


TypeError: 'str' object is not callable

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=classes_name))
print()
cm = confusion_matrix(y_test, y_pred)
print(cm)
#plot_confusion_matrix(cm, classes_name)