In [3]:
from Models.functions.preprocessing import clean, labelEncoder
from Models.functions.datasets import loadTrainTest
from sklearn.feature_selection import SelectKBest, chi2, f_regression, f_classif

import numpy as np

In [4]:
params = dict(
    features_maps = [50,50],
    kernel_size = [3,4],
    strides = [1,1],
    dropout_rate = 0.3,
    epochs = 100,
    batch_size = 32,
    embedding_dim = 100,
    max_seq_length = 500,
    max_num_words = 150000,
    pool_size = [2,2,2]
)

In [5]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN

# Synthetic Minority Oversampling Technique (SMOTE)
def oversampling(X, y):
    try:
        X_resampled, y_resampled = SMOTE().fit_resample(X, y)
    except:        
        X_resampled, y_resampled = X, y
        
    return X_resampled, y_resampled

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold

root= '/home/rafael/Dataframe/'
dataset_name = 'brmoral'
lang = 'pt'
task = 'education'
reglog = LogisticRegression(C=100, penalty='l2', multi_class='auto', class_weight='balanced', solver='liblinear')

X, _, y, _ = loadTrainTest(task, dataset_name, root, lang)

In [17]:
X[0][0:200]

'Como direito individual , cada ser humano tem o direito de escolha de preferência de gênero , assim como direito ao matrimônio . Logo é dever da lei em cumprir o direito e liberdade de livre arbítrio '

In [18]:
X = X.apply(clean, lang=lang)
X = X.values # mandatory for pan13

y, n_classes, classes_names = labelEncoder(y)
params['n_classes'] = n_classes

X_train, X_test, y_train, y_test = train_test_split(X, y)

# feature transform
vect = TfidfVectorizer(max_features=None, ngram_range=(1,1), analyzer='word').fit(X_train)

X_train = vect.transform(X_train)

clf = reglog.fit(X_train, y_train)       

# test
X_test1 = vect.transform(X_test)
predicted = clf.predict(X_test1)

In [26]:
X_test1.toarray(), X_test1.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), (87, 8434))

In [20]:
X_test[0]

'cada indivíduo liberdade escolher deseja passar vida união matrimonial , basta existir sentimento mútuo amor entendimento , necessariamente sexos opostos . , grande problema porte armas brasil , é fato vivemos cultura ódio sociedade , , posse armas legalizada , ainda muitos casos homicídios , acredito legalização armas , número somente irá crescer exponencialmente . , cabe cada indivíduo decidir irá fazer vida filho irá conceber , realidade , acredito decisão extremamente difícil delicada mãe , optar aborto , cabe estado negar opção mulheres , pois legalizado , existem clinicas clandestinas fazem operação muitas vezes colocam vida dois risco . , " pena morte é " " saída " " cruel irresponsável parte estado , pois sistema carcerário é precário sobrecarregado diversos fatores , necessidades básicas deveriam ser oferecidas estado saúde , educação moradia , nega fornecer , melhor dizendo , oferece forma totalmente precária deixando população carente mercê sorte . muitas vezes alternativa 

In [23]:
predicted[0], classes_names

(0, array(['s012', 's3', 's4'], dtype=object))

In [24]:
clf.predict(vect.transform(["minha educasao fui muito ruim, n tive opurtunidade de estudar"]))

array([0])

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold

root= '/home/rafael/Dataframe/'
dataset_name = 'brblogset'
lang = 'pt'
task = 'education'
reglog = LogisticRegression(C=100, penalty='l2', multi_class='auto', class_weight='balanced', solver='liblinear')

X, _, y, _ = loadTrainTest(task, dataset_name, root, lang)

# small sample

X = X.apply(clean, lang=lang)
X = X.values # mandatory for pan13

y, n_classes, classes_names = labelEncoder(y)
params['n_classes'] = n_classes

max_length = np.max([len(x.split(" ")) for x in X])
mean_length = int(np.mean([len(x.split(" ")) for x in X]))
median_length = int(np.median([len(x.split(" ")) for x in X]))



X_resampled, y_resampled = oversampling(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled)
# feature transform
vect = TfidfVectorizer(max_features=None, ngram_range=(1,1), analyzer='word').fit(X_train)

X_train = vect.transform(X_train)

clf = reglog.fit(X_train, y_train)       

# test
X_test = vect.transform(X_test)
predicted = clf.predict(X_test)
    #print(classification_report(y_test, predicted))
    print("without k-best {0}".format(f1_score(y_test, predicted, average='weighted')))

    predicted1 = []
    expected1 = []

    predicted2 = []
    expected2 = []

    max_features = 130000

    steps = 10000

    kvalues = [i for i in range(500, max_features, steps)]
    kvalues.append('all')

    for kvalue in kvalues:
        K = StratifiedKFold(n_splits=3)    

        # Cross validation KFolds
        for train_index, test_index in K.split(X, y):

            X_resampled, y_resampled = oversampling(X, y)

            X_train, X_test = X_resampled[train_index], X_resampled[test_index]
            y_train, y_test = y_resampled[train_index], y_resampled[test_index]

            # feature transform
            vect = TfidfVectorizer(max_features=max_features, ngram_range=(1,1), analyzer='word').fit(X_train)
            X_train = vect.transform(X_train).toarray()

            # feature selection
            sel = SelectKBest(k_best_func,k=kvalue)
            ft = sel.fit(X_train, y_train)
            train_best = ft.transform(X_train)

            # train
            clf1 = reglog.fit(train_best, y_train)

            # test
            X_test = vect.transform(X_test).toarray()
            test_best = ft.transform(X_test)
            pred = clf1.predict(test_best)

            expected1.extend(y_test)
            predicted1.extend(pred)        

        #print(classification_report(y_test, predicted))    
        print("kvalue {0} f1score {1}".format(kvalue, f1_score(expected1, predicted1, average='weighted')))

In [54]:
#chi2, f_regression, f_classif
#test(chi2)

In [6]:
#test(f_regression)

In [7]:
#test(f_classif)

### Mean Word Embeddings

### Blinded Models

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

root= '/home/rafael/Dataframe/'
dataset_name = 'brblogset'
lang = 'pt'
task = 'education'
reglog = LogisticRegression(C=100, penalty='l2', multi_class='auto', class_weight='balanced', solver='liblinear')

X, _, y, _ = loadTrainTest(task, dataset_name, root, lang)

# small sample
X = X.apply(clean, lang=lang)
#X = X.values # mandatory for pan13

y, n_classes, classes_names = labelEncoder(y)    
params['n_classes'] = n_classes

max_length = np.max([len(x.split(" ")) for x in X])
mean_length = int(np.mean([len(x.split(" ")) for x in X]))
median_length = int(np.median([len(x.split(" ")) for x in X]))

params['max_seq_length'] = int(mean_length)

In [11]:
def onevsrest(X, y_a):
    max_features = 130000

    K = StratifiedKFold(n_splits=3)
    
    expected1 = []
    predicted1 = []
    
    # Cross validation KFolds
    for train_index, test_index in K.split(X, y_a):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_a[train_index], y_a[test_index]
        
        # feature transform
        vect = TfidfVectorizer(max_features=None, ngram_range=(1,1), analyzer='word').fit(X_train)
        X_train = vect.transform(X_train).toarray()
        X_test = vect.transform(X_test).toarray()

        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
        X_test, y_test = SMOTE().fit_resample(X_test, y_test)
        
        print(len(np.where(y_train == 0)), len(np.where(y_train == 1)), len(np.where(y_test == 0)), len(np.where(y_test == 1)))

        # train
        clf1 = reglog.fit(X_train, y_train)

        # test    
        pred = clf1.predict(X_test)

        expected1.extend(y_test)
        predicted1.extend(pred)        

    print(classification_report(expected1, predicted1))    
    print(confusion_matrix(expected1, predicted1))    
    print("f1score {1}".format(f1_score(expected1, predicted1, average='weighted')))
    
    

current_class = 0
print("Class {0}".format(classes_names[current_class]))
labels = lambda x: 1 if x != current_class else 0
y_a = np.array(list(map(labels, y)))
onevsrest(X, y_a)


current_class = 1
print("Class {0}".format(classes_names[current_class]))
labels = lambda x: 1 if x != current_class else 0
y_a = np.array(list(map(labels, y)))
onevsrest(X, y_a)

current_class = 2
print("Class {0}".format(classes_names[current_class]))
labels = lambda x: 1 if x != current_class else 0
y_a = np.array(list(map(labels, y)))
onevsrest(X, y_a)

current_class = 3
print("Class {0}".format(classes_names[current_class]))
labels = lambda x: 1 if x != current_class else 0
y_a = np.array(list(map(labels, y)))
onevsrest(X, y_a)



Class Básico
1 1 1 1
1 1 1 1
1 1 1 1
              precision    recall  f1-score   support

           0       0.73      0.34      0.46      1011
           1       0.57      0.88      0.69      1011

   micro avg       0.61      0.61      0.61      2022
   macro avg       0.65      0.61      0.58      2022
weighted avg       0.65      0.61      0.58      2022

[[343 668]
 [124 887]]


IndexError: tuple index out of range

### Next steps

In [12]:
vect = TfidfVectorizer(max_features=None, ngram_range=(1,1), analyzer='word').fit(X)


In [13]:
#vect.idf_
vect.vocabulary_
#vect.idf_[i]

{'cafua': 34126,
 'feijoada': 77463,
 'meada': 113404,
 'inexisto': 95621,
 'semicantão': 155368,
 'maisnova': 110261,
 'luzir': 109139,
 'xylitol': 179333,
 'er': 69686,
 'honrados': 90855,
 'requisitada': 147825,
 'naji': 120107,
 'correremos': 48529,
 'ode': 124232,
 'detenções': 58681,
 'chavista': 39341,
 'intímas': 98480,
 'modificava': 116975,
 'botinha': 31492,
 'cotação': 49037,
 'observamos': 123787,
 'perestrello': 131289,
 'naysa': 120691,
 'iyá': 100050,
 'emagrece': 65529,
 'trôpegos': 170838,
 'idyllic': 92367,
 'bloise': 30442,
 'morarmos': 117777,
 'desmotivadas': 57554,
 'despachou': 57739,
 'óxido': 181149,
 'sitiou': 158199,
 'rocs': 150488,
 'ei20048': 64678,
 'sérgia': 163719,
 '32ºc': 5553,
 'financiei': 78772,
 'conjug': 45766,
 'musqueira': 119363,
 'supramencionados': 162881,
 'gwen': 87794,
 'especializado': 71396,
 'nhã': 121699,
 'conectadas': 45112,
 'moderadamente': 116900,
 'bioe': 29763,
 'englishness': 67888,
 'improbabilidade': 93916,
 'direcionadores

In [14]:
from Models.functions.transform import tokenizer_pad_sequence
from Models.functions.vectors import create_embeddings, train_vectors


Using TensorFlow backend.


In [15]:

_, _, _, vect = tokenizer_pad_sequence(X, int(mean_length), params['max_num_words'])

vectors_filename = r'/home/rafael/GDrive/Embeddings/word2vec/'+ dataset_name +'_sg_'+ str(params['embedding_dim']) +'dim.model'        
embedding_type = 1
embedding_matrix = create_embeddings(vect, params['max_num_words'], params['max_seq_length'], name=dataset_name, embedding_dim=params['embedding_dim'], filename=vectors_filename, type=embedding_type, return_matrix=True)


corpus = []
for instance in X_train:
    i = []
    for x in instance.split(" "):
        i.append(x if embedding_matrix[x] else np.zeros(params['embedding_dim']))
    corpus.append(i)

loading embeddings...
Vocab keys 881057
Found 881057 word vectors.
weights 150000


NameError: name 'X_train' is not defined

In [49]:
import pandas as pd
dfs = pd.DataFrame()

labels = np.unique(y)

for label in labels:
    pass

ids = np.where(y==label)[0]
vect = TfidfVectorizer(max_df=0.85, max_features=None, ngram_range=(1,1), analyzer='word').fit(X[ids])
keys = list(vect.vocabulary_.keys())

#idf
#feats_df = vect.vo
#feats_df.label = label

#dfs.append(feats_df)



In [53]:
np.sort(vect.idf_, kind='heapsort')[:10]

array([1.189242  , 1.19388238, 1.25625271, 1.28640575, 1.29152085,
       1.29666225, 1.29666225, 1.31224698, 1.31749634, 1.32807844])

In [None]:
def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()

### Hipothesis

- Feature selection

I was a bit result improvements, from 0.37 to 0.41 f1score.

The second step maybe try to test with CNN and RNN

- One vs Rest

Understand more how i can do this.

Model for one label vs rest of labels seems that dont work

Maybe using oversampling/undersampling


- Anything more?

using the mean word embedding with simple CNN to try and test 