In [10]:
from Models.functions.preprocessing import clean, labelEncoder
from Models.functions.datasets import loadTrainTest
from sklearn.feature_selection import SelectKBest, chi2, f_regression, f_classif

import numpy as np

In [11]:
params = dict(
    features_maps = [50,50],
    kernel_size = [3,4],
    strides = [1,1],
    dropout_rate = 0.3,
    epochs = 100,
    batch_size = 32,
    embedding_dim = 100,
    max_seq_length = None,
    max_num_words = 150000,
    pool_size = [2,2,2]
)

In [12]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN

# Synthetic Minority Oversampling Technique (SMOTE)
def oversampling(X, y):
    try:
        X_resampled, y_resampled = SMOTE().fit_resample(X, y)
    except:
        X_resampled, y_resampled = X, y
        
    return X_resampled, y_resampled

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold

root= '/home/rafael/Dataframe/'
dataset_name = 'brblogset'
lang = 'pt'
task = 'education'
reglog = LogisticRegression(C=100, penalty='l2', multi_class='auto', class_weight='balanced', solver='liblinear')

X, _, y, _ = loadTrainTest(task, dataset_name, root, lang)

# small sample

X = X.apply(clean, lang=lang)
X = X.values # mandatory for pan13

y, n_classes, classes_names = labelEncoder(y)    
params['n_classes'] = n_classes

max_length = np.max([len(x.split(" ")) for x in X])
mean_length = int(np.mean([len(x.split(" ")) for x in X]))
median_length = int(np.median([len(x.split(" ")) for x in X]))

def test(k_best_func):
    # train    
    print(k_best_func)
    
    X_resampled, y_resampled = oversampling(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled)
    # feature transform
    vect = TfidfVectorizer(max_features=None, ngram_range=(1,1), analyzer='word').fit(X_train)

    X_train = vect.transform(X_train)

    clf = reglog.fit(X_train, y_train)       

    # test
    X_test = vect.transform(X_test)
    predicted = clf.predict(X_test)
    #print(classification_report(y_test, predicted))
    print("without k-best {0}".format(f1_score(y_test, predicted, average='weighted')))

    predicted1 = []
    expected1 = []

    predicted2 = []
    expected2 = []

    max_features = 130000

    steps = 10000

    kvalues = [i for i in range(500, max_features, steps)]
    kvalues.append('all')

    for kvalue in kvalues:
        K = StratifiedKFold(n_splits=3)    

        # Cross validation KFolds
        for train_index, test_index in K.split(X, y):

            X_resampled, y_resampled = oversampling(X, y)

            X_train, X_test = X_resampled[train_index], X_resampled[test_index]
            y_train, y_test = y_resampled[train_index], y_resampled[test_index]

            # feature transform
            vect = TfidfVectorizer(max_features=max_features, ngram_range=(1,1), analyzer='word').fit(X_train)
            X_train = vect.transform(X_train).toarray()

            # feature selection
            sel = SelectKBest(k_best_func,k=kvalue)
            ft = sel.fit(X_train, y_train)
            train_best = ft.transform(X_train)

            # train
            clf1 = reglog.fit(train_best, y_train)

            # test
            X_test = vect.transform(X_test).toarray()
            test_best = ft.transform(X_test)
            pred = clf1.predict(test_best)

            expected1.extend(y_test)
            predicted1.extend(pred)        

        #print(classification_report(y_test, predicted))    
        print("kvalue {0} f1score {1}".format(kvalue, f1_score(expected1, predicted1, average='weighted')))

In [16]:
#chi2, f_regression, f_classif
test(chi2)

<function chi2 at 0x7f22c0518510>
without k-best 0.38280478866235296
kvalue 500 f1score 0.36243228170994485
kvalue 10500 f1score 0.3788179664834903
kvalue 20500 f1score 0.38332446176543183
kvalue 30500 f1score 0.38937148578297487
kvalue 40500 f1score 0.3931316886030413
kvalue 50500 f1score 0.3957167902930736
kvalue 60500 f1score 0.39734938819269866
kvalue 70500 f1score 0.39834614394073586
kvalue 80500 f1score 0.3989369960610064
kvalue 90500 f1score 0.39979246092788334
kvalue 100500 f1score 0.4005394316641394
kvalue 110500 f1score 0.40124277605308617
kvalue 120500 f1score 0.4021995714038674
kvalue all f1score 0.40284021457716856


In [17]:
test(f_regression)

<function f_regression at 0x7f22c0518598>
without k-best 0.4150201281659471
kvalue 500 f1score 0.31741917310352735
kvalue 10500 f1score 0.33463976606079837
kvalue 20500 f1score 0.3340084522876956
kvalue 30500 f1score 0.33238739110730703
kvalue 40500 f1score 0.33299977628391614
kvalue 50500 f1score 0.3378590248537846
kvalue 60500 f1score 0.342481926128817
kvalue 70500 f1score 0.3453567583497951
kvalue 80500 f1score 0.34743937806250064
kvalue 90500 f1score 0.35000670626963293
kvalue 100500 f1score 0.352449079617735
kvalue 110500 f1score 0.3551640366281456
kvalue 120500 f1score 0.35949009648390157
kvalue all f1score 0.3638403201460455


In [18]:
test(f_classif)

<function f_classif at 0x7f22c0518400>
without k-best 0.38015688265534847
kvalue 500 f1score 0.3576694641008894
kvalue 10500 f1score 0.36562383204129456
kvalue 20500 f1score 0.37213684598912466
kvalue 30500 f1score 0.3776351693813369
kvalue 40500 f1score 0.37856001402995737
kvalue 50500 f1score 0.37723362155531176
kvalue 60500 f1score 0.37668953813538003
kvalue 70500 f1score 0.3769983126624702
kvalue 80500 f1score 0.37793881766634163
kvalue 90500 f1score 0.37947469331770434
kvalue 100500 f1score 0.3813366872416895
kvalue 110500 f1score 0.38255617196398906
kvalue 120500 f1score 0.38468373757761226
kvalue all f1score 0.3866670008439158


### Mean Word Embeddings

In [None]:
vectors_filename = r'/home/rafael/GDrive/Embeddings/word2vec/'+ ds_name +'_sg_'+ str(params['embedding_dim']) +'dim.model'
# vectors_filename = r'/home/rafael/GDrive/Embeddings/en_wordvectors/wiki-news-300d-1M.vec'
# vectors_filename = r'/home/rafael/GDrive/Embeddings/nilc/fasttext_pt_skip_s'+ str(params['embedding_dim']) +r'.txt'        
# WINDOWS
# vectors_filename = r'C:/Users/Rafael Sandroni/Google Drive/Mestrado/Data/Embeddings/fasttext/'+dataset_name+r'_sg_100dim.model'
# vectors_filename = r'/home/rafael/GDrive/Embeddings/nilc/fasttext_pt_skip_s'+ str(params['embedding_dim']) +r'.txt'        
embedding_type = 1

embedding_layer = create_embeddings(vect, params['max_num_words'], params['max_seq_length'], name=dataset_name, embedding_dim=params['embedding_dim'], filename=vectors_filename, type=embedding_type)
