In [1]:
import sklearn
# Show the installed scikit-learn version; classification_report(output_dict=True),
# used in model(), is only available from scikit-learn 0.20 onward.
print(sklearn.__version__)

0.20.0


### baseline2: tfidf + Logistic Regression

In [2]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from time import time

import numpy as np
import pandas as pd
#from functions.models import 
from functions.datasets import getDatasets
from functions.metrics import evaluator
from functions.plot import ROC, plot_confusion_matrix

In [2]:
# Identifier of this baseline; run() appends it to the dataset name that is
# passed to the ROC / confusion-matrix helpers and used in the report directory.
baseline = 'baseline2'

In [5]:
def labelEncoder(y):
    """Encode the labels in ``y`` as integers with a freshly fitted LabelEncoder.

    Returns a 3-tuple ``(encoded_y, n_classes, class_names)``: the transformed
    labels, the number of distinct classes found, and those classes as a list
    in the encoder's own order.
    """
    encoder = LabelEncoder().fit(y)
    class_names = list(encoder.classes_)
    encoded = encoder.transform(y)

    return (encoded, len(encoder.classes_), class_names)

In [6]:
def clean(text):
    """Normalise a raw document before vectorisation.

    Newlines, '-', '...', '*', '/', '+' and backslashes are each replaced by a
    single space, every run of spaces is then collapsed to one space, and
    leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    space = ' '
    for token in ('\n', '-', '...', '*', '/', '+', '\\'):
        text = text.replace(token, space)

    # A single pass of fixed-width replacements is not enough: three or four
    # consecutive spaces would still leave two behind. Repeat until no
    # double space remains so runs of any length collapse to one space.
    while space * 2 in text:
        text = text.replace(space * 2, space)

    return text.strip()

### Perform classification for each problem / task

In [7]:
def model(X, y, n_classes, classes_name, params):
    """Evaluate a TF-IDF + logistic-regression classifier with 10-fold stratified CV.

    For every fold a TfidfVectorizer is fitted on the training split only, a
    liblinear LogisticRegression is trained on the resulting features, and the
    predictions for the held-out split are accumulated, so the returned arrays
    contain out-of-fold results for the whole input.

    Parameters
    ----------
    X : array-like of str
        Documents. Must support indexing with an integer index array
        (e.g. a numpy array), because the folds are selected with X[indices].
    y : array-like
        Labels aligned with ``X``; same indexing requirement.
    n_classes : int
        Not used by this function; kept so existing callers keep working.
    classes_name : list
        Class display names, passed to classification_report as target_names.
    params : dict or None
        Hyper-parameters under the keys 'vect__max_features', 'vect__max_df',
        'clf__C' and 'clf__penalty'. A missing key (or a None value) falls
        back to the scikit-learn default instead of being passed through.

    Returns
    -------
    tuple
        ``(report, expected_y, predicted_y, score_y)`` where ``report`` is the
        transposed classification report as a DataFrame and the other three
        are numpy arrays of true labels, predicted labels and predicted
        class probabilities.
    """
    params = params or {}

    def _param(key, default):
        # Treat an absent key and an explicit None the same way.
        value = params.get(key)
        return default if value is None else value

    # Hyper-parameters are loop-invariant, so resolve them once.
    clf_C = _param('clf__C', 1.0)
    clf_penalty = _param('clf__penalty', 'l2')

    vect = TfidfVectorizer(
        max_features=_param('vect__max_features', None),
        max_df=_param('vect__max_df', 1.0),
    )

    K = StratifiedKFold(n_splits=10)

    predicted_y = []
    expected_y = []
    score_y = []

    for train_index, test_index in K.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the vocabulary / idf weights on the training split only.
        X_train = vect.fit_transform(X_train)
        X_test = vect.transform(X_test)

        # A fresh classifier per fold so no state carries over between folds.
        clf = LogisticRegression(C=clf_C, penalty=clf_penalty, solver='liblinear')
        clf.fit(X_train, y_train)

        predicted_y.extend(clf.predict(X_test))
        expected_y.extend(y_test)
        score_y.extend(clf.predict_proba(X_test))

    report = pd.DataFrame(
        classification_report(
            expected_y,
            predicted_y,
            digits=5,
            target_names=classes_name,
            output_dict=True,
        )
    )
    # One row per class / average, one column per metric.
    report = report.transpose()

    return (
        report,
        np.asarray(expected_y),
        np.asarray(predicted_y),
        np.asarray(score_y)
        )

In [8]:
def run(task, dataset_name, output = None):
    """Cross-validate the baseline for one task/dataset and save the reports.

    For each row returned by ``getDatasets(task, 'df', dataset_name)`` the
    training CSV is loaded, its ``text`` column is passed through ``clean``,
    the ``task`` column is label-encoded and ``model`` is run. The report
    (extended with ``roc`` and ``accuracy`` columns) and the confusion matrix
    are written as CSV files under
    ``./Reports/<task>/<dataset_name>/<baseline>/`` and the report is printed.

    Parameters
    ----------
    task : str
        Name of the label column in the training CSV; also part of the
        report path.
    dataset_name : str
        Dataset identifier passed to ``getDatasets`` and ``getBestParams``.
    output : queue-like or None, optional
        When given, exactly one summary dict
        ``{'task', 'dataset', 'reports'}`` is put on it before this function
        returns or raises, so a parent doing one ``get()`` per worker never
        waits forever. ``reports`` holds ``report.to_dict()`` for every
        dataset row that completed.

    Returns
    -------
    None
    """
    # Keep the caller's dataset_name untouched: it is still needed for the
    # parameter lookup on every iteration. Re-assigning it inside the loop
    # would append the baseline suffix once per dataset row.
    report_name = dataset_name + '/' + baseline
    directory = './Reports/' + task + '/' + report_name + '/'

    collected = []
    try:
        datasets = getDatasets(task, 'df', dataset_name)
        for _, row in datasets.iterrows():

            # load the training dataframe
            training_path = row['path'] + '/' + row['training']
            df_training = pd.read_csv(training_path)

            df_training['text'] = df_training['text'].apply(clean)
            X_train = df_training['text'].values
            y_train, n_classes, classes_name = labelEncoder(df_training[task].values)

            # NOTE(review): getBestParams is not defined or imported in the
            # visible cells (the functions.models import is commented out) --
            # confirm it is available before running on a fresh kernel.
            params = getBestParams(task, dataset_name)

            report, expected_y, predicted_y, score_y = model(X_train, y_train, n_classes, classes_name, params)

            # get ROC
            # presumably ROC returns a dict with one entry per class plus a
            # 'macro' entry; the layout below relies on that -- verify
            # against functions.plot.ROC.
            roc_c = ROC(expected_y, score_y, n_classes, task, report_name, classes_name)
            report['roc'] = list(roc_c.values()) + [roc_c['macro']] * 2

            # compute accuracy; a scalar is broadcast to every report row,
            # so no assumption about the number of rows is needed
            report['accuracy'] = accuracy_score(expected_y, predicted_y)

            # compute confusion matrix
            c_matrix = confusion_matrix(expected_y, predicted_y)
            plot_confusion_matrix(c_matrix, classes_name, task, report_name, True)
            cm = pd.DataFrame(c_matrix, columns=classes_name, index=classes_name)

            report.to_csv(directory + 'report.csv')
            cm.to_csv(directory + 'confusion_matrix.csv')

            print(report)

            collected.append(report.to_dict())
    finally:
        # Always hand one (small) result back, even after a failure.
        if output is not None:
            output.put({'task': task, 'dataset': dataset_name, 'reports': collected})

In [None]:
import multiprocessing as mp
import random
import string

# Seed Python's built-in random module.
random.seed(123)

# Queue that is handed to every worker process as run()'s `output` argument.
output = mp.Queue()

# Alternative (longer) task list kept for reference:
#task_list = ['relig','polit','education','professional','region','TI']#,'gender','age']
task_list = ['gender','age']
dataset_list = ['brmoral','b5post','esic','brblogset','enblogs','pan13_en','pan13_es','sms']

# Collect every [task, dataset] pair for which getDatasets returns at least one row.
args = []
for task in task_list:
    for ds in dataset_list:
        d = getDatasets(task,'df', ds)
        if len(d.values) == 0:
            # Nothing registered for this combination; skip it.
            continue
        args.append([task, ds])

In [None]:
# Setup a list of processes that we want to run (one per [task, dataset] pair)
processes = [mp.Process(target=run, args=(task, ds, output)) for task, ds in args]

# Run processes
for p in processes:
    p.start()

# Collect results while the workers are running. The queue must be drained
# BEFORE joining: a worker that still has queued data to flush cannot exit,
# so join-then-get can deadlock, and a blocking get() per process waits
# forever when a worker puts nothing on the queue.
results = []
while True:
    # Snapshot liveness first: anything put by an already finished worker is
    # guaranteed to be readable by the drain below.
    alive = [p for p in processes if p.is_alive()]
    while not output.empty():
        results.append(output.get())
    if not alive:
        break
    # Wait briefly for one worker instead of busy-looping.
    alive[0].join(timeout=0.5)

# Exit the completed processes
for p in processes:
    p.join()

print(results)



Normalized confusion matrix
[[0.57142857 0.2967033  0.13186813]
 [0.2826087  0.43478261 0.2826087 ]
 [0.21686747 0.39759036 0.38554217]]
              f1-score  precision    recall  support       roc  accuracy
a17-22        0.556150   0.541667  0.571429     91.0  0.734568  0.466165
a25-29        0.416667   0.400000  0.434783     92.0  0.565655  0.466165
a31-46        0.418301   0.457143  0.385542     83.0  0.687537  0.466165
micro avg     0.466165   0.466165  0.466165    266.0  0.665088  0.466165
macro avg     0.463706   0.466270  0.463918    266.0  0.665088  0.466165
weighted avg  0.464894   0.466295  0.466165    266.0  0.665088  0.466165
Normalized confusion matrix
[[0.42016807 0.57983193]
 [0.11650485 0.88349515]]
              f1-score  precision    recall  support       roc  accuracy
F             0.518135   0.675676  0.420168    119.0  0.249164  0.713846
M             0.796499   0.725100  0.883495    206.0  0.750836  0.713846
micro avg     0.713846   0.713846  0.713846    325.0  



Normalized confusion matrix
[[0.90752688 0.09247312]
 [0.17142857 0.82857143]]
              f1-score  precision    recall  support       roc  accuracy
female        0.891235   0.875519  0.907527    465.0  0.057512   0.87362
male          0.849195   0.870871  0.828571    350.0  0.942488   0.87362
micro avg     0.873620   0.873620  0.873620    815.0  0.501737   0.87362
macro avg     0.870215   0.873195  0.868049    815.0  0.501737   0.87362
weighted avg  0.873181   0.873523  0.873620    815.0  0.501737   0.87362




Normalized confusion matrix
[[0.00582249 0.99417751]
 [0.00476623 0.99523377]]
Normalized confusion matrix
[[0.13117871 0.66730038 0.20152091]
 [0.08078335 0.68298654 0.23623011]
 [0.05149051 0.57317073 0.37533875]]
              f1-score  precision    recall  support       roc  accuracy
Feminino      0.011500   0.461538  0.005822  40189.0  0.496639  0.587265
Masculino     0.739182   0.587923  0.995234  57278.0  0.503361  0.587265
micro avg     0.587265   0.587265  0.587265  97467.0  0.500002  0.587265
macro avg     0.375341   0.524731  0.500528  97467.0  0.500002  0.587265
weighted avg  0.439134   0.535810  0.587265  97467.0  0.500002  0.587265
              f1-score  precision    recall  support       roc  accuracy
a10-25        0.197425   0.398844  0.131179    526.0  0.598086  0.434407
a26-40        0.519311   0.418919  0.682987    817.0  0.559831  0.434407
a40+          0.421613   0.480903  0.375339    738.0  0.626239  0.434407
micro avg     0.434407   0.434407  0.434407   2081.0  



Normalized confusion matrix
[[0.40552093 0.38918077 0.20529831]
 [0.17511016 0.50093603 0.3239538 ]
 [0.07572112 0.23571532 0.68856356]]
              f1-score  precision    recall  support       roc  accuracy
a17-29        0.452459   0.511685  0.405521  22460.0  0.751935  0.548159
a31-42        0.504247   0.507603  0.500936  34721.0  0.646743  0.548159
a43-80        0.641028   0.599631  0.688564  34495.0  0.783443  0.548159
micro avg     0.548159   0.548159  0.548159  91676.0  0.727382  0.548159
macro avg     0.532578   0.539640  0.531674  91676.0  0.727382  0.548159
weighted avg  0.543026   0.543231  0.548159  91676.0  0.727382  0.548159
Normalized confusion matrix
[[0.42193196 0.55741191 0.02065614]
 [0.18907239 0.76910017 0.04182744]
 [0.14457314 0.77520378 0.08022308]]
              f1-score  precision    recall  support       roc  accuracy
10s           0.508605   0.640092  0.421932   6584.0  0.682577  0.516305
20s           0.588346   0.476386  0.769100   6479.0  0.616696  0.516

In [None]:
# Display the [task, dataset] pairs that were collected for processing.
args