In [1]:
import sklearn
print(sklearn.__version__)

0.20.0


In [2]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from time import time

import numpy as np
import pandas as pd
#from functions.models import 
from functions.datasets import getDatasets
from functions.metrics import evaluator
from functions.plot import ROC

In [3]:
dataset_name = 'pan13_en'
task = 'age'

def getBestParams(task, dataset_name):
    
    dataset_name = dataset_name.strip().lower()
    task = task.strip().lower()
    
    # load excel params
    baseline1 = pd.read_excel('./Reports/Reports.xlsx','baseline1')
    baseline1['Task'] = baseline1['Task'].str.lower()
    baseline1['Name'] = baseline1['Name'].str.lower()
    
    best_params = baseline1[(baseline1['Name'] == dataset_name) & (baseline1['Task'] == task)]
    
    max_features = best_params['max features'].values[0]
    
    model_params = {
                    'vect__max_features': max_features if max_features != 'None' and not pd.isna(max_features) else None,
                    'vect__max_df': best_params['max df'].values[0] if not pd.isna(best_params['max df'].values[0]) else 1,
                    'clf__C': best_params['C'].values[0] if not pd.isna(best_params['C'].values[0]) else 1000.0, 
                    'clf__penalty': best_params['P'].values[0] if not pd.isna(best_params['P'].values[0]) else 'l2'
                    }
    
    return model_params

getBestParams(task, dataset_name)

{'clf__C': 1000.0,
 'clf__penalty': 'l2',
 'vect__max_df': 1,
 'vect__max_features': None}

In [4]:
def labelEncoder(y):
    le = LabelEncoder()
    le.fit(y)

    return (le.transform(y), len(le.classes_), list(le.classes_))

In [5]:
def clean(text):
    space = ' '    
    for char in ['\n','-', '...', '*', '/', '+', '\\']:
        text = text.replace(char, space)
        
    text = text.replace(space*2, space)
    text = text.replace(space*3, space)
    
    return text.strip()

### Perform classification for each problem / task

In [6]:
def model(X, y, n_classes, classes_name, params):
                
    pipeline = Pipeline([       
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),        
        ('clf', LogisticRegression(verbose=1)),
    ])
    
    # pipeline.set_params(**params)    
    vect = TfidfVectorizer(max_features=params.get('vect__max_features'), max_df=params.get('vect__max_df'))
    
    K = StratifiedKFold(n_splits=10)
    
    t0 = time()
    
    predicted_y = []
    expected_y = []    
    score_y = []
    
    for train_index, test_index in K.split(X, y):
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
        X_train = vect.fit_transform(X_train)
        X_test = vect.transform(X_test)
        
        clf = LogisticRegression(C=params.get('clf__C'), penalty=params.get('clf__penalty'), solver='liblinear')
        
        clf.fit(X_train, y_train)
        
        predicted_y.extend(clf.predict(X_test))
        expected_y.extend(y_test)
        score_y.extend(clf.predict_proba(X_test))

        ### get train score

    # print("done in %0.2fs and %0.1fmin" % ((time() - t0), ((time() - t0) / 60) ))
    # print()
    
    report = pd.DataFrame(classification_report(expected_y, predicted_y, digits=5, target_names=classes_name, output_dict=True))
    report = report.transpose()
    
    return (
        report, 
        np.asarray(expected_y),
        np.asarray(predicted_y),
        np.asarray(score_y)
        )

In [7]:
dataset_name = 'pan13_en'
task = 'polit'
task, dataset_name

('polit', 'pan13_en')

In [8]:
#if __name__ == '__main__':

In [9]:
def run(task, dataset_name, output):
    datasets = getDatasets(task,'df', dataset_name)
    for i in datasets.iterrows():

        name = i[1]['dataset_name']
        label = task
        ds_path = i[1]['path']

        # load training and test dataframes
        training_path = ds_path + '/' + i[1]['training']        

        df_training = pd.read_csv(training_path)#, usecols=cols)        

        df_training['text'] = df_training['text'].apply(clean)
        X_train = df_training['text'].values
        y_train, n_classes, classes_name = labelEncoder(df_training[label].values)

        # del(df_training)

        # print("Dataset: {0} and task: {1}".format(name, label))

        # print("n_classes: {0}".format(n_classes))

        params = getBestParams(task, dataset_name)
        # print("params: ", params)

        report, expected_y, predicted_y, score_y = model(X_train, y_train, n_classes, classes_name, params)

        # get ROC
        roc_c = ROC(expected_y, score_y, n_classes, task, dataset_name)    
        try:
            report['roc'] = list(roc_c.values()) + [roc_c['macro']] * 2
        except:
            print(roc_c.values())
            report['roc'] = list(roc_c.values()) + [1] * 2
            
        # compute accuracy
        accuracy = accuracy_score(expected_y, predicted_y)
        report['accuracy'] = [accuracy] * 6
        # compute confusion matrix
        cm = pd.DataFrame(confusion_matrix(expected_y, predicted_y), columns=classes_name, index=classes_name)

        directory = './Reports/' + task + '/' + dataset_name + '/'
        report.to_csv(directory + 'report.csv')
        cm.to_csv(directory + 'confusion_matrix.csv')    
        
        print(task, dataset_name, report)
        
        output.put('results for {0} and {1}'.format(task, dataset_name))
        output.put(report.to_dict())
        
        pass

In [None]:
import multiprocessing as mp
import random
import string

random.seed(123)

# Define an output queue
output = mp.Queue()

task_list = ['relig','polit','education','professional','region','TI']#,'gender','age']
dataset_list = ['brmoral','b5post','esic','brblogset','enblogs','pan13_en','pan13_es','sms']

args = []
for task in task_list:
    for ds in dataset_list:
        d = getDatasets(task,'df', ds)
        if len(d.values) > 0:
            args.append([task, ds])

In [None]:
# Setup a list of processes that we want to run
processes = [mp.Process(target=run, args=(x[0], x[1], output)) for x in args]

# Run processes
for p in processes:
    p.start()

# Exit the completed processes
for p in processes:
    p.join()

# Get process results from the output queue
results = [output.get() for p in processes]

print(results)

  'precision', 'predicted', average, warn_for)


education brmoral                                      f1-score  precision    recall  support  \
Básico + Superior incompleto         0.454183   0.448819  0.459677    124.0   
Pós-graduação andamento ou completo  0.398104   0.392523  0.403846    104.0   
Superior completo                    0.287234   0.296703  0.278351     97.0   
micro avg                            0.387692   0.387692  0.387692    325.0   
macro avg                            0.379841   0.379349  0.380625    325.0   
weighted avg                         0.386410   0.385404  0.387692    325.0   

                                          roc  accuracy  
Básico + Superior incompleto         0.595209  0.387692  
Pós-graduação andamento ou completo  0.579882  0.387692  
Superior completo                    0.504205  0.387692  
micro avg                            0.562087  0.387692  
macro avg                            0.562087  0.387692  
weighted avg                         0.562087  0.387692  


Process Process-9:
Traceback (most recent call last):
  File "<ipython-input-9-2ef24beade8f>", line 32, in run
    report['roc'] = list(roc_c.values()) + [roc_c['macro']] * 2
KeyError: 'macro'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-2ef24beade8f>", line 34, in run
    report['roc'] = list(roc_c.values()) + [1] * 2
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py", line 3119, in __setitem__
    self._set_item(key, value)
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py", line 3194, in _set_item
    value = self._sanitize_column(key, value)
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py", line 3391, in _sanitize_column
    value = _

relig brmoral               f1-score  precision    recall  support  roc  accuracy
r12           0.705570   0.545082  1.000000    133.0  0.5  0.545082
r3            0.000000   0.000000  0.000000     90.0  0.5  0.545082
r45           0.000000   0.000000  0.000000     21.0  0.5  0.545082
micro avg     0.545082   0.545082  0.545082    244.0  0.5  0.545082
macro avg     0.235190   0.181694  0.333333    244.0  0.5  0.545082
weighted avg  0.384594   0.297114  0.545082    244.0  0.5  0.545082
polit brmoral               f1-score  precision    recall  support       roc  accuracy
p12           0.588745   0.571429  0.607143    112.0  0.753563  0.523077
p3            0.532258   0.519685  0.545455    121.0  0.647667  0.523077
p45           0.421053   0.455696  0.391304     92.0  0.684176  0.523077
micro avg     0.523077   0.523077  0.523077    325.0  0.697439  0.523077
macro avg     0.514018   0.515603  0.514634    325.0  0.697439  0.523077
weighted avg  0.520244   0.519403  0.523077    325.0  0.69

Process Process-10:
Traceback (most recent call last):
  File "<ipython-input-9-2ef24beade8f>", line 32, in run
    report['roc'] = list(roc_c.values()) + [roc_c['macro']] * 2
KeyError: 'macro'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-9-2ef24beade8f>", line 34, in run
    report['roc'] = list(roc_c.values()) + [1] * 2
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py", line 3119, in __setitem__
    self._set_item(key, value)
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py", line 3194, in _set_item
    value = self._sanitize_column(key, value)
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py", line 3391, in _sanitize_column
    value = 