In [3]:
import sklearn
print(sklearn.__version__)

0.20.0


In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from time import time

import numpy as np
import pandas as pd
#from functions.models import 
from Models.functions.datasets import getDatasets, loadTrainTest
from Models.functions.metrics import evaluator
from Models.functions.plot import ROC, plot_confusion_matrix
from Models.functions.preprocessing import clean, labelEncoder
from Models.functions.utils import checkFolder

from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

# Synthetic Minority Oversampling Technique (SMOTE)
def oversampling(X, y):    
    X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)    
    #return X, y
    return X_resampled, y_resampled        

In [5]:
%matplotlib inline

In [6]:
def getBestParams(task, dataset_name):
    baseline = 'baseline1'
    dataset_name = dataset_name.strip().lower()
    task = task.strip().lower()
    
    # load excel params
    baseline1 = pd.read_excel('./best_params.xlsx', baseline)
    
    baseline1['Task'] = baseline1['Task'].str.lower()
    baseline1['Name'] = baseline1['Name'].str.lower()
    
    best_params = baseline1[(baseline1['Name'] == dataset_name) & (baseline1['Task'] == task)]
    
    
    if len(best_params) < 1: return dict(
                clf__C =  1428.5715142857143,
                clf__penalty =  'l2',
                vect__max_df =  0.8,
                vect__max_features =  None,
                vect__stop_words = None)
    
    max_features = best_params['max features'].values[0]
    
    model_params = {
                    'vect__max_features': max_features if max_features != 'None' and not pd.isnull(max_features) else None,
                    'vect__max_df': best_params['max df'].values[0] if not pd.isnull(best_params['max df'].values[0]) else 1,
                    'clf__C': best_params['C'].values[0] if not pd.isnull(best_params['C'].values[0]) else 1000.0, 
                    'clf__penalty': best_params['P'].values[0] if not pd.isnull(best_params['P'].values[0]) else 'l2'
                    }
    
    return model_params

### Perform classification for each problem / task

In [7]:
def model(X, y, n_classes, classes_name, params):
    
    # pipeline.set_params(**params)    
    vect = TfidfVectorizer(max_features=params.get('vect__max_features'), max_df=params.get('vect__max_df'))
    
    K = StratifiedKFold(n_splits=10)
    
    t0 = time()
    
    predicted_y = []
    expected_y = []    
    score_y = []
    
    for train_index, test_index in K.split(X, y):
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
        X_train = vect.fit_transform(X_train)
        X_test = vect.transform(X_test)
        
        X_train, y_train = oversampling(X_train, y_train)
        X_test, y_test = oversampling(X_test, y_test)

        clf = LogisticRegression(C=params.get('clf__C'), penalty=params.get('clf__penalty'), solver='liblinear')
        
        clf.fit(X_train, y_train)
        
        predicted_y.extend(clf.predict(X_test))
        expected_y.extend(y_test)
        score_y.extend(clf.predict_proba(X_test))

        ### get train score

    # print("done in %0.2fs and %0.1fmin" % ((time() - t0), ((time() - t0) / 60) ))
    # print()
    
    report = pd.DataFrame(classification_report(expected_y, predicted_y, digits=5, target_names=classes_name, output_dict=True))
    report = report.transpose()
    
    return (
        report, 
        np.asarray(expected_y),
        np.asarray(predicted_y),
        np.asarray(score_y)
        )

In [16]:
def run(task, dataset_name, root, lang):    
    
    directory = './Reports/' + task + '/' + dataset_name + '_' + lang + '/'
    checkFolder(directory)
    
    X, _, y, _ = loadTrainTest(task, dataset_name, root, lang)
    y, n_classes, classes_name = labelEncoder(y)    
    
    params = getBestParams(task, dataset_name)    
    print("params: ", params)

    report, expected_y, predicted_y, score_y = model(X, y, n_classes, classes_name, params)

    # get ROC
    roc_c = ROC(expected_y, score_y, n_classes, task, dataset_name+'_'+lang, classes_name)
    report['roc'] = list(roc_c.values()) + [roc_c['macro']] * 2

    # compute accuracy
    accuracy = accuracy_score(expected_y, predicted_y)
    report['accuracy'] = [accuracy] * (n_classes + 3)

    # compute confusion matrix
    c_matrix = confusion_matrix(expected_y, predicted_y)
    plot_confusion_matrix(c_matrix, classes_name, task, dataset_name+'_'+lang, True)
    cm = pd.DataFrame(c_matrix, columns=classes_name, index=classes_name)
    
    report.to_csv(directory + 'report.csv')
    cm.to_csv(directory + 'confusion_matrix.csv')
    np.save(directory + '/expected_y.numpy', expected_y)
    np.save(directory + '/predicted_y.numpy', predicted_y)
    np.save(directory + '/score_y.numpy', score_y)
        
    print(task, dataset_name)
    print(display(report))
    print()
    print(cm)
    pass

In [17]:
run("gender", "b5post", "/home/rafael//USP/drive/Data/Dataframe/", 'pt')

params:  {'vect__max_features': None, 'vect__max_df': 0.9, 'clf__C': 1428.57, 'clf__penalty': 'l2'}
Normalized confusion matrix
[[0.89892473 0.10107527]
 [0.14623656 0.85376344]]
gender b5post


Unnamed: 0,f1-score,precision,recall,support,roc,accuracy
female,0.879075,0.860082,0.898925,465.0,0.053181,0.876344
male,0.873487,0.894144,0.853763,465.0,0.946819,0.876344
micro avg,0.876344,0.876344,0.876344,930.0,0.501522,0.876344
macro avg,0.876281,0.877113,0.876344,930.0,0.501522,0.876344
weighted avg,0.876281,0.877113,0.876344,930.0,0.501522,0.876344


None

        female  male
female     418    47
male        68   397


<matplotlib.figure.Figure at 0x7fc4b58d50b8>

<matplotlib.figure.Figure at 0x7fc4a2e26080>

<matplotlib.figure.Figure at 0x7fc4a2c1cda0>

In [15]:
run("gender", "smscorpus", "/home/rafael//USP/drive/Data/Dataframe/", 'en')

params:  {'clf__C': 1428.5715142857143, 'clf__penalty': 'l2', 'vect__max_df': 0.8, 'vect__max_features': None, 'vect__stop_words': None}


KeyboardInterrupt: 

In [None]:
task = "gender"
dataset_name = "smscorpus"
root = "/home/rafael/GDrive/Data/Dataframe/"
lang = "en"
X, _, _, _ = loadTrainTest(task, dataset_name, root, lang)

X.

In [None]:
exit(0)

In [None]:
import multiprocessing as mp
import random
import string

random.seed(123)

# Define an output queue
output = mp.Queue()

task_list = ['relig','polit','education','professional','region','TI','gender','age']
dataset_list = ['brmoral','b5post','esic','brblogset','enblogs','pan13_en','pan13_es']

args = []
for task in task_list:
    for ds in dataset_list:
        d = getDatasets(task,'df', ds)
        if len(d.values) > 0:
            args.append([task, ds])

In [None]:
# Setup a list of processes that we want to run
processes = [mp.Process(target=run, args=(x[0], x[1], output)) for x in args]

# Run processes
for p in processes:
    p.start()

# Exit the completed processes
for p in processes:
    p.join()

# Get process results from the output queue
results = [output.get() for p in processes]

print(results)