In [1]:
import sklearn
print(sklearn.__version__)

0.20.0


In [3]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from time import time

import numpy as np
import pandas as pd
#from functions.models import 
from Models.functions.datasets import getDatasets, loadTrainTest
from Models.functions.metrics import evaluator
from Models.functions.plot import ROC, plot_confusion_matrix
# from Models.functions.preprocessing import clean

from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

# Synthetic Minority Oversampling Technique (SMOTE)
def oversampling(X, y):    
    X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)    
    #return X, y
    return X_resampled, y_resampled        

In [4]:
%matplotlib inline

In [29]:
dataset_name = 'brmoral'
task = 'smscorpus'

def getBestParams(task, dataset_name):
    baseline = 'baseline1'
    dataset_name = dataset_name.strip().lower()
    task = task.strip().lower()
    
    # load excel params
    baseline1 = pd.read_excel('./Reports_v1/Reports.xlsx', baseline)
    
    baseline1['Task'] = baseline1['Task'].str.lower()
    baseline1['Name'] = baseline1['Name'].str.lower()
    
    best_params = baseline1[(baseline1['Name'] == dataset_name) & (baseline1['Task'] == task)]
    
    
    if len(best_params) < 1: return dict(
                clf__C =  1428.5715142857143,
                clf__penalty =  'l2',
                vect__max_df =  0.8,
                vect__max_features =  1000,
                vect__stop_words = None)
    
    max_features = best_params['max features'].values[0]
    
    model_params = {
                    'vect__max_features': max_features if max_features != 'None' and not pd.isnull(max_features) else None,
                    'vect__max_df': best_params['max df'].values[0] if not pd.isnull(best_params['max df'].values[0]) else 1,
                    'clf__C': best_params['C'].values[0] if not pd.isnull(best_params['C'].values[0]) else 1000.0, 
                    'clf__penalty': best_params['P'].values[0] if not pd.isnull(best_params['P'].values[0]) else 'l2'
                    }
    
    return model_params

getBestParams(task, dataset_name)

{'clf__C': 1428.5715142857143,
 'clf__penalty': 'l2',
 'vect__max_df': 0.8,
 'vect__max_features': 1000,
 'vect__stop_words': None}

In [30]:
def labelEncoder(y):
    le = LabelEncoder()
    le.fit(y)

    return (le.transform(y), len(le.classes_), list(le.classes_))

### Perform classification for each problem / task

In [31]:
def model(X, y, n_classes, classes_name, params):
    
    # pipeline.set_params(**params)    
    vect = TfidfVectorizer(max_features=params.get('vect__max_features'), max_df=params.get('vect__max_df'))
    
    K = StratifiedKFold(n_splits=10)
    
    t0 = time()
    
    predicted_y = []
    expected_y = []    
    score_y = []
    
    for train_index, test_index in K.split(X, y):
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
        X_train = vect.fit_transform(X_train)
        X_test = vect.transform(X_test)
        
        X_train, y_train = oversampling(X_train, y_train)
        X_test, y_test = oversampling(X_test, y_test)

        clf = LogisticRegression(C=params.get('clf__C'), penalty=params.get('clf__penalty'), solver='liblinear')
        
        clf.fit(X_train, y_train)
        
        predicted_y.extend(clf.predict(X_test))
        expected_y.extend(y_test)
        score_y.extend(clf.predict_proba(X_test))

        ### get train score

    # print("done in %0.2fs and %0.1fmin" % ((time() - t0), ((time() - t0) / 60) ))
    # print()
    
    report = pd.DataFrame(classification_report(expected_y, predicted_y, digits=5, target_names=classes_name, output_dict=True))
    report = report.transpose()
    
    return (
        report, 
        np.asarray(expected_y),
        np.asarray(predicted_y),
        np.asarray(score_y)
        )

In [32]:
def run(task, dataset_name, root = None):    
    datasets = getDatasets(task,'df', dataset_name, root)
    for i in datasets.iterrows():

        name = i[1]['dataset_name']
        label = task
        ds_path = i[1]['path']

        # load training and test dataframes
        training_path = ds_path + '/' + i[1]['training']        

        df_training = pd.read_csv(training_path)#, usecols=cols)        
        
        #df_training['text'] = df_training['text'].apply(clean)
        
        X_train = df_training['text'].values
        y_train, n_classes, classes_name = labelEncoder(df_training[label].values)

        # del(df_training)

        # print("Dataset: {0} and task: {1}".format(name, label))

        # print("n_classes: {0}".format(n_classes))

        params = getBestParams(task, dataset_name)
        if dataset_name == 'b5post' and task == 'relig':        
            #params['clf__C'] = 1000            
            #params['clf__penalty'] = 'l1'
            #params['clf__maxiter'] = '500'
            pass
        print("params: ", params)
        
        report, expected_y, predicted_y, score_y = model(X_train, y_train, n_classes, classes_name, params)

        # get ROC
        roc_c = ROC(expected_y, score_y, n_classes, task, dataset_name, classes_name)
        report['roc'] = list(roc_c.values()) + [roc_c['macro']] * 2

        # compute accuracy
        accuracy = accuracy_score(expected_y, predicted_y)
        report['accuracy'] = [accuracy] * (n_classes + 3)

        # compute confusion matrix
        c_matrix = confusion_matrix(expected_y, predicted_y)
        plot_confusion_matrix(c_matrix, classes_name, task, dataset_name, True)
        cm = pd.DataFrame(c_matrix, columns=classes_name, index=classes_name)

        directory = './Reports/' + task + '/' + dataset_name + '/'
        report.to_csv(directory + 'report.csv')
        cm.to_csv(directory + 'confusion_matrix.csv')    

        print(task, dataset_name, report)
        print(cm)

        # output.put('results for {0} and {1}'.format(task, dataset_name))
        # output.put(report.to_dict())

        pass

In [33]:
run("gender", "smscorpus", "/home/rafael/GDrive/Data/Dataframe/")

loading from /home/rafael/GDrive/Data/Dataframe/
params:  {'vect__stop_words': None, 'vect__max_features': 1000, 'clf__C': 1428.5715142857143, 'vect__max_df': 0.8, 'clf__penalty': 'l2'}
Normalized confusion matrix
[[0.76375291 0.23624709]
 [0.29780041 0.70219959]]
gender smscorpus               f1-score  precision    recall  support       roc  accuracy
female        0.740949   0.719467  0.763753  22777.0  0.192997  0.732976
macro avg     0.732723   0.733862  0.732976  45554.0  0.807003  0.732976
male          0.724497   0.748257  0.702200  22777.0  0.500029  0.732976
micro avg     0.732976   0.732976  0.732976  45554.0  0.500029  0.732976
weighted avg  0.732723   0.733862  0.732976  45554.0  0.500029  0.732976
        female   male
female   17396   5381
male      6783  15994


<matplotlib.figure.Figure at 0x7fb8b842cd68>

<matplotlib.figure.Figure at 0x7fb8748fa080>

<matplotlib.figure.Figure at 0x7fb87472ae10>

In [1]:
task = "gender"
dataset_name = "smscorpus"
root = "/home/rafael/GDrive/Data/Dataframe/"
lang = "en"
loadTrainTest(task, dataset_name, root, lang)

NameError: name 'loadTrainTest' is not defined

In [None]:
import multiprocessing as mp
import random
import string

random.seed(123)

# Define an output queue
output = mp.Queue()

task_list = ['relig','polit','education','professional','region','TI','gender','age']
dataset_list = ['brmoral','b5post','esic','brblogset','enblogs','pan13_en','pan13_es']

args = []
for task in task_list:
    for ds in dataset_list:
        d = getDatasets(task,'df', ds)
        if len(d.values) > 0:
            args.append([task, ds])

In [None]:
# Setup a list of processes that we want to run
processes = [mp.Process(target=run, args=(x[0], x[1], output)) for x in args]

# Run processes
for p in processes:
    p.start()

# Exit the completed processes
for p in processes:
    p.join()

# Get process results from the output queue
results = [output.get() for p in processes]

print(results)