In [1]:
import sklearn
print(sklearn.__version__)

0.20.0


In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from time import time

import numpy as np
import pandas as pd
#from functions.models import 
from Models.functions.datasets import getDatasets, loadTrainTest
from Models.functions.metrics import evaluator
from Models.functions.plot import ROC, plot_confusion_matrix
# from Models.functions.preprocessing import clean

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
en_stopwords = set(stopwords.words('english'))

from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

# Synthetic Minority Oversampling Technique (SMOTE)
def oversampling(X, y):    
    X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)    
    #return X, y
    return X_resampled, y_resampled        

[nltk_data] Downloading package stopwords to /home/rafael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rafael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
%matplotlib inline

In [22]:
dataset_name = 'brmoral'
task = 'smscorpus'

def getBestParams(task, dataset_name):
    baseline = 'baseline1'
    dataset_name = dataset_name.strip().lower()
    task = task.strip().lower()
    
    # load excel params
    baseline1 = pd.read_excel('./Reports_v1/Reports.xlsx', baseline)
    
    baseline1['Task'] = baseline1['Task'].str.lower()
    baseline1['Name'] = baseline1['Name'].str.lower()
    
    best_params = baseline1[(baseline1['Name'] == dataset_name) & (baseline1['Task'] == task)]
    
    if len(best_params) < 1: return {
                    'vect__max_features': None,
                    'vect__max_df': 1,
                    'clf__C': 1.0,
                    'clf__penalty': 'l2'
                    }
    
    max_features = best_params['max features'].values[0]
    
    model_params = {
                    'vect__max_features': max_features if max_features != 'None' and not pd.isnull(max_features) else None,
                    'vect__max_df': best_params['max df'].values[0] if not pd.isnull(best_params['max df'].values[0]) else 1,
                    'clf__C': best_params['C'].values[0] if not pd.isnull(best_params['C'].values[0]) else 1000.0, 
                    'clf__penalty': best_params['P'].values[0] if not pd.isnull(best_params['P'].values[0]) else 'l2'
                    }
    
    return model_params

getBestParams(task, dataset_name)

{'clf__C': 1.0,
 'clf__penalty': 'l2',
 'vect__max_df': 1,
 'vect__max_features': None}

In [23]:
def labelEncoder(y):
    le = LabelEncoder()
    le.fit(y)

    return (le.transform(y), len(le.classes_), list(le.classes_))

### Perform classification for each problem / task

In [33]:
def model(X_train, X_test, y_train, y_test, n_classes, classes_name, params):
    
    t0 = time()
    
    predicted_y = []
    expected_y = []    
    score_y = []

    params_grid = dict(
            clf__C = np.linspace(1e-4, 1e4, num=8),
            clf__penalty = ['l1','l2'],
            vect__max_df = [0.8, 0.9, 1.0],            
            vect__max_features = [None, 1000, 3000, 9000],
            vect__stop_words= [en_stopwords, None]
    )                               
            # vect__ngram_range = [(1, 1), (3, 5)],                    
                
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),        
        #('smote', SMOTE()),
        ('clf', LogisticRegression(verbose=1)),
    ])
    
    grid_search = GridSearchCV(pipeline, params_grid, scoring='accuracy')
    
    #print('best params', best_model.best_params_)
    #print('best scores', best_model.best_score_)
    print("Performing grid search...")    
    print("Pipeline steps:", [name for name, _ in pipeline.steps])    
    t0 = time()
    
    grid_search.fit(X_train, y_train)
    print("done in %0.2fs and %0.1fmin" % ((time() - t0), ((time() - t0) / 60) ))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(params_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    y_score = grid_search.decision_function(X_test)
    
    print()
    y_pred = grid_search.predict(X_test) 
    print()

    predicted_y.extend(y_pred)
    expected_y.extend(y_test)
    score_y.extend(y_score)

    ### get train score

    # print("done in %0.2fs and %0.1fmin" % ((time() - t0), ((time() - t0) / 60) ))
    # print()
    
    report = pd.DataFrame(classification_report(expected_y, predicted_y, digits=5, target_names=classes_name, output_dict=True))
    report = report.transpose()
    
    return (
        report, 
        np.asarray(expected_y),
        np.asarray(predicted_y),
        np.asarray(score_y)
        )

In [34]:
def run(task, dataset_name, root = None):    
    datasets = getDatasets(task,'df', dataset_name, root)
    for i in datasets.iterrows():

        name = i[1]['dataset_name']
        label = task
        ds_path = i[1]['path']

        # load training and test dataframes
        training_path = ds_path + '/' + i[1]['training']        
        test_path = ds_path + '/' + i[1]['test']        

        df_training = pd.read_csv(training_path)#, usecols=cols)                
        df_test = pd.read_csv(test_path)#, usecols=cols)                
        #df_training['text'] = df_training['text'].apply(clean)
        
        X_train = df_training['text'].values
        y_train, n_classes, classes_name = labelEncoder(df_training[label].values)
        
        X_test = df_test['text'].values
        y_test, n_classes, classes_name = labelEncoder(df_test[label].values)

        params = None
        
        report, expected_y, predicted_y, score_y = model(X_train, X_test, y_train, y_test, n_classes, classes_name, params)

        # get ROC
        roc_c = ROC(expected_y, score_y, n_classes, task, dataset_name, classes_name)
        report['roc'] = list(roc_c.values()) + [roc_c['macro']] * 2

        # compute accuracy
        accuracy = accuracy_score(expected_y, predicted_y)
        report['accuracy'] = [accuracy] * (n_classes + 3)

        # compute confusion matrix
        c_matrix = confusion_matrix(expected_y, predicted_y)
        plot_confusion_matrix(c_matrix, classes_name, task, dataset_name, True)
        cm = pd.DataFrame(c_matrix, columns=classes_name, index=classes_name)

        directory = './Reports/' + task + '/' + dataset_name + '/'
        report.to_csv(directory + 'report.csv')
        cm.to_csv(directory + 'confusion_matrix.csv')    

        print(task, dataset_name, report)
        print(cm)

        # output.put('results for {0} and {1}'.format(task, dataset_name))
        # output.put(report.to_dict())

        pass

In [None]:
run("gender", "smscorpus", "/home/rafael/GDrive/Data/Dataframe/")

loading from /home/rafael/GDrive/Data/Dataframe/
Performing grid search...
Pipeline steps: ['vect', 'clf']




[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

In [28]:
task = "gender"
dataset_name = "smscorpus"
root = "/home/rafael/GDrive/Data/Dataframe/"
lang = "en"
loadTrainTest(task, dataset_name, root, lang)

IndexError: tuple index out of range

In [None]:
import multiprocessing as mp
import random
import string

random.seed(123)

# Define an output queue
output = mp.Queue()

task_list = ['relig','polit','education','professional','region','TI','gender','age']
dataset_list = ['brmoral','b5post','esic','brblogset','enblogs','pan13_en','pan13_es']

args = []
for task in task_list:
    for ds in dataset_list:
        d = getDatasets(task,'df', ds)
        if len(d.values) > 0:
            args.append([task, ds])

In [None]:
# Setup a list of processes that we want to run
processes = [mp.Process(target=run, args=(x[0], x[1], output)) for x in args]

# Run processes
for p in processes:
    p.start()

# Exit the completed processes
for p in processes:
    p.join()

# Get process results from the output queue
results = [output.get() for p in processes]

print(results)