In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import RandomOverSampler

In [None]:
import warnings

def warn(*args, **kwargs):
    """Accept any warning arguments and do nothing (always returns None)."""
    return None


# Swap the module-level hook so that later ``warnings.warn(...)`` calls are no-ops.
setattr(warnings, "warn", warn)

In [12]:
import pandas as pd
from tqdm import tqdm
import spacy
#from dframcy import DframCy

# Register tqdm's progress-bar variants of the pandas apply methods (e.g. `progress_apply`).
tqdm.pandas()

In [17]:
# Load the semicolon-separated CSV and give its columns stable, descriptive names.
df = pd.read_csv("data/dataset-loslagos-8months.csv",
                 sep=";",
                 engine='python')
df.columns = ["date", "media_outlet", "url", "title", "text"]
# dropna() returns a new frame instead of modifying in place; rebind the name
# so rows with missing values are actually removed before anything downstream uses df.
df = df.dropna()
print(len(df))

29598


In [18]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,date,media_outlet,url,title,text
0,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/reconocen-a-g...,Reconocen a guardaparques de la Región de Los ...,Distintos protagonistas de los parques naciona...
1,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/con-nuevos-ma...,Con nuevos materiales comienza plan piloto en ...,Centro de negocios Sercotec coordina acuerdos ...
2,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/centro-de-sal...,Centro de Salud Familiar CESFAM Puerto Varas i...,Las horas se solicitan en el SOME o bien a tra...
3,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/alcalde-tomas...,Alcalde Tomás Gárate presidió por primera vez ...,Los y las consejeras destacaron el hecho de vo...
4,2021-10-01,elheraldoaustral,https://www.eha.cl/noticia/local/galeria-de-ar...,Galería de Arte Machacoya realizará remate de ...,"Hoy viernes a las 18:30 horas, en Machacoya At..."


## IDENTIFICAR CATEGORÍAS - ETIQUETAR DATOS

In [39]:
#df.media_outlet.value_counts()

In [19]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

import numpy as np
from sklearn import metrics

In [20]:
def clf_pred_pipeline(clf, x_train, y_train, x_test, y_test):
    """Fit a TF-IDF + classifier pipeline and report its test-set metrics.

    Parameters
    ----------
    clf : estimator
        Classifier used as the final step of the pipeline.
    x_train, x_test : objects exposing a ``.data`` attribute
        The raw documents are read from ``.data`` and fed to the vectorizer.
        NOTE(review): presumably an iterable of strings — confirm against the caller.
    y_train, y_test : array-like
        Target labels for the training and test documents.

    Returns
    -------
    float
        Fraction of test samples whose prediction equals ``y_test``.
    """
    print(type(clf).__name__)
    text_clf = Pipeline([
        # Case is preserved (lowercase=False); the default tokenizer is used.
        ('vect', TfidfVectorizer(lowercase=False, tokenizer=None)),
        ('clf', clf),
    ])
    text_clf.fit(x_train.data, y_train)
    predicted = text_clf.predict(x_test.data)
    print(metrics.classification_report(y_test, predicted))
    # A bare expression inside a function is never displayed, so the matrix
    # must be printed explicitly or the computation is wasted.
    print(metrics.confusion_matrix(y_test, predicted))
    mean = np.mean(predicted == y_test)
    print("mean: ", mean)
    return mean

In [None]:
from sklearn.model_selection import GridSearchCV
from time import time

# Base estimator handed to the SGD grid search below.
clf = SGDClassifier()

# Loss functions explored by the SGD grid search.
sgd_losses = [
            'hinge', 'log_loss',
            'modified_huber', 'squared_hinge',
            'perceptron', 'squared_error', 'huber',
            'epsilon_insensitive', 'squared_epsilon_insensitive'
]

# Hyper-parameter grid for SGDClassifier.
sgd_params = {
    'penalty':['l2', 'l1', 'elasticnet'],
    'alpha':np.power(10, np.arange(-5, 1, dtype=float)),
    # max_iter must be an integer: np.linspace without a dtype yields floats
    # (10.0, 20.0, ...), which scikit-learn's parameter validation rejects.
    'max_iter':np.arange(10, 101, 10),
    'tol': np.power(10, np.arange(-5, 1, dtype=float)),
    'loss': sgd_losses
}

In [None]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide ``"rank_test_score"``, ``"mean_test_score"``,
        ``"std_test_score"`` (array-like) and ``"params"`` (sequence).
    n_top : int, default 3
        Ranks 1..n_top are reported; tied candidates are all printed.
    """
    for rank in range(1, n_top + 1):
        # Every candidate sharing this rank is reported, in index order.
        for idx in np.flatnonzero(results["rank_test_score"] == rank):
            print(f"Model with rank: {rank}")
            mean_score = results["mean_test_score"][idx]
            std_score = results["std_test_score"][idx]
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][idx]}")
            print("")

In [None]:
# Fit the TF-IDF vocabulary on X.data and build the feature matrix that both
# grid searches below are fitted on.
# NOTE(review): X (and y, used by the grid searches) are not defined in the
# visible cells — confirm they exist when running on a fresh kernel.
# NOTE: despite its name, X_train_counts is the output of TfidfVectorizer.
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=None)
X_train_counts = vectorizer.fit_transform(X.data)

## 1. SGD

In [None]:
# Exhaustive search over sgd_params on the TF-IDF features, timed end to end.
grid_search_sgd = GridSearchCV(clf, param_grid=sgd_params, verbose=0)

start = time()
grid_search_sgd.fit(X_train_counts, y)
elapsed = time() - start

n_candidates = len(grid_search_sgd.cv_results_["params"])
print(f"GridSearchCV took {elapsed:.2f} seconds for {n_candidates:d} candidate parameter settings.")
report(grid_search_sgd.cv_results_)

#### Obtención de mejores hiperparámetros

In [None]:
# Show the parameter combination the SGD grid search selected.
grid_search_sgd.best_params_

#### Clasificación SGD

In [None]:
# Build a fresh SGD classifier from the selected parameters and evaluate it with
# clf_pred_pipeline (fits TF-IDF + classifier on x_train.data, reports on x_test.data).
# NOTE(review): x_train / y_train / x_test / y_test are not defined in the visible
# cells — confirm they exist and how they relate to X / y used for the grid search.
sgd = SGDClassifier(**grid_search_sgd.best_params_)
clf_pred_pipeline(sgd, x_train, y_train, x_test, y_test)

## 2. RandomForest

In [None]:
# Base estimator handed to the random-forest grid search below.
r_clf = RandomForestClassifier()

# Hyper-parameter grid for RandomForestClassifier: five integer values between
# 50 and 200 for both the number of trees and the maximum depth.
rfc_params = dict(
    n_estimators=np.linspace(50, 200, num=5, dtype=int),
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=np.linspace(50, 200, num=5, dtype=int),
    max_features=['sqrt', 'log2', None],
)

In [None]:
# Exhaustive search over rfc_params on the TF-IDF features, timed end to end.
grid_search_rfc = GridSearchCV(r_clf, param_grid=rfc_params, verbose=0)

start = time()
grid_search_rfc.fit(X_train_counts, y)
elapsed = time() - start

n_candidates = len(grid_search_rfc.cv_results_["params"])
print(f"GridSearchCV took {elapsed:.2f} seconds for {n_candidates:d} candidate parameter settings.")
report(grid_search_rfc.cv_results_)

#### Obtención de mejores hiperparámetros

In [None]:
# Show the parameter combination the random-forest grid search selected.
grid_search_rfc.best_params_

#### Clasificación RandomForest

In [None]:
# Build a fresh random forest from the selected parameters and evaluate it with
# clf_pred_pipeline (fits TF-IDF + classifier on x_train.data, reports on x_test.data).
# NOTE(review): x_train / y_train / x_test / y_test are not defined in the visible
# cells — confirm they exist and how they relate to X / y used for the grid search.
rfc = RandomForestClassifier(**grid_search_rfc.best_params_)
clf_pred_pipeline(rfc, x_train, y_train, x_test, y_test)

## Métricas de evaluación

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, labels, cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    cm : array-like of shape (n_rows, n_cols)
        Matrix of counts; ``cm[i, j]`` is written at row ``i``, column ``j``.
    labels : sequence
        Tick labels, applied to both axes in the given order.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the heat map.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape
    fig, ax = plt.subplots(figsize=(7, 7), tight_layout=True)
    ax.imshow(cm, interpolation='nearest', cmap=cmap)

    # Cells darker than half the maximum get white text so the number stays legible.
    # Computed once instead of per cell; the guard keeps an empty matrix from raising.
    threshold = cm.max() / 2 if cm.size else 0
    # i walks rows and j walks columns, matching the cm[i, j] indexing
    # (the bounds were previously swapped, which only worked for square matrices).
    for i in range(n_rows):
        for j in range(n_cols):
            ax.text(j, i, "{:,}".format(cm[i, j]),
                    horizontalalignment="center", verticalalignment="center",
                    color="white" if cm[i, j] > threshold else "black")

    ax.set_title("Matriz de confusión")
    tick_marks = np.arange(len(labels))
    # Use the explicit Axes API throughout rather than mixing in pyplot state calls.
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(labels)
    ax.set_ylabel('Etiqueta real')
    ax.set_xlabel('Predicción')
    plt.show()

In [None]:
# Vectorize the validation documents with the already-fitted vectorizer
# (transform only, no refit) and predict with the best random-forest estimator.
# NOTE(review): x_valid is not defined in the visible cells — confirm it exists
# on a fresh kernel.
validation = vectorizer.transform(x_valid.data)
y_pred = grid_search_rfc.best_estimator_.predict(validation)

In [None]:
# Fix the label order explicitly and reuse it for both the matrix and the tick
# labels. confusion_matrix orders rows/columns by sorted label value, whereas
# value_counts() orders by frequency, so the previous tick labels could be
# attached to the wrong rows/columns.
labels = np.union1d(y_valid, y_pred)  # sorted labels present in truth or predictions
cm = confusion_matrix(y_true=y_valid, y_pred=y_pred, labels=labels)
plot_confusion_matrix(cm, labels=labels)
print(classification_report(y_valid, y_pred, digits=3))