In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import re, string
from gensim.parsing.preprocessing import remove_stopwords

## Lectura de los datos

In [None]:
# Load each source and tag it with its label: 0 for Fake.csv, 1 for True.csv.
dataFake = pd.read_csv("Fake.csv").assign(**{"class": 0})
print(dataFake.shape)

dataTrue = pd.read_csv("True.csv").assign(**{"class": 1})
print(dataTrue.shape)

# Stack both frames row-wise, then drop the title, subject and date columns.
data_merge = pd.concat([dataFake, dataTrue])
data = data_merge.drop(columns=["title", "subject", "date"])
print(data.shape)

In [None]:
# Preview the first rows of the merged, labelled frame.
data.head()

## Limpieza

In [None]:
# Patterns are compiled once and written as raw strings so that sequences
# such as \S or \d are not treated as (invalid) string escapes.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_NON_WORD_RE = re.compile(r'\W')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')


def word_cleaner(text):
    """Normalize a raw article text before vectorization.

    Steps, in order:
      1. lowercase the text;
      2. remove square-bracketed fragments, URLs and HTML-like tags;
      3. replace every remaining non-word character with a space;
      4. remove leftover punctuation (only ``_`` survives step 3);
      5. remove every token that contains a digit;
      6. remove stop words with gensim's ``remove_stopwords``.

    URLs and tags must be removed *before* step 3: once ``:``, ``/``, ``.``,
    ``<`` and ``>`` have been turned into spaces, those patterns can no
    longer match and their fragments would stay in the text.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned text as a ``str``.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    # Newlines are non-word characters too, so they become spaces here.
    text = _NON_WORD_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _DIGIT_TOKEN_RE.sub('', text)
    return remove_stopwords(text)

In [None]:
# Build a new frame whose "text" column holds the cleaned text; `data` itself is left untouched.
clean_data = data.assign(text=data['text'].apply(word_cleaner))

In [None]:
# Preview the first rows after cleaning the "text" column.
clean_data.head()

## Separación de datos de entrenamiento y de prueba

In [None]:
# Fixed seed so the shuffle performed by the split is reproducible.
SEED = 123456789

# Features are the cleaned texts; the target is the 0/1 "class" column.
x, y = clean_data['text'], clean_data['class']

# Hold out 25% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=SEED,
)

print(f"Datos de entrenamiento: {len(x_train)} ({len(x_train)/len(x):%})")
print(f"Datos de prueba: \t{len(x_test)} ({len(x_test)/len(x):%})")

## Preprocesamiento

### Vectorización TF-IDF

In [None]:
# Single shared vectorizer: it is fitted on the training texts and reused for the test texts.
tfid_vectorizer = TfidfVectorizer()


def vectorize_TFID(x, mode="test"):
    """Vectorize texts with the shared ``tfid_vectorizer``.

    Args:
        x: Iterable of texts to vectorize.
        mode: ``"train"`` fits the vectorizer on ``x`` and transforms it;
            ``"test"`` only transforms ``x`` with the already-fitted vectorizer.

    Returns:
        The result of ``fit_transform``/``transform``, or ``None`` when
        ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    if mode == "test":
        return tfid_vectorizer.transform(x)
    if mode == "train":
        return tfid_vectorizer.fit_transform(x)
    return None

In [None]:
# Fit on the training texts first; the test texts are only transformed.
x_tfid_train = vectorize_TFID(x=x_train, mode="train")
x_tfid_test = vectorize_TFID(x=x_test, mode="test")

### Vectorización por frecuencia

In [None]:
# Single shared vectorizer: it is fitted on the training texts and reused for the test texts.
count_vectorizer = CountVectorizer()


def vectorize_Count(x, mode="test"):
    """Vectorize texts with the shared ``count_vectorizer``.

    Args:
        x: Iterable of texts to vectorize.
        mode: ``"train"`` fits the vectorizer on ``x`` and transforms it;
            ``"test"`` only transforms ``x`` with the already-fitted vectorizer.

    Returns:
        The result of ``fit_transform``/``transform``, or ``None`` when
        ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    if mode == "test":
        return count_vectorizer.transform(x)
    if mode == "train":
        return count_vectorizer.fit_transform(x)
    return None

In [None]:
# Fit on the training texts first; the test texts are only transformed.
x_count_train = vectorize_Count(x=x_train, mode="train")
x_count_test = vectorize_Count(x=x_test, mode="test")

## Ajuste de modelos

### Regresión logística

Se ajusta un modelo de regresión logística a los vectores generados por TF-IDF.

In [None]:
# `fit` returns the estimator itself, so construction and fitting can be chained.
LR_tfid = LogisticRegression(max_iter=1000).fit(x_tfid_train, y_train)
print(f"El algoritmo convergió después de {LR_tfid.n_iter_} iteraciones")


Se ajusta un modelo de regresión logística a los vectores generados por frecuencias de palabras.

In [None]:
# `fit` returns the estimator itself, so construction and fitting can be chained.
LR_count = LogisticRegression(max_iter=2000).fit(x_count_train, y_train)
print(f"El algoritmo convergió después de {LR_count.n_iter_} iteraciones")

### K Nearest Neighbors 

Se fija un valor provisional de $K$ para los modelos KNN (queda pendiente encontrar un valor mejor).

In [None]:
# Number of neighbours used by both KNN models below.
K = 5 # FIXME: find a better value for K
# Passed as `n_jobs` to both KNeighborsClassifier instances.
N_JOBS = 4

Se ajusta un modelo KNN a los vectores generados por TF-IDF.

In [None]:
# KNN over the TF-IDF vectors.
KNN_tfid = KNeighborsClassifier(
    n_jobs=N_JOBS,
    n_neighbors=K,
)
KNN_tfid.fit(x_tfid_train, y_train)

Se ajusta un modelo KNN a los vectores generados por frecuencias de palabras.

In [None]:
# KNN over the word-count vectors.
KNN_count = KNeighborsClassifier(
    n_jobs=N_JOBS,
    n_neighbors=K,
)
KNN_count.fit(x_count_train, y_train)

## Evaluación de los modelos

In [None]:
class TrainedModel:
    """Bundle a fitted model with its evaluation data and its predictions on that data."""

    def __init__(self, model, name, x_test, y_test) -> None:
        """Store the arguments and eagerly compute ``model.predict(x_test)``.

        Args:
            model: Object exposing a ``predict`` method.
            name: Display name for the model.
            x_test: Inputs handed to ``model.predict``.
            y_test: Expected outputs for ``x_test``.
        """
        self.model, self.name = model, name
        self.x_test, self.y_test = x_test, y_test
        # NOTE: `predict` holds the predictions themselves, not a callable.
        self.predict = self.model.predict(self.x_test)


In [None]:
LR_tfid_tm = TrainedModel(
    model=LR_tfid,
    name="LR + TFID",
    x_test=x_tfid_test,
    y_test=y_test,
)

In [None]:
LR_count_tm = TrainedModel(
    model=LR_count,
    name="LR + Count",
    x_test=x_count_test,
    y_test=y_test,
)

In [None]:
KNN_tfid_tm = TrainedModel(
    model=KNN_tfid,
    name=f"KNN ({KNN_tfid.n_neighbors}) + TFID",
    x_test=x_tfid_test,
    y_test=y_test,
)

In [None]:
KNN_count_tm = TrainedModel(
    model=KNN_count,
    name=f"KNN ({KNN_count.n_neighbors}) + Count",
    x_test=x_count_test,
    y_test=y_test,
)

In [None]:
# Models to compare, in the order they are plotted.
trained_models = [
    LR_tfid_tm,
    LR_count_tm,
    KNN_tfid_tm,
    KNN_count_tm,
]
# Class ids and their display names, in matching order (0 -> "Fake", 1 -> "True").
labels, target_names = [0, 1], ["Fake", "True"]

### Reportes de clasificación

In [None]:
def plot_clasification_reports(trained_models, labels, target_names, rows=2, cols=2, figsize=(14, 9)):
    """Draw one classification-report heatmap per model on a shared colour scale.

    Models fill the ``rows`` x ``cols`` grid in row-major order; grid cells
    left without a model are hidden.

    Args:
        trained_models: Sequence of objects exposing ``y_test``, ``predict``
            (the predicted labels) and ``name``.
        labels: Label values forwarded to ``classification_report``.
        target_names: Display names forwarded to ``classification_report``.
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        figsize: Figure size in inches as ``(width, height)``.

    Returns:
        ``(fig, axes)`` where ``axes`` is always a 2-D array of shape
        ``(rows, cols)``.

    Raises:
        ValueError: If ``trained_models`` is empty or has more entries than
            the grid has cells.
    """
    n_models = len(trained_models)
    if n_models == 0:
        raise ValueError("trained_models must contain at least one model")
    if n_models > rows * cols:
        raise ValueError(
            f"{n_models} models do not fit in a {rows}x{cols} grid"
        )

    # Build one report frame per model.
    reports = [
        pd.DataFrame(
            classification_report(
                m.y_test,
                m.predict,
                labels=labels,
                target_names=target_names,
                output_dict=True,
            )
        ) for m in trained_models
    ]

    # Shared colour normalisation across all models, so heatmaps are comparable.
    # The last row of each report is excluded, exactly as in the plotted data.
    # https://stackoverflow.com/a/70517313/15217078
    values = np.hstack([d.iloc[:-1, :].values.ravel() for d in reports])
    norm = mcolors.Normalize(values.min(), values.max())

    # squeeze=False keeps `axes` 2-D even when rows == 1 or cols == 1.
    # https://stackoverflow.com/a/58948133/15217078
    fig, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    flat_axes = axes.ravel()  # row-major: index i maps to (i // cols, i % cols)
    for ax, m, r in zip(flat_axes, trained_models, reports):
        sns.heatmap(r.iloc[:-1, :].T, annot=True, norm=norm, ax=ax)
        ax.set_title(m.name)

    # Hide the grid cells that did not receive a model.
    for ax in flat_axes[n_models:]:
        ax.set_visible(False)
    return fig, axes

# Plot the reports of all trained models on the default 2x2 grid.
report_fig, report_axes = plot_clasification_reports(trained_models, labels, target_names)
