In [73]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
import re, string
from gensim.parsing.preprocessing import remove_stopwords

## Lectura de los datos

In [None]:
# Rows from Fake.csv are labelled with class 0.
dataFake = pd.read_csv("Fake.csv").assign(**{"class": 0})
print("Fake:\n",dataFake.shape)

# Rows from True.csv are labelled with class 1.
dataTrue = pd.read_csv("True.csv").assign(**{"class": 1})
print("True:\n",dataTrue.shape)

# Stack both sources row-wise and drop the columns that are not used downstream.
data_merge = pd.concat([dataFake, dataTrue])
data = data_merge.drop(columns=["title", "subject", "date"])
print("All data:\n",data.shape)

## Limpieza

In [75]:
def word_cleaner(text):
  """Normalize a raw text and strip noise and stopwords.

  Steps, in order: lowercase the text, drop bracketed spans, drop URLs and
  HTML-like tags, turn every remaining non-word character into a space,
  strip punctuation, drop tokens that contain digits, and finally remove
  stopwords with gensim's ``remove_stopwords``.

  Args:
    text: The raw text (``str``).

  Returns:
    The cleaned text as returned by ``remove_stopwords``.
  """
  text = text.lower()
  text = re.sub(r'\[.*?\]', '', text)
  # URLs and tags must be removed BEFORE the generic \W pass: that pass turns
  # ':', '/', '.', '<' and '>' into spaces, after which these two patterns can
  # never match. They are replaced by a space so neighbouring words do not merge.
  text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
  text = re.sub(r'<.*?>+', ' ', text)
  # Newlines are non-word characters too, so this pass also removes them.
  text = re.sub(r'\W', ' ', text)
  # '_' counts as a word character for \W, so it is only removed here.
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub(r'\w*\d\w*', '', text)
  return remove_stopwords(text)

def text_cleaner(text, lemmatizer):
  """Clean a text, lemmatize its tokens and drop very short tokens.

  The text goes through ``word_cleaner``, is tokenized with ``word_tokenize``,
  each token is lemmatized, and ``remove_letters`` (default threshold) then
  discards tokens of two characters or fewer.

  Args:
    text: The raw text to clean.
    lemmatizer: Object exposing ``lemmatize(token)``.

  Returns:
    The surviving lemmas joined by single spaces.
  """
  lemmas = []
  for token in word_tokenize(word_cleaner(text)):
    lemmas.append(lemmatizer.lemmatize(token))
  return " ".join(remove_letters(lemmas))

def remove_letters(lemas, length=2):
  """Keep only the words that are strictly longer than ``length`` characters.

  Args:
    lemas: Iterable of words.
    length: Words with ``len(word) <= length`` are discarded.

  Returns:
    A new list with the surviving words, in their original order.
  """
  kept = []
  for word in lemas:
    if len(word) > length:
      kept.append(word)
  return kept

def count_tokens(texts, wf):
  """Add the whitespace-separated tokens of every text to a counter.

  Args:
    texts: Iterable of strings.
    wf: Counter-like object; it is updated in place via ``wf.update``.

  Returns:
    The same ``wf`` object that was passed in.
  """
  for document in texts:
    wf.update(document.split())
  return wf

def remove_max_min_words_freq(texts, words_to_remove):
  """Remove a given set of words from every text.

  Args:
    texts: Iterable of strings; each one is split on whitespace.
    words_to_remove: Iterable of words to drop (for example the words whose
      frequency is outside the accepted range).

  Returns:
    A list with one filtered string per input text, tokens joined by a
    single space.
  """
  # Build the lookup once as a set: membership tests become O(1) instead of a
  # linear scan per token, and a one-shot iterable is no longer exhausted
  # after the first lookup.
  banned = set(words_to_remove)
  return [
    ' '.join(word for word in text.split() if word not in banned)
    for text in texts
  ]

def get_words_to_remove(min_freq =2, max_freq=1000, word_freq=None):
  """List the words whose frequency falls outside ``[min_freq, max_freq]``.

  Args:
    min_freq: Words with a frequency below this value are selected.
    max_freq: Words with a frequency above this value are selected.
    word_freq: Mapping of word -> frequency (for example a ``Counter``).
      ``None`` (the default) or an empty mapping yields an empty list.

  Returns:
    A list of the selected words, in the mapping's iteration order.
  """
  # The default is None; without this guard the call would fail on `.items()`.
  if not word_freq:
    return []
  return [word for word, freq in word_freq.items() if freq < min_freq or freq > max_freq]

### Importaciones para probar la eliminación de palabras con máxima y mínima frecuencia.


In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk
# Download the NLTK data packages (presumably the resources behind
# word_tokenize and WordNetLemmatizer; confirm for the installed NLTK version).
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)
# Shared instances used by the cleaning cells.
word_freq = Counter()
lemmatizer = WordNetLemmatizer()

In [77]:
# Work on a copy so the cleaning steps do not modify the `data` frame.
clean_data = data.copy()

In [78]:
# Run the full cleaning pipeline (text_cleaner) on every document.
clean_data["text"] = clean_data["text"].apply(
    lambda raw_text: text_cleaner(raw_text, lemmatizer)
)

### Solo es necesario ejecutar esta celda si se desea probar la eliminación de palabras con frecuencia máxima y mínima.


In [None]:
# Optional experiment: keep only the first and last 100 rows and drop the
# words whose frequency in that sample is below 5 or above 1000.
head = clean_data.head(100)
tail = clean_data.tail(100)
clean_data = pd.concat([head,tail])
# Count into a fresh Counter: reusing the global one would accumulate counts
# every time this cell is re-run and change which words get removed.
word_freq = count_tokens(clean_data['text'], Counter())
words = get_words_to_remove(5,1000,word_freq)
clean_data["text"] = remove_max_min_words_freq(clean_data["text"] ,words)

## Separación de datos de entrenamiento y de prueba

In [None]:
SEED = 123456789

# Features are the cleaned texts; the target is the 0/1 `class` column.
x, y = clean_data['text'], clean_data['class']

# Hold out 25% of the rows for testing; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=SEED,
)

print(f"Datos de entrenamiento: {len(x_train)} ({len(x_train)/len(x):%})")
print(f"Datos de prueba: \t{len(x_test)} ({len(x_test)/len(x):%})")

## Preprocesamiento

### Vectorización TF-IDF:

In [None]:
# Shared TF-IDF vectorizer: vectorize_TFID fits it in "train" mode and reuses
# the fitted instance in "test" mode.
tfid_vectorizer = TfidfVectorizer()
def vectorize_TFID(x, mode="test"):
    """Vectorize texts with the module-level ``tfid_vectorizer``.

    Args:
        x: Collection of texts to vectorize.
        mode: ``"train"`` fits the vectorizer and transforms ``x``;
            ``"test"`` only transforms ``x`` with the already fitted vectorizer.

    Returns:
        The result of ``fit_transform``/``transform``, or ``None`` when
        ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    if mode not in ("train", "test"):
        return None
    if mode == "train":
        return tfid_vectorizer.fit_transform(x)
    return tfid_vectorizer.transform(x)

In [None]:
# Fit the TF-IDF vectorizer on the training split only ("train" mode calls
# fit_transform), then reuse it to transform the test split.
x_tfid_train = vectorize_TFID(x_train, mode="train")
x_tfid_test = vectorize_TFID(x_test, mode="test")

### Vectorización por frecuencia

In [None]:
# Shared count vectorizer: vectorize_Count fits it in "train" mode and reuses
# the fitted instance in "test" mode.
count_vectorizer = CountVectorizer()
def vectorize_Count(x, mode="test"):
    """Vectorize texts with the module-level ``count_vectorizer``.

    Args:
        x: Collection of texts to vectorize.
        mode: ``"train"`` fits the vectorizer and transforms ``x``;
            ``"test"`` only transforms ``x`` with the already fitted vectorizer.

    Returns:
        The result of ``fit_transform``/``transform``, or ``None`` when
        ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    if mode not in ("train", "test"):
        return None
    if mode == "train":
        return count_vectorizer.fit_transform(x)
    return count_vectorizer.transform(x)

In [None]:
# Fit the count vectorizer on the training split only ("train" mode calls
# fit_transform), then reuse it to transform the test split.
x_count_train = vectorize_Count(x_train, mode="train")
x_count_test = vectorize_Count(x_test, mode="test")

In [None]:
# Show the number of rows and columns of the count matrix, one per line.
filas, col = x_count_train.shape
print(filas, col, sep="\n")

## Reducción de dimensiones con SVD truncada (TruncatedSVD)

Dado que KNN es un algoritmo ineficiente para vectores grandes, se generan versiones reducidas en dimensión para cada vectorización.

In [None]:
N_COMPONENTS = 500 # FIXME find a better value for N_COMPONENTS

In [None]:
# Reduce the TF-IDF features to N_COMPONENTS dimensions. The projection is
# fitted on the training split only and then applied to the test split.
# random_state pins the randomized solver so reruns give the same projection.
svd_tfid = TruncatedSVD(n_components=N_COMPONENTS, random_state=SEED)
x_tfid_train_svd = svd_tfid.fit_transform(x_tfid_train)
x_tfid_test_svd = svd_tfid.transform(x_tfid_test)

In [None]:
# Reduce the count features to N_COMPONENTS dimensions. The projection is
# fitted on the training split only and then applied to the test split.
# random_state pins the randomized solver so reruns give the same projection.
svd_count = TruncatedSVD(n_components=N_COMPONENTS, random_state=SEED)
x_count_train_svd = svd_count.fit_transform(x_count_train)
x_count_test_svd = svd_count.transform(x_count_test)

## Ajuste de modelos

### Regresión logística sin SVD

Se ajusta un modelo de regresión logística a los vectores generados por TFID.

In [None]:
# Logistic regression on the full TF-IDF features (fit returns the estimator).
LR_tfid = LogisticRegression(max_iter=1000).fit(x_tfid_train, y_train)
print(f"El algoritmo convergió después de {LR_tfid.n_iter_} iteraciones")


Se ajusta un modelo de regresión logística a los vectores generados por frecuencias de palabras.

In [None]:
# Logistic regression on the full count features (fit returns the estimator).
LR_count = LogisticRegression(max_iter=2000).fit(x_count_train, y_train)
print(f"El algoritmo convergió después de {LR_count.n_iter_} iteraciones")

El algoritmo convergió después de [132] iteraciones


### Regresión logística con SVD

Se ajusta un modelo de regresión logística a los vectores generados por TFID y procesados por SVD.

In [None]:
# Logistic regression on the SVD-reduced TF-IDF features.
LR_tfid_svd = LogisticRegression(max_iter=1000).fit(x_tfid_train_svd, y_train)
print(f"El algoritmo convergió después de {LR_tfid_svd.n_iter_} iteraciones")


El algoritmo convergió después de [34] iteraciones


Se ajusta un modelo de regresión logística a los vectores generados por frecuencias de palabras y procesados por SVD.

In [None]:
# Logistic regression on the SVD-reduced count features.
LR_count_svd = LogisticRegression(max_iter=2000).fit(x_count_train_svd, y_train)
print(f"El algoritmo convergió después de {LR_count_svd.n_iter_} iteraciones")

El algoritmo convergió después de [202] iteraciones


### K Nearest Neighbors con SVD

Se fija un valor de $K$ para los modelos (por ahora $K = 5$; queda pendiente buscar un valor mejor).

In [None]:
K = 5 # FIXME find a better value for K
# Passed as `n_jobs` to the KNN classifiers.
N_JOBS = 4

Se ajusta un modelo KNN a los vectores generados por TF-IDF y reducidos con SVD.

In [None]:
# K-nearest-neighbours classifier fitted on the SVD-reduced TF-IDF features.
KNN_tfid_svd = KNeighborsClassifier(n_neighbors=K, n_jobs=N_JOBS)
KNN_tfid_svd.fit(x_tfid_train_svd, y_train)

Se ajusta un modelo KNN a los vectores generados por frecuencias de palabras y reducidos con SVD.

In [None]:
# K-nearest-neighbours classifier fitted on the SVD-reduced count features.
KNN_count_svd = KNeighborsClassifier(n_neighbors=K, n_jobs=N_JOBS)
KNN_count_svd.fit(x_count_train_svd, y_train)

## Evaluación de los modelos

In [None]:
class TrainedModel:
    """Bundle a fitted model with its test data and its test predictions.

    The predictions are computed once, at construction time, by calling
    ``model.predict(x_test)``.
    """

    def __init__(self, model, name, x_test, y_test) -> None:
        """Store the model, its display name and test data, and predict.

        Args:
            model: Fitted estimator exposing ``predict``.
            name: Label used to identify the model (e.g. as a plot title).
            x_test: Test features, passed to ``model.predict``.
            y_test: Expected targets for ``x_test``.
        """
        self.model, self.name = model, name
        self.x_test, self.y_test = x_test, y_test
        # NOTE: `predict` is an attribute holding the predictions, not a method.
        predictions = model.predict(x_test)
        self.predict = predictions


In [None]:
# Wrap the TF-IDF logistic regression together with its matching test features.
LR_tfid_tm = TrainedModel(
    model=LR_tfid,
    name="LR + TFID",
    x_test=x_tfid_test,
    y_test=y_test,
)

In [None]:
# Evaluate the model trained on count features (LR_count) on the count test
# matrix; the TF-IDF model must not be used here.
LR_count_tm = TrainedModel(LR_count, "LR + Count", x_count_test, y_test)

In [None]:
# Wrap the logistic regression trained on SVD-reduced TF-IDF features.
LR_tfid_svd_tm = TrainedModel(
    model=LR_tfid_svd,
    name=f"LR + TFID + SVD ({N_COMPONENTS})",
    x_test=x_tfid_test_svd,
    y_test=y_test,
)

In [None]:
# Wrap the logistic regression trained on SVD-reduced count features.
LR_count_svd_tm = TrainedModel(
    model=LR_count_svd,
    name=f"LR + Count + SVD ({N_COMPONENTS})",
    x_test=x_count_test_svd,
    y_test=y_test,
)

In [None]:
# Wrap the KNN classifier trained on SVD-reduced TF-IDF features.
KNN_tfid_svd_tm = TrainedModel(
    model=KNN_tfid_svd,
    name=f"KNN ({KNN_tfid_svd.n_neighbors}) + TFID + SVD ({N_COMPONENTS})",
    x_test=x_tfid_test_svd,
    y_test=y_test,
)

In [None]:
# Wrap the KNN classifier trained on SVD-reduced count features.
KNN_count_svd_tm = TrainedModel(
    model=KNN_count_svd,
    name=f"KNN ({KNN_count_svd.n_neighbors}) + Count + SVD ({N_COMPONENTS})",
    x_test=x_count_test_svd,
    y_test=y_test,
)

In [None]:
# Every evaluated model exactly once. LR_count_tm takes the second slot,
# which previously held a duplicate of LR_tfid_tm.
trained_models = [
    LR_tfid_tm, LR_count_tm, LR_tfid_svd_tm, LR_count_svd_tm, KNN_tfid_svd_tm, KNN_count_svd_tm
]
# Class ids and their display names for the classification reports.
labels = [0, 1]
target_names = ["Fake", "True"]

### Reportes de clasificación

In [None]:
def plot_clasification_reports(trained_models, labels, target_names, cols=2):
    """Plot one classification-report heatmap per model on a shared colour scale.

    Args:
        trained_models: Sequence of objects exposing ``y_test``, ``predict``
            (the predictions) and ``name``.
        labels: Class labels forwarded to ``classification_report``.
        target_names: Display names matching ``labels``.
        cols: Number of subplot columns in the grid.

    Returns:
        ``(fig, axes)`` where ``axes`` is always a 2-D array of shape
        ``(rows, cols)``.

    Raises:
        ValueError: If ``trained_models`` is empty.
    """
    if len(trained_models) == 0:
        raise ValueError("trained_models must contain at least one model")

    # Build one report per model as a DataFrame.
    reports = [
        pd.DataFrame(
            classification_report(
                m.y_test,
                m.predict,
                labels=labels,
                target_names=target_names,
                output_dict=True,
            )
        )
        for m in trained_models
    ]

    # Shared colour norm across all models. The last row of each report (the
    # `support` counts) is excluded so it does not dominate the scale.
    # https://stackoverflow.com/a/70517313/15217078
    values = np.hstack([d.iloc[:-1, :].values.ravel() for d in reports])
    norm = mcolors.Normalize(values.min(), values.max())

    # Draw the reports on a grid.
    # https://stackoverflow.com/a/58948133/15217078
    rows = int(np.ceil(len(trained_models) / cols))
    # squeeze=False keeps `axes` 2-D even with a single row or a single column,
    # where plt.subplots would otherwise return a 1-D array or a bare Axes.
    fig, axes = plt.subplots(rows, cols, squeeze=False)
    fig.set_size_inches(8 * cols, 5 * rows)
    flat_axes = axes.ravel()  # row-major, i.e. axes[i // cols][i % cols]
    for ax, m, r in zip(flat_axes, trained_models, reports):
        sns.heatmap(r.iloc[:-1, :].T, annot=True, norm=norm, ax=ax)
        ax.set_title(m.name)
    # Hide the grid cells left over when the models do not fill the last row.
    for ax in flat_axes[len(trained_models):]:
        ax.set_visible(False)
    return fig, axes


# Draw the classification reports of all evaluated models.
report_fig, report_axes = plot_clasification_reports(
    trained_models=trained_models,
    labels=labels,
    target_names=target_names,
)


### Prueba para clasificación usando mezcla de vectorizaciones (Word2Vec y CountVectorizer)

In [80]:
from gensim.models import Word2Vec

In [None]:
# Datos de ejemplo
# Sample data: the first 50 training texts and their labels.
lst = x_train.tolist()
text_data = lst[:50]
# NOTE(review): this rebinds the global `labels` that the classification
# report cell passes to plot_clasification_reports; re-running that cell after
# this one would use these 50 targets instead of [0, 1].
labels = y_train.tolist()[:50]

In [81]:
# Vectorization with CountVectorizer
# NOTE(review): this rebinds the global `count_vectorizer` that vectorize_Count
# reads, so calling vectorize_Count after this cell uses this 50-text
# vocabulary instead of the one fitted on x_train.
count_vectorizer = CountVectorizer()
count_vectors = count_vectorizer.fit_transform(text_data).toarray()

In [82]:
# Vectorization with word embeddings (Word2Vec)
tokenized_texts = [text.split() for text in text_data]
# seed + a single worker thread keep the trained embeddings repeatable.
word2vec_model = Word2Vec(tokenized_texts, min_count=1, seed=SEED, workers=1)
# Each document is the mean of its word vectors. A document with no tokens
# (empty after cleaning) gets a zero vector; np.mean([]) would give NaN and
# break the rectangular array.
embedding_vectors = np.array([
    np.mean([word2vec_model.wv[word] for word in tokens], axis=0)
    if tokens
    else np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
    for tokens in tokenized_texts
])

In [None]:
# Combine both vectorizations column-wise (one row per document).
combined_vectors = np.hstack((count_vectors, embedding_vectors))

# Train the logistic regression model on the combined features.
logistic_regression = LogisticRegression()
logistic_regression.fit(combined_vectors, labels)

In [None]:
# Prediction example
new_text = 'This is a fucking fake new.'
# Preprocess the new text with the same cleaning pipeline used for the
# training texts, so its tokens are comparable with the fitted vocabularies.
clean_new_text = text_cleaner(new_text, lemmatizer)
new_count_vector = count_vectorizer.transform([clean_new_text]).toarray()
# Only average tokens the Word2Vec model knows: indexing `wv` with an
# out-of-vocabulary word raises KeyError. Fall back to a zero vector when no
# token is known.
known_tokens = [word for word in clean_new_text.split() if word in word2vec_model.wv]
if known_tokens:
    new_embedding_vector = np.mean([word2vec_model.wv[word] for word in known_tokens], axis=0)
else:
    new_embedding_vector = np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
new_combined_vector = np.concatenate((new_count_vector, np.array([new_embedding_vector])), axis=1)
prediction = logistic_regression.predict(new_combined_vector)

print(f'La predicción para el nuevo texto "{new_text}" es: {prediction}')