### Librerías usadas

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD, PCA
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec
from scipy.sparse import csr_matrix, hstack
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import tensorflow as tf
import re, string, nltk
from wordcloud import WordCloud
# Download the NLTK resources used later in the notebook: the WordNet data
# (for WordNetLemmatizer) and the Punkt models (for word_tokenize).
nltk.download('wordnet')
nltk.download('punkt')
# Last expression of the cell, so the notebook displays the GPU device name
# reported by TensorFlow (presumably an empty string when no GPU is
# available — TODO confirm).
tf.test.gpu_device_name()

In [None]:
# Colab-only step: mount Google Drive so the CSV files stored under
# /content/drive/MyDrive can be read by the loading cell below.
from google.colab import drive
drive.mount('/content/drive')

## Lectura de los datos

In [None]:
# Load both halves of the dataset and attach the target label:
# class 0 = fake news, class 1 = true news.
dataFake = pd.read_csv("/content/drive/MyDrive/Fake.csv")
dataFake["class"] = 0
print("Fake: ",dataFake.shape)

dataTrue = pd.read_csv("/content/drive/MyDrive/True.csv")
dataTrue["class"] = 1
print("True: ",dataTrue.shape)

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# each input keeps its own 0-based labels, so the same label would identify
# one fake row and one true row in the merged data.
data_merge = pd.concat([dataFake,dataTrue], axis=0, ignore_index=True)
# Only the article body ("text") and the label ("class") are kept.
data = data_merge.drop(["title","subject","date"], axis=1)
print("All data: ",data.shape)

## Limpieza

In [None]:
def word_cleaner(text):
  '''
    Normalize a raw text and remove its stop words.

    Steps, in order: lowercase; drop "[...]" spans; drop URLs and "<...>"
    markup; turn every remaining non-word character into a space; drop
    underscores; drop any token that contains a digit; finally remove stop
    words with gensim's remove_stopwords.

    text: the raw string to clean.
    Returns the cleaned string.
  '''
  text = text.lower()
  text = re.sub(r'\[.*?\]', '', text)
  # URLs and markup must be removed while ':', '/', '.', '<' and '>' are
  # still present; once non-word characters are blanked out these patterns
  # can no longer match and fragments such as "https" or "www" would be
  # left behind as ordinary tokens.
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'<.*?>+', '', text)
  # Punctuation, newlines and any other non-word character become a space,
  # so neighbouring words are never glued together.
  text = re.sub(r'\W', ' ', text)
  # '_' is a word character for \W, so it is the only punctuation mark that
  # can still be present at this point.
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  # Remove every token that contains at least one digit.
  text = re.sub(r'\w*\d\w*', '', text)
  return remove_stopwords(text)

def text_cleaner(text, lemmatizer):
  '''
    Clean a text, lemmatize its tokens and drop the very short ones.

    The text goes through word_cleaner, is split with word_tokenize, each
    token is passed to lemmatizer.lemmatize, and remove_letters (default
    length) discards tokens of 2 characters or fewer.

    text: the raw string to process.
    lemmatizer: object exposing a lemmatize(token) method.
    Returns the surviving lemmas joined by single spaces.
  '''
  cleaned = word_cleaner(text)
  lemmas = [lemmatizer.lemmatize(tok) for tok in word_tokenize(cleaned)]
  return " ".join(remove_letters(lemmas))

def remove_letters(lemas, length=2):
  '''
    Keep only the words that are strictly longer than `length` characters.

    lemas: iterable of words.
    length: words with len(word) <= length are discarded (default 2).
    Returns a new list with the surviving words, in their original order.
  '''
  kept = []
  for lemma in lemas:
    if len(lemma) > length:
      kept.append(lemma)
  return kept

def count_tokens(texts, wf):
  '''
    Add the whitespace-separated tokens of every text to a running tally.

    texts: iterable of strings.
    wf: object whose update() accepts an iterable of tokens (presumably a
        collections.Counter — confirm against the caller); it is modified
        in place.
    Returns the same `wf` object.
  '''
  wf.update(token for document in texts for token in document.split())
  return wf

def remove_max_min_words_freq(texts, words_to_remove):
  '''
    Remove every occurrence of the given words from each text.

    texts: iterable of strings; each one is split on whitespace.
    words_to_remove: iterable of words to discard (for example the list
        built by get_words_to_remove).
    Returns a list with one filtered string per input text, its remaining
    tokens joined by single spaces.
  '''
  # Build the lookup once: membership tests against a list cost time
  # proportional to its length for every token, and a one-shot iterable
  # would be exhausted after the first text.
  banned = frozenset(words_to_remove)
  return [
    ' '.join(word for word in text.split() if word not in banned)
    for text in texts
  ]

def get_words_to_remove(min_freq =2, max_freq=1000, word_freq=None):
  '''
    List the words whose frequency falls outside [min_freq, max_freq].

    min_freq: words with a lower frequency are listed (default 2).
    max_freq: words with a higher frequency are listed (default 1000).
    word_freq: mapping word -> frequency; when omitted there is nothing to
        inspect and an empty list is returned.
    Returns the list of words to remove, in the mapping's iteration order.
  '''
  if word_freq is None:
    return []
  return [word for word, freq in word_freq.items() if freq < min_freq or freq > max_freq]

# Gráficas

In [None]:
def generate_word_cloud(word_list):
    """Draw a word cloud built from the given words.

    word_list: iterable of strings; they are joined with spaces and handed
        to WordCloud.generate.
    Displays the figure with matplotlib and returns nothing.
    """
    joined_words = ' '.join(word_list)

    # 800x400 cloud on a white background.
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud.generate(joined_words)

    # Show the cloud without axes.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    
def generate_pie_chart(data, title):
    """Draw a pie chart from (label, value) pairs.

    data: iterable of 2-item pairs; the first item is used as the slice
        label and the second as the slice size.
    title: text shown above the chart.
    Displays the figure with matplotlib and returns nothing.
    """
    pairs = list(data)
    labels = [label for label, _ in pairs]
    porcentages = [value for _, value in pairs]

    plt.pie(porcentages, labels=labels, autopct='%1.1f%%')
    plt.axis('equal')  # equal aspect ratio
    plt.title(title)
    plt.show()

### Aplicación de limpieza a datos



In [None]:
# Work on a copy so the merged `data` frame keeps the raw, uncleaned text.
clean_data = data.copy()

In [None]:
# Clean and lemmatize the text of every article in the merged dataset.
lemmatizer = WordNetLemmatizer()
clean_data["text"] = clean_data['text'].map(
    lambda raw_text: text_cleaner(raw_text, lemmatizer)
)

In [None]:
# Same cleaning applied to the fake-news frame alone; it is processed
# separately so a model can be trained on fake news only.
lemmatizer = WordNetLemmatizer()
clean_fake = dataFake.copy()
clean_fake['text'] = clean_fake['text'].map(
    lambda raw_text: text_cleaner(raw_text, lemmatizer)
)

In [None]:
# Same cleaning applied to the true-news frame alone; it is processed
# separately so a model can be trained on true news only.
lemmatizer = WordNetLemmatizer()
clean_true = dataTrue.copy()
clean_true['text'] = clean_true['text'].map(
    lambda raw_text: text_cleaner(raw_text, lemmatizer)
)

## Separación de datos de entrenamiento y de prueba

In [None]:
# Fixed seed so the split is the same on every run.
SEED = 123456789

# Features (cleaned text) and target (0 = fake, 1 = true) of the merged data.
x, y = clean_data['text'], clean_data['class']

# Hold out 25% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.25,
    random_state=SEED,
)

print(f"Datos de entrenamiento: {len(x_train)} ({len(x_train)/len(x):%})")
print(f"Datos de prueba: \t{len(x_test)} ({len(x_test)/len(x):%})")

In [None]:
# Fixed seed so the split is the same on every run.
SEED = 123456789

# Features and target of the true-news frame only.
x_true, y_true = clean_true['text'], clean_true['class']

# Hold out 25% of the true-news rows for testing.
x_train_true, x_test_true, y_train_true, y_test_true = train_test_split(
    x_true, y_true,
    test_size=0.25,
    random_state=SEED,
)

print(f"Datos de entrenamiento: {len(x_train_true)} ({len(x_train_true)/len(x_true):%})")
print(f"Datos de prueba: \t{len(x_test_true)} ({len(x_test_true)/len(x_true):%})")

In [None]:
# Features and target of the fake-news frame only.
x_fake, y_fake = clean_fake['text'], clean_fake['class']

# Hold out 25% of the fake-news rows for testing, reusing the SEED defined
# in the earlier split cells.
x_train_fake, x_test_fake, y_train_fake, y_test_fake = train_test_split(
    x_fake, y_fake,
    test_size=0.25,
    random_state=SEED,
)

print(f"Datos de entrenamiento: {len(x_train_fake)} ({len(x_train_fake)/len(x_fake):%})")
print(f"Datos de prueba: \t{len(x_test_fake)} ({len(x_test_fake)/len(x_fake):%})")

### Word2Vec


In [None]:
# Put the train and test parts of the true-news texts back together, split
# each text into tokens and train a Word2Vec model that ignores words seen
# fewer than 2 times.
X = pd.concat((x_train_true, x_test_true), axis=0)
sentences = [document.split() for document in X]
word2vec_model = Word2Vec(sentences, min_count=2)

# 25 palabras más comunes en True.csv

In [None]:
# Word cloud of the first 25 vocabulary entries of the true-news model
# (presumably index_to_key is ordered by descending frequency, which is what
# makes these the "most common" words — TODO confirm).
generate_word_cloud(word2vec_model.wv.index_to_key[:25])

In [None]:
# Put the train and test parts of the fake-news texts back together, split
# each text into tokens and train a Word2Vec model that ignores words seen
# fewer than 2 times.
X_fake = pd.concat((x_train_fake, x_test_fake), axis=0)
sentences_fake = [document.split() for document in X_fake]
word2vec_model_f = Word2Vec(sentences_fake, min_count=2)

# 25 palabras más comunes en Fake.csv

In [None]:
# Word cloud of the first 25 vocabulary entries of the fake-news model
# (presumably index_to_key is ordered by descending frequency, which is what
# makes these the "most common" words — TODO confirm).
generate_word_cloud(word2vec_model_f.wv.index_to_key[:25])

Usando la función `most_similar`, buscamos en cada modelo las palabras más similares a «trump», una de las palabras que más se repiten tanto en las noticias falsas como en las verdaderas.



In [None]:
# Display the raw most_similar output for 'trump' in the true-news model.
word2vec_model.wv.most_similar('trump')

In [None]:
# Plot the words closest to 'trump' in the true-news model. Each
# (word, score) pair becomes one slice, with the score used as slice size.
true_trump_words = word2vec_model.wv.most_similar('trump')
generate_pie_chart(true_trump_words, "Palabras asociadas a Trump en True.csv")

In [None]:
# Display the raw most_similar output for 'trump' in the fake-news model.
word2vec_model_f.wv.most_similar('trump')

In [None]:
# Plot the words closest to 'trump' in the fake-news model. Each
# (word, score) pair becomes one slice, with the score used as slice size.
fake_trump_words = word2vec_model_f.wv.most_similar('trump')
generate_pie_chart(fake_trump_words, "Palabras asociadas a Trump en Fake.csv")

In [None]:
# zip(*pairs) transposes the (word, score) pairs; element 0 is the tuple of
# words alone, without their scores.
list(zip(*word2vec_model_f.wv.most_similar('trump')))[0]

Ahora, apoyándonos en los resultados obtenidos en la investigación con `CountVectorizer`, obtendremos las palabras relacionadas con cada una de las palabras que más aparecieron en cada tipo de noticia.

In [None]:
# Most frequent words of the fake news, taken from the CountVectorizer
# analysis mentioned in the text above; each one is queried against the
# fake-news Word2Vec model.
fake_keywords = [
    'medium', 'donald', 'black', 'video',
    'woman', 'com', 'featured', 'news', 
    'america', 'twitter', 'obama', 'time',
    'know', 'clinton', 'american', 'people',
    'hillary', 'like', 'image', 'trump'
]

# Most frequent words of the true news, from the same analysis; each one is
# queried against the true-news Word2Vec model.
true_keywords = [
    'said', 'reuters', 'state', 'government',
    'minister', 'official', 'united', 'china',
    'north', 'washington', 'party', 'republican',
    'leader', 'korea', 'tax', 'wednesday', 
    'house', 'tuesday', 'percent', 'senate',
]

In [None]:
# Map every frequent fake-news word to the words the fake-news model ranks
# as most similar to it.
fake = dict()
for word in fake_keywords:
  # Query the model once and reuse the result for both the dict and the
  # print. zip(*pairs) transposes the (word, score) pairs, so element 0 is
  # the tuple of words without their scores.
  similar_words = list(zip(*word2vec_model_f.wv.most_similar(word)))[0]
  fake[word] = similar_words
  print(word, similar_words)

In [None]:
# Map every frequent true-news word to the words the true-news model ranks
# as most similar to it.
true = dict()
for word in true_keywords:
  # Query the model once and reuse the result for both the dict and the
  # print. zip(*pairs) transposes the (word, score) pairs, so element 0 is
  # the tuple of words without their scores.
  similar_words = list(zip(*word2vec_model.wv.most_similar(word)))[0]
  true[word] = similar_words
  print(word, similar_words)

En general, podemos apreciar que las noticias verdaderas usan palabras que mantienen un contexto más objetivo que las de las noticias falsas.