### Librerias Usadas.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD, PCA
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec
from scipy.sparse import csr_matrix, hstack
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import tensorflow as tf
import re, string, nltk
# Descarga de recursos
nltk.download('wordnet')
nltk.download('punkt')
tf.test.gpu_device_name()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


''

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Lectura de los datos

In [4]:
dataFake = pd.read_csv("/content/drive/MyDrive/Fake.csv")
dataFake["class"] = 0
print("Fake: ",dataFake.shape)

dataTrue = pd.read_csv("/content/drive/MyDrive/True.csv")
dataTrue["class"] = 1
print("True: ",dataTrue.shape)

data_merge = pd.concat([dataFake,dataTrue], axis=0)
data = data_merge.drop(["title","subject","date"], axis=1)
print("All data: ",data.shape)

Fake:  (23481, 5)
True:  (21417, 5)
All data:  (44898, 2)


## Limpieza

In [5]:
'''
  Función que elimina los símbolos especiales de un texto,
  así como las stopwords
'''
def word_cleaner(text):
  text = text.lower()
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('\\W', ' ', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub('\n', '', text)
  text = re.sub('\w*\d\w*', '', text)
  return remove_stopwords(text)

'''
  Función que dado un texto, lo limpia y elimina las letras aisladas existentes.
'''
def text_cleaner(text, lemmatizer):
  text = word_cleaner(text)
  tokens = word_tokenize(text)
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
  text = remove_letters(lemmatized_tokens)
  text = " ".join(text)
  return text

'''
  Funcion que elimina las palabras con lengitud menor a length
'''
def remove_letters(lemas, length=2):
  return [word for word in lemas if len(word)>length]

'''
  Funcion que cuenta la frecuencia de palabras en el dataset
'''
def count_tokens(texts, wf):
  for text in texts:
    tokens = text.split()
    wf.update(tokens)
  return wf

'''
  Funcion que dado el dataset y la lista de palabras que no tienen una 
  frecuencua valida las elimina del dataset
'''
def remove_max_min_words_freq(texts, words_to_remove):
  filtered_texts = []
  for text in texts:
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in words_to_remove]
    filtered_text = ' '.join(filtered_tokens)
    filtered_texts.append(filtered_text)
  return filtered_texts

'''
  Funcion que obtiene la lista de palabras a eliminar
'''
def get_words_to_remove(min_freq =2, max_freq=1000, word_freq=None):
  return [word for word, freq in word_freq.items() if freq < min_freq or freq > max_freq]

### Aplicación de limpieza a datos



In [None]:
clean_data = data.copy()

In [None]:
lemmatizer = WordNetLemmatizer()
clean_data["text"] = clean_data['text'].apply(text_cleaner, args=(lemmatizer,))

In [6]:
clean_fake = dataFake.copy()
lemmatizer = WordNetLemmatizer()
clean_fake['text'] = clean_fake['text'].apply(text_cleaner, args=(lemmatizer,))

In [7]:
clean_true = dataTrue.copy()
lemmatizer = WordNetLemmatizer()
clean_true['text'] = clean_true['text'].apply(text_cleaner, args=(lemmatizer,))

## Separación de datos de entrenamiento y de prueba

In [None]:
SEED = 123456789

x = clean_data['text']
y = clean_data['class']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=SEED)

print(f"Datos de entrenamiento: {len(x_train)} ({len(x_train)/len(x):%})")
print(f"Datos de prueba: \t{len(x_test)} ({len(x_test)/len(x):%})")

In [8]:
SEED = 123456789

x_true = clean_true['text']
y_true = clean_true['class']

x_train_true, x_test_true, y_train_true, y_test_true = train_test_split(x_true, y_true, test_size=0.25, random_state=SEED)

print(f"Datos de entrenamiento: {len(x_train_true)} ({len(x_train_true)/len(x_true):%})")
print(f"Datos de prueba: \t{len(x_test_true)} ({len(x_test_true)/len(x_true):%})")

Datos de entrenamiento: 16062 (74.996498%)
Datos de prueba: 	5355 (25.003502%)


In [9]:
x_fake = clean_fake['text']
y_fake = clean_fake['class']

x_train_fake, x_test_fake, y_train_fake, y_test_fake = train_test_split(x_fake, y_fake, test_size=0.25, random_state=SEED)

print(f"Datos de entrenamiento: {len(x_train_fake)} ({len(x_train_fake)/len(x_fake):%})")
print(f"Datos de prueba: \t{len(x_test_fake)} ({len(x_test_fake)/len(x_fake):%})")

Datos de entrenamiento: 17610 (74.996806%)
Datos de prueba: 	5871 (25.003194%)


### Word2Vec


In [10]:
X = pd.concat([x_train_true, x_test_true], axis=0)
sentences = [text.split() for text in X]
word2vec_model = Word2Vec(sentences, min_count=2)

In [None]:
word2vec_model.wv.index_to_key[:100]

In [11]:
X_fake = pd.concat([x_train_fake, x_test_fake], axis=0)
sentences_fake = [text.split() for text in X_fake]
word2vec_model_f = Word2Vec(sentences_fake, min_count=2)

Usando la función `most_similar`, buscamos para cada modelo las palabras relacionadas a una de las que más repeticiones tuvieron (tanto en las noticias falsas como las verdaderas)



In [12]:
word2vec_model.wv.most_similar('trump')

[('elect', 0.6491174697875977),
 ('washington', 0.485962450504303),
 ('obama', 0.48377183079719543),
 ('bush', 0.4748460054397583),
 ('republican', 0.4728362262248993),
 ('surprise', 0.4660532474517822),
 ('clinton', 0.46550247073173523),
 ('incoming', 0.45599403977394104),
 ('presumptive', 0.45119884610176086),
 ('cruz', 0.4486404061317444)]

In [13]:
word2vec_model_f.wv.most_similar('trump')

[('elect', 0.6708611845970154),
 ('actually', 0.5508329272270203),
 ('hasn', 0.5336906909942627),
 ('amateur', 0.5105623006820679),
 ('conway', 0.501033365726471),
 ('repeatedly', 0.49410584568977356),
 ('pathetic', 0.4832920730113983),
 ('embarrassing', 0.47700735926628113),
 ('brag', 0.4723009467124939),
 ('instead', 0.47010910511016846)]

In [14]:
list(zip(*word2vec_model_f.wv.most_similar('trump')))[0]

('elect',
 'actually',
 'hasn',
 'amateur',
 'conway',
 'repeatedly',
 'pathetic',
 'embarrassing',
 'brag',
 'instead')

Ahora, apoyados de los resultados obtenidos en la investigación de `CountVectorizer`, se obtendrán las palabras relacionadas para cada una de las que más aparecieron en cada tipo de noticias.

In [33]:
fake_keywords = [
    'medium', 'donald', 'black', 'video',
    'woman', 'com', 'featured', 'news', 
    'america', 'twitter', 'obama', 'time',
    'know', 'clinton', 'american', 'people',
    'hillary', 'like', 'image', 'trump'
]

true_keywords = [
    'said', 'reuters', 'state', 'government',
    'minister', 'official', 'united', 'china',
    'north', 'washington', 'party', 'republican',
    'leader', 'korea', 'tax', 'wednesday', 
    'house', 'tuesday', 'percent', 'senate',
]

In [34]:
fake = dict()
for word in fake_keywords:
  #r.append(list(zip(*word2vec_model_f.wv.most_similar(word)))[0])
  fake[word] =  list(zip(*word2vec_model_f.wv.most_similar(word)))[0]
  print(word, list(zip(*word2vec_model_f.wv.most_similar(word)))[0])

medium ('trollinga', 'occasioned', 'waht', 'presentable', 'networking', 'macroeconomics', 'irredeemable', 'problemthough', 'gravitating', 'contagians')
donald ('humiliating', 'candidacy', 'elect', 'surrogate', 'terrifying', 'pathetic', 'trashing', 'bluffing', 'supporter', 'proving')
black ('color', 'assanta', 'cop', 'hispanic', 'angry', 'african', 'lynching', 'young', 'oppressed', 'racial')
video ('clip', 'mediatate', 'footage', 'hamish', 'recording', 'prankster', 'audio', 'teamyoutube', 'reopens', 'hedgethis')
woman ('men', 'sexually', 'female', 'kissing', 'rape', 'girl', 'nationalismthe', 'harassed', 'empowe', 'angriest')
com ('http', 'braddjaffy', 'jaffy', 'huffpostpol', 'caplan', 'jacobnbc', 'jenniferjjacobs', 'joshdcaplan', 'clayaiken', 'yashar')
featured ('hopefully', 'twittertwitterfeatured', 'dailypoliticsfeatured', 'rest', 'twitterwe', 'twitterfeatured', 'bet', 'anymore', 'scrolled', 'definitely')
news ('newswatch', 'newshere', 'mccordsville', 'zedillo', 'newsbelow', 'newssorr

In [35]:
true = dict()
for word in true_keywords:
  #r.append(list(zip(*word2vec_model_f.wv.most_similar(word)))[0])
  true[word] =  list(zip(*word2vec_model.wv.most_similar(word)))[0]
  print(word, list(zip(*word2vec_model.wv.most_similar(word)))[0])

said ('added', 'told', 'adding', 'saying', 'noted', 'wrote', 'suggested', 'asked', 'described', 'acknowledged')
reuters ('morseby', 'bacalar', 'reporter', 'creighton', 'bfm', 'inquirer', 'based', 'transcribed', 'telephone', 'odell')
state ('nation', 'kingdom', 'exportation', 'emirate', 'steelworker', 'continental', 'spiriting', 'country', 'frostier', 'treasury')
government ('authority', 'participation', 'autonomy', 'failing', 'hawiye', 'extension', 'krg', 'partial', 'financing', 'sufficient')
minister ('minster', 'ministerial', 'ministry', 'exponent', 'yue', 'kipp', 'theresamay', 'premier', 'bnd', 'drian')
official ('source', 'diplomat', 'aide', 'anonymity', 'authority', 'zentaro', 'authorized', 'personnel', 'requested', 'separately')
united ('gulf', 'islamic', 'reactivate', 'galmudug', 'koro', 'dreadful', 'isolated', 'vassal', 'duma', 'bessho')
china ('beijing', 'chinese', 'taiwan', 'india', 'japan', 'vietnam', 'strait', 'bilateral', 'taipei', 'korea')
north ('dpr', 'south', 'peninsul

En general podemos apreciar que las noticias verdaderas usan palabras que mantienen un contexto más objetivo respecto a las falsas.