### Librerias Usadas.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD, PCA
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec
from scipy.sparse import csr_matrix, hstack
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import tensorflow as tf
import re, string, nltk
# Descarga de recursos
nltk.download('wordnet')
nltk.download('punkt')
tf.test.gpu_device_name()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


'/device:GPU:0'

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Lectura de los datos

In [3]:
dataFake = pd.read_csv("/content/drive/MyDrive/Fake.csv")
dataFake["class"] = 0
print("Fake: ",dataFake.shape)

dataTrue = pd.read_csv("/content/drive/MyDrive/True.csv")
dataTrue["class"] = 1
print("True: ",dataTrue.shape)

data_merge = pd.concat([dataFake,dataTrue], axis=0)
data = data_merge.drop(["title","subject","date"], axis=1)
print("All data: ",data.shape)

Fake:  (23481, 5)
True:  (21417, 5)
All data:  (44898, 2)


## Limpieza

In [4]:
'''
  Función que elimina los símbolos especiales de un texto,
  así como las stopwords
'''
def word_cleaner(text):
  text = text.lower()
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('\\W', ' ', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('<.*?>+', '', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub('\n', '', text)
  text = re.sub('\w*\d\w*', '', text)
  return remove_stopwords(text)

'''
  Función que dado un texto, lo limpia y elimina las letras aisladas existentes.
'''
def text_cleaner(text, lemmatizer):
  text = word_cleaner(text)
  tokens = word_tokenize(text)
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
  text = remove_letters(lemmatized_tokens)
  text = " ".join(text)
  return text

'''
  Funcion que elimina las palabras con lengitud menor a length
'''
def remove_letters(lemas, length=2):
  return [word for word in lemas if len(word)>length]

'''
  Funcion que cuenta la frecuencia de palabras en el dataset
'''
def count_tokens(texts, wf):
  for text in texts:
    tokens = text.split()
    wf.update(tokens)
  return wf

'''
  Funcion que dado el dataset y la lista de palabras que no tienen una 
  frecuencua valida las elimina del dataset
'''
def remove_max_min_words_freq(texts, words_to_remove):
  filtered_texts = []
  for text in texts:
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in words_to_remove]
    filtered_text = ' '.join(filtered_tokens)
    filtered_texts.append(filtered_text)
  return filtered_texts

'''
  Funcion que obtiene la lista de palabras a eliminar
'''
def get_words_to_remove(min_freq =2, max_freq=1000, word_freq=None):
  return [word for word, freq in word_freq.items() if freq < min_freq or freq > max_freq]

### Aplicación de limpieza a datos



In [5]:
clean_data = data.copy()

In [6]:
lemmatizer = WordNetLemmatizer()
clean_data["text"] = clean_data['text'].apply(text_cleaner, args=(lemmatizer,))

In [7]:
clean_fake = dataFake.copy()
lemmatizer = WordNetLemmatizer()
clean_fake['text'] = clean_fake['text'].apply(text_cleaner, args=(lemmatizer,))

In [8]:
clean_true = dataTrue.copy()
lemmatizer = WordNetLemmatizer()
clean_true['text'] = clean_true['text'].apply(text_cleaner, args=(lemmatizer,))

## Separación de datos de entrenamiento y de prueba

In [None]:
SEED = 123456789

x = clean_data['text']
y = clean_data['class']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=SEED)

print(f"Datos de entrenamiento: {len(x_train)} ({len(x_train)/len(x):%})")
print(f"Datos de prueba: \t{len(x_test)} ({len(x_test)/len(x):%})")

Datos de entrenamiento: 33673 (74.998886%)
Datos de prueba: 	11225 (25.001114%)


In [9]:
SEED = 123456789

x_true = clean_true['text']
y_true = clean_true['class']

x_train_true, x_test_true, y_train_true, y_test_true = train_test_split(x_true, y_true, test_size=0.25, random_state=SEED)

print(f"Datos de entrenamiento: {len(x_train_true)} ({len(x_train_true)/len(x_true):%})")
print(f"Datos de prueba: \t{len(x_test_true)} ({len(x_test_true)/len(x_true):%})")

Datos de entrenamiento: 16062 (74.996498%)
Datos de prueba: 	5355 (25.003502%)


In [15]:
x_fake = clean_fake['text']
y_fake = clean_fake['class']

x_train_fake, x_test_fake, y_train_fake, y_test_fake = train_test_split(x_fake, y_fake, test_size=0.25, random_state=SEED)

print(f"Datos de entrenamiento: {len(x_train_fake)} ({len(x_train_fake)/len(x_fake):%})")
print(f"Datos de prueba: \t{len(x_test_fake)} ({len(x_test_fake)/len(x_fake):%})")

Datos de entrenamiento: 17610 (74.996806%)
Datos de prueba: 	5871 (25.003194%)


### Vectorización por Word2Vec


In [11]:
X = pd.concat([x_train_true, x_test_true], axis=0)
sentences = [text.split() for text in X]
word2vec_model = Word2Vec(sentences, min_count=2)

In [None]:
word2vec_model.wv.index_to_key[:100]

In [14]:
word2vec_model.wv.most_similar('trump')

[('elect', 0.6293635964393616),
 ('surprise', 0.489088237285614),
 ('republican', 0.4756985902786255),
 ('obama', 0.4649403989315033),
 ('washington', 0.45603862404823303),
 ('ayer', 0.45538219809532166),
 ('romney', 0.45269158482551575),
 ('clinton', 0.44878089427948),
 ('rubio', 0.43388909101486206),
 ('bush', 0.4323711097240448)]

In [16]:
X_fake = pd.concat([x_train_fake, x_test_fake], axis=0)
sentences_fake = [text.split() for text in X_fake]
word2vec_model_f = Word2Vec(sentences_fake, min_count=2)

In [17]:
word2vec_model_f.wv.most_similar('trump')

[('elect', 0.634766161441803),
 ('hasn', 0.5471111536026001),
 ('actually', 0.5372609496116638),
 ('proving', 0.5154432058334351),
 ('conway', 0.501865029335022),
 ('pathetic', 0.49315014481544495),
 ('loyal', 0.481100469827652),
 ('yammer', 0.4697333872318268),
 ('insisting', 0.4696291387081146),
 ('embarrassing', 0.46709489822387695)]

In [18]:
words_true = set(word2vec_model.wv.index_to_key)
X_train_vect_true = np.array([np.array([word2vec_model.wv[i] for i in ls if i in words_true]) for ls in x_train_true])
X_test_vect_true = np.array([np.array([word2vec_model.wv[i] for i in ls if i in words_true]) for ls in x_test_true])

In [29]:
X_train_vect_avg_true = []
for v in X_train_vect_true:
    if v.size:
        X_train_vect_avg_true.append(v.mean(axis=0))
    else:
        X_train_vect_avg_true.append(np.zeros(1, dtype=float))
        
X_test_vect_avg_true = []
for v in X_test_vect_true:
    if v.size:
        X_test_vect_avg_true.append(v.mean(axis=0))
    else:
        X_test_vect_avg_true.append(np.zeros(1, dtype=float))

In [30]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg_true, y_train_true.values.ravel())

In [31]:
y_pred = rf_model.predict(X_test_vect_avg_true)

In [32]:
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test_true, y_pred)
recall = recall_score(y_test_true, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test_true).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 1.0 / Accuracy: 1.0


In [38]:
words_fake = set(word2vec_model_f.wv.index_to_key)
X_train_vect_fake = np.array([np.array([word2vec_model_f.wv[i] for i in ls if i in words_fake]) for ls in x_train_fake])
X_test_vect_fake = np.array([np.array([word2vec_model_f.wv[i] for i in ls if i in words_fake]) for ls in x_test_fake])

X_train_vect_avg_fake = []
for v in X_train_vect_fake:
    if v.size:
        X_train_vect_avg_fake.append(v.mean(axis=0))
    else:
        X_train_vect_avg_fake.append(np.zeros(10, dtype=float))
        
X_test_vect_avg_fake = []
for v in X_test_vect_fake:
    if v.size:
        X_test_vect_avg_fake.append(v.mean(axis=0))
    else:
        X_test_vect_avg_fake.append(np.zeros(10, dtype=float))

In [39]:
rf_f = RandomForestClassifier()
rf_model_f = rf_f.fit(X_train_vect_avg_fake, y_train_fake.values.ravel())

y_pred_f = rf_model_f.predict(X_test_vect_avg_fake)

In [40]:
precision_f = precision_score(y_test_fake, y_pred_f)
recall_f = recall_score(y_test_fake, y_pred_f)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision_f, 3), round(recall_f, 3), round((y_pred_f == y_test_fake).sum()/len(y_pred_f), 3)))

Precision: 0.0 / Recall: 0.0 / Accuracy: 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
