In [6]:
import os
import pandas as pd
from zipfile import ZipFile
import matplotlib.pyplot as plt
import seaborn as sns
import platform
from keras.preprocessing.text import text_to_word_sequence
import multiprocessing
from gensim.models import Word2Vec

In [14]:
def download_file(file_name, base_url, destination_folder='./fake_news_dataset'):
    destination_path = os.path.join(destination_folder, file_name)
    if not os.path.exists(destination_path):
        full_url = os.path.join(base_url, file_name)
        if platform.system() == 'Windows':
            os.system(f'curl {full_url} -o {file_name}')
        else:
            os.system(f'wget {full_url} -O {file_name}')
        os.makedirs(destination_folder, exist_ok=True)
        os.rename(file_name, destination_path)
    else:
        print(f"The file {file_name} is already downloaded")

def load_data(destination_folder='./fake_news_dataset'):
    df_fake = pd.read_csv(os.path.join(destination_folder, 'Fake.csv'))
    df_true = pd.read_csv(os.path.join(destination_folder, 'True.csv'))

    for df in [df_fake, df_true]:
        df['content'] = df.apply(lambda row: row['title'] + ' ' + row['text'] if not row['title'].endswith('.') else row['title'] + row['text'], axis=1)
        df.drop(columns=['date', 'subject', 'title', 'text'], inplace=True)

    return df_fake, df_true

def tokenize_content(df):
    return [text_to_word_sequence(content) for content in df['content']]

In [15]:
url_base = 'https://raw.githubusercontent.com/sevann-radhak/procesamiento_lenguaje_natural/main/clase_2/ejercicios/raw'
download_file('Fake.csv', url_base)
download_file('True.csv', url_base)

The file Fake.csv is already downloaded
The file True.csv is already downloaded


In [16]:
df_fake, df_true = load_data()

tokens_fake = tokenize_content(df_fake)
tokens_true = tokenize_content(df_true)

In [17]:
df_fake = pd.read_csv('./fake_news_dataset/Fake.csv')
df_true = pd.read_csv('./fake_news_dataset/True.csv')

In [18]:
print(df_fake.head(5))

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [19]:
print(df_true.head(5))

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   


In [20]:
# Combinar las columnas title y text en una sola columna llamada content
df_fake['content'] = df_fake.apply(lambda row: row['title'] + '.' + row['text'] if not row['title'].endswith('.') else row['title'] + ' ' + row['text'], axis=1)
df_true['content'] = df_true.apply(lambda row: row['title'] + '.' + row['text'] if not row['title'].endswith('.') else row['title'] + ' ' + row['text'], axis=1)

# Eliminar las columnas date y subject
df_fake.drop(columns=['date', 'subject', 'title', 'text'], inplace=True)
df_true.drop(columns=['date', 'subject', 'title', 'text'], inplace=True)

In [25]:
print(df_fake.head(5))
print(f"\nTotal fake docs: {df_fake.shape[0]}")

                                             content
0   Donald Trump Sends Out Embarrassing New Year’...
1   Drunk Bragging Trump Staffer Started Russian ...
2   Sheriff David Clarke Becomes An Internet Joke...
3   Trump Is So Obsessed He Even Has Obama’s Name...
4   Pope Francis Just Called Out Donald Trump Dur...

Total fake docs: 23481


In [26]:
print(df_true.head(5))
print(f"\nTotal true docs: {df_true.shape[0]}")

                                             content
0  As U.S. budget fight looms, Republicans flip t...
1  U.S. military to accept transgender recruits o...
2  Senior U.S. Republican senator: 'Let Mr. Muell...
3  FBI Russia probe helped by Australian diplomat...
4  Trump wants Postal Service to charge 'much mor...

Total true docs: 21417


In [27]:
from keras.preprocessing.text import text_to_word_sequence

def tokenize_dataframe_content(df):
    return [text_to_word_sequence(content) for content in df['content']]

In [28]:
fake_news_tokens = tokenize_dataframe_content(df_fake)
true_news_tokens = tokenize_dataframe_content(df_true)

In [29]:
fake_news_tokens[:1]

[['donald',
  'trump',
  'sends',
  'out',
  'embarrassing',
  'new',
  'year’s',
  'eve',
  'message',
  'this',
  'is',
  'disturbing',
  'donald',
  'trump',
  'just',
  'couldn',
  't',
  'wish',
  'all',
  'americans',
  'a',
  'happy',
  'new',
  'year',
  'and',
  'leave',
  'it',
  'at',
  'that',
  'instead',
  'he',
  'had',
  'to',
  'give',
  'a',
  'shout',
  'out',
  'to',
  'his',
  'enemies',
  'haters',
  'and',
  'the',
  'very',
  'dishonest',
  'fake',
  'news',
  'media',
  'the',
  'former',
  'reality',
  'show',
  'star',
  'had',
  'just',
  'one',
  'job',
  'to',
  'do',
  'and',
  'he',
  'couldn',
  't',
  'do',
  'it',
  'as',
  'our',
  'country',
  'rapidly',
  'grows',
  'stronger',
  'and',
  'smarter',
  'i',
  'want',
  'to',
  'wish',
  'all',
  'of',
  'my',
  'friends',
  'supporters',
  'enemies',
  'haters',
  'and',
  'even',
  'the',
  'very',
  'dishonest',
  'fake',
  'news',
  'media',
  'a',
  'happy',
  'and',
  'healthy',
  'new',
  'yea

In [30]:
true_news_tokens[:1]

[['as',
  'u',
  's',
  'budget',
  'fight',
  'looms',
  'republicans',
  'flip',
  'their',
  'fiscal',
  'script',
  'washington',
  'reuters',
  'the',
  'head',
  'of',
  'a',
  'conservative',
  'republican',
  'faction',
  'in',
  'the',
  'u',
  's',
  'congress',
  'who',
  'voted',
  'this',
  'month',
  'for',
  'a',
  'huge',
  'expansion',
  'of',
  'the',
  'national',
  'debt',
  'to',
  'pay',
  'for',
  'tax',
  'cuts',
  'called',
  'himself',
  'a',
  '“fiscal',
  'conservative”',
  'on',
  'sunday',
  'and',
  'urged',
  'budget',
  'restraint',
  'in',
  '2018',
  'in',
  'keeping',
  'with',
  'a',
  'sharp',
  'pivot',
  'under',
  'way',
  'among',
  'republicans',
  'u',
  's',
  'representative',
  'mark',
  'meadows',
  'speaking',
  'on',
  'cbs’',
  '“face',
  'the',
  'nation',
  '”',
  'drew',
  'a',
  'hard',
  'line',
  'on',
  'federal',
  'spending',
  'which',
  'lawmakers',
  'are',
  'bracing',
  'to',
  'do',
  'battle',
  'over',
  'in',
  'janua

In [31]:
from gensim.models.callbacks import CallbackAny2Vec
class callback(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print(f'Loss after epoch {self.epoch}: {loss - self.loss_previous_step}')
        self.epoch += 1
        self.loss_previous_step = loss

In [35]:
w2v_fake_model = Word2Vec(min_count=5,
                     window=2,
                     vector_size=300,
                     negative=20,
                     workers=multiprocessing.cpu_count(),
                     sg=1,
                     alpha=0.01,
                     min_alpha=0.0001)

w2v_true_model = Word2Vec(min_count=5,
                     window=2,
                     vector_size=300,
                     negative=20,
                     workers=multiprocessing.cpu_count(),
                     sg=1,
                     alpha=0.01,
                     min_alpha=0.0001)

In [36]:
w2v_fake_model.build_vocab(fake_news_tokens)
w2v_true_model.build_vocab(true_news_tokens)

In [37]:
print(f"Total of fake docs in corpus: {w2v_fake_model.corpus_count}")
print(f"Total of true docs in corpus: {w2v_true_model.corpus_count}")

Total of fake docs in corpus: 23481
Total of true docs in corpus: 21417


In [38]:
print(f"Total of distinct words in corpus (fake):", len(w2v_fake_model.wv.index_to_key))
print(f"Total of distinct words in corpus (true):", len(w2v_true_model.wv.index_to_key))

Total of distinct words in corpus (fake): 36929
Total of distinct words in corpus (true): 31615


In [39]:
w2v_fake_model.train(fake_news_tokens,
                 total_examples=w2v_fake_model.corpus_count,
                 epochs=20,
                 compute_loss=True,
                 callbacks=[callback()])

Loss after epoch 0: 34191784.0
Loss after epoch 1: 31967132.0
Loss after epoch 2: 8160900.0
Loss after epoch 3: 7423784.0
Loss after epoch 4: 7365152.0
Loss after epoch 5: 7283480.0
Loss after epoch 6: 7231296.0
Loss after epoch 7: 7174336.0
Loss after epoch 8: 7132000.0
Loss after epoch 9: 7014496.0
Loss after epoch 10: 6940216.0
Loss after epoch 11: 2333152.0
Loss after epoch 12: 0.0
Loss after epoch 13: 0.0
Loss after epoch 14: 0.0
Loss after epoch 15: 0.0
Loss after epoch 16: 0.0
Loss after epoch 17: 0.0
Loss after epoch 18: 0.0
Loss after epoch 19: 0.0


(166335726, 211941620)

In [40]:
w2v_true_model.train(true_news_tokens,
                 total_examples=w2v_true_model.corpus_count,
                 epochs=20,
                 compute_loss=True,
                 callbacks=[callback()])

Loss after epoch 0: 27596538.0
Loss after epoch 1: 24277890.0
Loss after epoch 2: 17299172.0
Loss after epoch 3: 6221272.0
Loss after epoch 4: 6142440.0
Loss after epoch 5: 6115944.0
Loss after epoch 6: 6044944.0
Loss after epoch 7: 5961736.0
Loss after epoch 8: 5875024.0
Loss after epoch 9: 5845608.0
Loss after epoch 10: 5762272.0
Loss after epoch 11: 5706888.0
Loss after epoch 12: 5683488.0
Loss after epoch 13: 5630280.0
Loss after epoch 14: 54232.0
Loss after epoch 15: 0.0
Loss after epoch 16: 0.0
Loss after epoch 17: 0.0
Loss after epoch 18: 0.0
Loss after epoch 19: 0.0


(135088592, 172622820)

### PALABRAS MAS RELACIONADAS CON...

In [44]:
def compare_most_similar(word, topn=10):
    fake_similar = w2v_fake_model.wv.most_similar(positive=[word], topn=topn)
    true_similar = w2v_true_model.wv.most_similar(positive=[word], topn=topn)

    print(f"Comparing most similar words for: '{word}'\n")
    print(f"{'Fake News':<20}{'True News':<20}")
    print(f"{'-'*40}")
    for fake_word, true_word in zip(fake_similar, true_similar):
        print(f"{fake_word[0]:<20}{true_word[0]:<20}")

In [45]:
compare_most_similar("media", topn=10)

Comparing most similar words for: 'media'

Fake News           True News           
----------------------------------------
mainstream          newspapers          
outlets             outlets             
dutiful             networking          
braindead           “recent             
medias              “chinese            
blogs               outlet              
storyline           zbc                 
media”              brandwatch          
technocrats         mizan               
hypocritically      television          


In [77]:
compare_most_similar("donald", topn=10)

Comparing most similar words for: 'donald'

Fake News           True News           
----------------------------------------
trump               trump               
2017donald          presumptive         
2016donald          vietnam's           
“president           trump              
impeaching          “unfit”             
majorly             insane              
cthulhu             anoint              
pussygrabber        disavowal           
bloviating          'dangerous'         
hesitance           varona              


In [80]:
compare_most_similar("sex", topn=10)

Comparing most similar words for: 'sex'

Fake News           True News           
----------------------------------------
anal                marriage            
interracial         homosexual          
heterosexual        heterosexual        
offender            adultery            
infidelity          reassignment        
molestation         sexual              
forcible            unprotected         
marital             abuser              
molesters           consensual          
oral                couples             


In [54]:
compare_most_similar("macri", topn=10)

Comparing most similar words for: 'macri'

Fake News           True News           
----------------------------------------
mauricio            mauricio            
duda                bachelet            
argentine           cristina            
jinping             cambiemos           
martelly            fernandez           
pussygrabber        argentina's         
yugoslav            sirisena            
jae                 muhammadu           
nicol               duda                
xi                  temer               


In [129]:
compare_most_similar("politics", topn=10)

Comparing most similar words for: 'politics'

Fake News           True News           
----------------------------------------
shushwalshe         demographics        
rancor              greatness           
injecting           contemporary        
fumbles             brazenly            
compost             quintessential      
jingoism            tinge               
lifetimes           magnanimous         
disintegrated       tenor               
populism            panicking           
platitudes          polarized           


In [131]:
vector_fake = w2v_fake_model.wv.get_vector("politics")
print(vector_fake)

[ 8.33607972e-01  2.72159189e-01 -3.54798198e-01  1.00868323e-03
 -3.86221498e-01 -4.26930152e-02  8.15578401e-02  2.73480415e-01
 -9.98178795e-02 -2.66481906e-01 -1.70828164e-01  6.83239251e-02
  6.52635470e-02  5.50361313e-02  2.88055748e-01  3.92540753e-01
  2.40204215e-01 -3.68824929e-01 -8.19002166e-02 -2.92589694e-01
 -3.19968998e-01 -1.69725209e-01  4.44329113e-01  1.35942791e-02
  1.83561265e-01 -2.29496717e-01  1.81558147e-01  4.70433325e-01
 -2.10360229e-01  2.92138517e-01  3.93867306e-02  1.94489896e-01
 -2.08154321e-01 -8.83649066e-02  1.85477540e-01  3.61413583e-02
  6.02727123e-02  3.42518210e-01 -8.13904524e-01  6.96309060e-02
 -1.87848195e-01  4.02047098e-01  2.06271604e-01  1.32631838e-01
  2.12970391e-01  2.58039981e-01  1.31209061e-01 -4.00420338e-01
  9.00510773e-02  7.73150146e-01 -2.50560611e-01 -3.53874564e-02
 -1.63522899e-01  2.74316490e-01  7.91798294e-01  8.07319731e-02
  4.48265821e-01 -4.92908806e-01  3.02883446e-01  3.71947140e-01
  3.18499833e-01 -3.64918

In [133]:
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE
import numpy as np

def reduce_dimensions(model, num_dimensions = 2 ):

    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    return vectors, labels

In [134]:
import plotly.graph_objects as go
import plotly.express as px

vecs_fake, labels_fake = reduce_dimensions(w2v_fake_model)

MAX_WORDS=200
fig = px.scatter(x=vecs_fake[:MAX_WORDS,0], y=vecs_fake[:MAX_WORDS,1], text=labels_fake[:MAX_WORDS])
fig.show(renderer="colab")

In [135]:
vecs_true, labels_true = reduce_dimensions(w2v_true_model)

MAX_WORDS=200
fig = px.scatter(x=vecs_true[:MAX_WORDS,0], y=vecs_true[:MAX_WORDS,1], text=labels_true[:MAX_WORDS])
fig.show(renderer="colab")