In [1]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report


pd.set_option('display.max_columns', None)
train_data, test_data = imdb_dataset(train=True, test=True)
df = pd.read_csv("fake.csv")
df = df[['text', 'type']]
print(len(df))


from collections import Counter 

print(Counter(df['type'].values))


df = df[df['type'].isin(['fake', 'satire'])]
df.dropna(inplace = True)
df_fake = df[df['type'] == 'fake'] 
df_statire = df[df['type'] == 'satire'] 
df_statire = df_statire.sample(n=len(df_fake))
df = df_statire.append(df_fake)
df = df.sample(frac=1, random_state = 24).reset_index(drop=True)

print(Counter(df['type'].values))



12999
Counter({'bs': 11492, 'bias': 443, 'conspiracy': 430, 'hate': 246, 'satire': 146, 'state': 121, 'junksci': 102, 'fake': 19})
Counter({'fake': 19, 'satire': 19})


In [2]:
train_data = df.head(19)
test_data = df.tail(19)


train_data = [{'text': text, 'type': type_data } for text in list(train_data['text']) for type_data in list(train_data['type'])]
test_data = [{'text': text, 'type': type_data } for text in list(test_data['text']) for type_data in list(test_data['type'])]
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['type']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['type']), test_data)))
print(train_texts)

('adobochron 1 Comment Moyers \nWASHINGTON, D.C. ( The Adobo Chronicles, Washington Bureau) – Billy Don “Bill” Moyers is an American journalist and political commentator who served as White House Press Secretary in the Johnson administration from 1965 to 1967. He also worked as a network TV news commentator for ten years. Moyers has been extensively involved with public broadcasting, producing documentaries and news journal programs. He has won numerous awards and honorary degrees for his investigative journalism and civic activities and has become well known as a trenchant critic of the corporately structured U.S. news media. \nWell, he is the first high-profile American to flee the U.S. as a result of the election of Donald Trump as the 45th president of the United States. \nHere is Moyers’“ Farewell to America. ”', 'adobochron 1 Comment Moyers \nWASHINGTON, D.C. ( The Adobo Chronicles, Washington Bureau) – Billy Don “Bill” Moyers is an American journalist and political commentator w

In [3]:
df_real = pd.read_csv("covid_real.csv")
df_fake = pd.read_csv("covid_fake.csv")
print(df_real.shape)
print(df_fake.shape)

(5394, 7)
(4825, 7)


In [4]:
df_fake['Original Set'].unique()

array(['fake'], dtype=object)

In [5]:

df_covid = df_real.append(df_fake)
df_covid = df_covid.sample(frac=1, random_state = 24).reset_index(drop=True)

print(df_covid.shape)

(10219, 7)


In [6]:
train_covid = df_covid.head(8219)
test_covid = df_covid.tail(2000)
print(train_covid.shape)
print(test_covid.shape)

(8219, 7)
(2000, 7)


In [7]:


train_data = [{'Text': text, 'Original Set': type_data } for text in list(train_covid['Text']) for type_data in list(train_covid['Original Set'])]
test_data = [{'Text': text, 'Original Set': type_data } for text in list(test_covid['Text']) for type_data in list(test_covid['Original Set'])]
import dill

dill.dump_session('notebook_env.db')
#dill.load_session('notebook_env.db')

In [2]:
import dill
dill.load_session('notebook_env.db')

67551961

In [None]:
train_texts, train_labels = list(zip(*map(lambda d: (d['Text'], d['Original Set']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['Text'], d['Original Set']), test_data)))
print(train_texts)
dill.dump_session('notebook_env.db')

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))

train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))
dill.dump_session('notebook_env.db')

In [None]:
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")


train_y = np.array(train_labels) == 'fake'
test_y = np.array(test_labels) == 'fake'
dill.dump_session('notebook_env.db')

In [None]:
# Run until this cell

In [None]:
#
#
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

BATCH_SIZE = 10
EPOCHS = 1


train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()
train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)


bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

print(len(train_dataloader))
print(len(test_dataloader))

dill.dump_session('notebook_env.db')

In [15]:
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x160ecca30>

In [17]:
counter=0
counter_limit=10
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        counter=counter+1
        if counter>counter_limit:
            break
        token_ids, masks, labels = tuple(t for t in batch_data)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))
counter=0
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        counter=counter+1
        if counter>counter_limit:
            break
        token_ids, masks, labels = tuple(t for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        


Epoch:  1
0/4000.0 loss: 0.7146689891815186 
Epoch:  1
1/4000.0 loss: 0.728107213973999 
Epoch:  1
2/4000.0 loss: 0.714583953221639 
Epoch:  1
3/4000.0 loss: 0.7215386778116226 
Epoch:  1
4/4000.0 loss: 0.7231969475746155 
Epoch:  1
5/4000.0 loss: 0.719575564066569 
Epoch:  1
6/4000.0 loss: 0.7115100281579154 
Epoch:  1
7/4000.0 loss: 0.7147717177867889 
Epoch:  1
8/4000.0 loss: 0.7098198864195082 
Epoch:  1
9/4000.0 loss: 0.7059879243373871 


ValueError: Found input variables with inconsistent numbers of samples: [10000, 100]

In [24]:
print(classification_report(test_y[:100], bert_predicted))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00        47
        True       0.53      1.00      0.69        53

    accuracy                           0.53       100
   macro avg       0.27      0.50      0.35       100
weighted avg       0.28      0.53      0.37       100



  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
test_y[:100].shape

(100,)