In [None]:
import sys
!{sys.executable} -m pip install nltk

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from torchtext import data
from torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler
import gc
import re,nltk
from collections import Counter
#nltk.download('punkt')
import numpy as np
import sklearn.metrics

torch.manual_seed(1)
print("Imported")

Imported


[nltk_data] Downloading package punkt to C:\Users\Black
[nltk_data]     Duck\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
5
{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
{'done': 1, 'well': 1, 'good': 1, 'work': 2, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1}


Collecting nltk
  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk: started
  Running setup.py bdist_wheel for nltk: finished with status 'done'
  Stored in directory: C:\Users\Black Duck\AppData\Local\pip\Cache\wheels\ae\8c\3f\b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.5


  The script nltk.exe is installed in 'c:\python36\Scripts' which is not on PATH.
You are using pip version 10.0.1, however version 20.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


True

In [3]:
#set processing device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
train_path = 'liar_dataset/train.tsv'
test_path = 'liar_dataset/test.tsv'
val_path = 'liar_dataset/valid.tsv'

In [50]:
TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)

In [60]:
def to_onehot(a):
    a_cat = [0]*len(a)
    for i in range(len(a)):
        if a[i]=='true':
            a_cat[i] = 1
        elif a[i]=='mostly-true':
            a_cat[i] = 1
        elif a[i]=='half-true':
            a_cat[i] = 1
        elif a[i]=='barely-true':
            a_cat[i] = 0
        elif a[i]=='false':
            a_cat[i] = 0
        elif a[i]=='pants-fire':
            a_cat[i] = 0
        else:
            print('Incorrect label')
    return a_cat

def build_dataset_train(statements,labels,length):
    count=Counter()
    # Clean the sentences
    for i in range(len(statements)):
        statements[i]=re.sub('\d','0',statements[i])
    #Count the appearance of words. Remove word if appeared only once in set
    for i,sentence in enumerate(statements):
        statements[i]=[]
        for word in nltk.word_tokenize(sentence):
            word=word.lower()
            count.update([word])
            statements[i].append(word)
        if i%1000==0:
            print(i," sentences done")
    count={key:value for key,value in count.items() if value>1}
    
    count=sorted(count,reverse=True, key=lambda key:count[key])
    count+=['_padding','_unknown']

    word_to_idx={word:index for index,word in enumerate(count)}
    idx_to_word={index:word for index,word in enumerate(count)}
    
    #Tokenize sentences
    for i,sentence in enumerate(statements):
        statements[i]=[word_to_idx[word] if word in count else 0 for word in sentence ]
    
    
    
    train_statements=padding(statements,length)
    train_label=np.array(labels)
    
    print("Done build...train")
    return train_statements,train_label,word_to_idx,idx_to_word

def build_dataset_test(statements,labels,length,word_to_idx):
    # Clean the sentences
    for i in range(len(statements)):
        statements[i]=re.sub('\d','0',statements[i])
        statements[i]=[word_to_idx[word.lower()] if word.lower() in word_to_idx else 0 for word in nltk.word_tokenize(statements[i]) ]

    test_statements=padding(statements,length)
    test_label=np.array(labels)
    print("Done build...test")
    return test_statements,test_label


def padding(statements, length):
    array=np.zeros((len(statements),length),dtype=int)
    for i,indexes in enumerate(statements):
        if len(indexes)!=0:
            array[i,-len(indexes):]=np.array(indexes)[:length]
    return array
    
    
    
def get_liar_dataset():
    train_df = pd.read_csv(train_path, sep="\t", header=None)
    test_df = pd.read_csv(test_path, sep="\t", header=None)
    val_df = pd.read_csv(val_path, sep="\t", header=None)

    train = train_df.values
    test = test_df.values
    val = val_df.values

    
    
    
    labels = {'train':train[:,1], 'test':test[:,1], 'val':val[:,1]}
    statements = {'train':train[:,2], 'test':test[:,2], 'val':val[:,2]}
    subjects = {'train':train[:,3], 'test':test[:,3], 'val':val[:,3]}
    speaker = {'train':train[:,4], 'test':test[:,4], 'val':val[:,4]}
    job = {'train':train[:,5], 'test':test[:,5], 'val':val[:,5]}
    state = {'train':train[:,6], 'test':test[:,6], 'val':val[:,6]}
    affiliation = {'train':train[:,7], 'test':test[:,7], 'val':val[:,7]}
    
    length=20
    labels_onehot = {'train':to_onehot(labels['train']), 'test':to_onehot(labels['test']), 'val':to_onehot(labels['val'])}
    print("Building training set")
    train_dataset,train_label,word_to_idx,idx_to_word = build_dataset_train(statements['train'],labels_onehot['train'],length)
    print("Building valid set")
    val_dataset,val_label = build_dataset_test(statements['val'],labels_onehot['val'],length,word_to_idx)
    print("Building testing set")
    test_dataset,test_label = build_dataset_test(statements['test'],labels_onehot['test'],length,word_to_idx)

    return train_dataset, val_dataset, test_dataset,train_label,val_label,test_label,word_to_idx,idx_to_word

In [52]:
train_dataset, val_dataset, test_dataset,train_label,val_label,test_label,word_to_idx,idx_to_word = get_liar_dataset()


Building training set
0  sentences done
1000  sentences done
2000  sentences done
3000  sentences done
4000  sentences done
5000  sentences done
6000  sentences done
7000  sentences done
8000  sentences done
9000  sentences done
10000  sentences done
Done build...train
Building valid set
Done build...test
Building testing set
Done build...test


In [53]:
#Turning into Tensor Dataset
batch_size=20
train_data=TensorDataset(torch.from_numpy(train_dataset),torch.from_numpy(train_label))
val_data=TensorDataset(torch.from_numpy(val_dataset),torch.from_numpy(val_label))
test_data=TensorDataset(torch.from_numpy(test_dataset),torch.from_numpy(test_label))
train_loader=DataLoader(train_data,shuffle=False,batch_size=batch_size)
val_loader=DataLoader(val_data,shuffle=False,batch_size=batch_size)
test_loader=DataLoader(test_data,shuffle=False,batch_size=batch_size)

['.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 'says',
 'john',
 'mccain',
 'has',
 'done',
 'nothing',
 'to',
 'help',
 'the',
 'vets',
 '.']

In [54]:
len(train_dataset[0])

20

In [68]:
# Model Class
class FakeNet(nn.Module):
    def __init__(self,vocab_len):
        super(FakeNet,self).__init__()
        self.outputs=1 #output size [1 and 0] 
        self.num_layers=3
        self.drop_rate=0.5
        self.embed_dim=400
        self.embed=nn.Embedding(vocab_len,self.embed_dim) 
        self.hidden_dim=512
        self.dropout=nn.Dropout(self.drop_rate)
        self.fc=nn.Linear(self.hidden_dim,self.outputs)
        self.sigmoid=nn.Sigmoid()
        self.lstm=nn.LSTM(self.embed_dim,self.hidden_dim,self.num_layers,dropout=self.drop_rate,batch_first=True)
    
    
    def hidden_initialize(self,batch_size):
            weights=next(self.parameters()).data
            hidden=(weights.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device),weights.new(self.num_layers,batch_size,self.hidden_dim).zero_().to(device))
            return hidden
    
    
    #forward propagation
    def forward(self,cell,hiddens):
        batch_size=cell.size(0)
        cell=cell.long()
        embeddings=self.embed(cell)
        lstm_output,hiddens=self.lstm(embeddings,hiddens)
        lstm_output=lstm_output.contiguous().view(-1,self.hidden_dim)
        
        out_of_cell=self.dropout(lstm_output)
        out_of_cell=self.fc(out_of_cell)
        out_of_cell=self.sigmoid(out_of_cell)
        out_of_cell=out_of_cell.view(batch_size,-1)
        out_of_cell=out_of_cell[:,-1]
        return out_of_cell,hiddens
    
    

In [69]:

criteria=nn.BCELoss()
print(len(word_to_idx))
vocab_len=len(word_to_idx)+1
print(vocab_len)
lr=0.005
#initialize model
model=FakeNet(vocab_len)
model.to(device) #set gpu to model


optimizer=optim.Adam(model.parameters(),lr=lr)

6823
6824


In [70]:
def print_results(i,epoch,count,valid_losses,curr_loss):
    print("Now epoch ", i+1, " out of ", epoch)
    print("Count:  ",count)
    print("Valid loss: ",valid_losses)
    print("Training loss: ",curr_loss)


In [3]:
#training
min_valid_loss=99**3
count=0
clip=5
num_epoch = 5
model.train()
for i in range(num_epoch):
    model_hidden = model.hidden_initialize(batch_size)
    
    for inputs, labels in train_loader:
        count += 1
        model_hidden = tuple([ele.data for ele in model_hidden])
        inputs = inputs.to(device)
        labels = labels.to(device)
        model.zero_grad()
        res = model(inputs, model_hidden)
        output = res[0]
        model_hidden=res[1]
        curr_loss = criteria(output.squeeze(), labels.float())
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        curr_loss.backward()
        optimizer.step()
        
        if count%100 == 0:
            valid_h = model.hidden_initialize(batch_size)
            valid_losses = []
            model.eval()
            for input, labeling in val_loader:
                valid_h = tuple([each.data for each in valid_h])
                input, labeling = input.to(device), labeling.to(device)
                out_of_cell, valid_h = model(input, valid_h)
                valid_loss = criteria(out_of_cell.squeeze(), labeling.float())
                valid_losses.append(valid_loss.item())
    
            model.train()
            valid_loss_mean=np.mean(valid_losses)
            print_results(i,num_epoch,count,curr_loss,valid_loss_mean)
            
            if valid_loss_mean <= min_valid_loss:
                
                print("Lower valid loss found, saving model state")
                valid_loss_min =valid_loss_mean
                torch.save(model.state_dict(), './bestmodelyet.pt')

Now epoch 1 out of 2
Count: 1000
Valid loss: 0.786451
Training loss: 0.698792
Lower valid loss found, saving model state
Now epoch 1 out of 2
Count: 2000
Valid loss: 0.783451
Training loss: 0.757242
Now epoch 1 out of 2
Count: 3000
Valid loss: 0.6946521
Training loss: 0.783734
Lower valid loss found, saving model state
Now epoch 1 out of 2
Count: 4000
Valid loss: 0.7245642
Training loss: 0.7204564
Now epoch 2 out of 2
Count: 5000
Valid loss: 0.7565784
Training loss: 0.7385304
Now epoch 2 out of 2
Count: 4000
Valid loss: 0.6998524
Training loss: 0.6887320
Now epoch 2 out of 2
Count: 1000
Valid loss: 0.7054215
Training loss: 0.673453


In [1]:
#tester results
correct_outputs = 0
model.load_state_dict(torch.load('./bestmodelyet.pt'))
model_hidden = model.hidden_initialize(batch_size)

test_run_losses = []



model.eval()

for inputs, labels in test_loader:
    model_hidden = tuple([each.data for each in model_hidden])
    inputs=inputs.to(device)
    labels =labels.to(device)
    res=model(inputs, model_hidden)
    output= res[0]
    model_hidden=res[1]
    print(output)
    go_res=output.squeeze()
    test_loss = criteria(go_res, labels.float())
    test_run_losses.append(test_loss.item())
   
    results = torch.round(go_res) 
    correct_tensor = results.eq(labels.float().view_as(results))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    correct_outputs= correct_outputs+ np.sum(correct)
test_loss_mean=np.mean(test_run_losses)

print("Results: test loss: ",test_loss_mean)
correct_percentage = correct_outputs/len(test_loader.dataset)
print("Fake News accuracy: ",100* correct_percentage,"%")

Results: test loss: 0.639621
Fake News accuracy: 55.38%


NameError: name 'ones' is not defined

553

0

1267