In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install trax

In [None]:
import trax
import trax.layers as tl
import trax.fastmath.numpy as numpy

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
import random
import itertools

In [None]:
# Load the two source CSVs (paths are relative to the working directory):
# Fake.csv goes into `fake`, True.csv into `real`.
fake = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')
real = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')

In [None]:
# Preview the first rows of the fake-news frame.
fake.head()

In [None]:
# (rows, columns) of the fake-news frame.
fake.shape

In [None]:
# (rows, columns) of the real-news frame.
real.shape

In [None]:
# Number of rows per `subject` value in the fake-news frame.
fake.subject.value_counts()

In [None]:
# Number of rows per `subject` value in the real-news frame.
real.subject.value_counts()

In [None]:
# Add the binary target column: 1 marks rows loaded from Fake.csv,
# 0 marks rows loaded from True.csv.
fake['isfake'] =1
real['isfake'] = 0

In [None]:
# Stack both frames into one. The fake rows come first because `fake` is listed
# first; ignore_index=True gives the result a fresh 0..n-1 index.
data = pd.concat([fake,real],ignore_index=True)
data.head()

In [None]:
# Show the `text` field of the first row.
data.loc[0,'text']

In [None]:
# Merge the headline into the article body so the model sees both. A single
# space is inserted between them: plain concatenation glues the last word of the
# title to the first word of the text, and the tokenizer then sees one bogus token.
data['text'] = data['title'] + ' ' + data['text']
# Drop the columns that are not used below.
# NOTE: this cell cannot be re-run on its own; `title` is gone after the drop.
data.drop(['title','subject','date'],axis=1,inplace=True)
data.head()

In [None]:
# Pull the combined text column out as a plain Python list for the tokenizer.
x = list(data['text'].values)

In [None]:
# Fit a Keras tokenizer on the whole corpus. num_words=75000 caps the vocabulary
# used when encoding; words outside it are encoded as the '<UNK>' token.
# NOTE(review): the tokenizer is fitted before the train/test split, so the
# vocabulary also reflects the held-out texts.
tokenizer = Tokenizer(num_words=75000,oov_token='<UNK>')
tokenizer.fit_on_texts(x)
# Rebind x: each document is replaced by its list of integer token ids.
x = tokenizer.texts_to_sequences(x)

In [None]:
# List the entries available in the fitted tokenizer's configuration dict.
config = tokenizer.get_config()
config.keys()

In [None]:
# Number of token ids in the first encoded document.
len(x[0])

In [None]:
# Mean number of token ids per encoded document.
sum(map(len, x)) / len(x)

In [None]:
# Bring every sequence to exactly 600 ids: shorter ones are padded at the end,
# longer ones are cut at the end (padding='post', truncating='post').
x = pad_sequences(x,maxlen=600,padding='post',truncating='post')

In [None]:
# Shape of the padded id matrix.
x.shape

In [None]:
# Labels as a plain list, taken from the same frame (and row order) that x was built from.
targets = list(data['isfake'].values)

In [None]:
def data_generator(data,targets,batch_size,shuffle=False):
    """Yield batches of ``(inputs, targets, masks)`` forever.

    Samples are visited in index order, or in a freshly shuffled order on every
    pass when ``shuffle`` is true. Batches are filled across pass boundaries:
    the batch that reaches the end of ``data`` is completed with the first
    samples of the next pass, so every batch has exactly ``batch_size`` rows.

    Args:
        data: Indexable collection of input examples (needs ``len`` and ``[]``).
        targets: Indexable collection of labels aligned with ``data``.
        batch_size: Number of examples per batch; must be a positive integer.
        shuffle: If true, the visiting order is shuffled with ``random.shuffle``
            before the first pass and again at the start of every later pass.

    Yields:
        Tuple ``(batch_inputs, batch_targets, masks)`` of arrays built with
        ``numpy.array`` (whatever module is bound to ``numpy`` in this
        notebook); ``masks`` is all ones with the shape of ``batch_targets``.

    Raises:
        ValueError: On the first ``next()`` call, if ``batch_size`` is not
            positive, ``data`` is empty, or ``data`` and ``targets`` differ
            in length.
    """
    n_samples = len(data)

    # Fail loudly instead of looping forever (batch_size <= 0 never fills a
    # batch) or dying with a bare IndexError (empty data / short targets).
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))
    if n_samples == 0:
        raise ValueError("data must contain at least one example")
    if len(targets) != n_samples:
        raise ValueError(
            "data and targets must have the same length, got {} and {}".format(n_samples, len(targets))
        )

    order = list(range(n_samples))
    if shuffle:
        random.shuffle(order)

    position = 0
    batch_inputs = []
    batch_targets = []

    while True:
        # End of a pass: start over, optionally in a new random order.
        if position >= n_samples:
            position = 0
            if shuffle:
                random.shuffle(order)

        sample = order[position]
        batch_inputs.append(data[sample])
        batch_targets.append(targets[sample])
        position += 1

        if len(batch_inputs) == batch_size:
            # Every example carries weight 1.
            masks = numpy.array(np.ones_like(batch_targets))
            yield numpy.array(batch_inputs), numpy.array(batch_targets), masks

            batch_inputs = []
            batch_targets = []

In [None]:
# Smoke-test the generator: draw one shuffled batch of 10 and display the
# (inputs, targets, masks) tuple.
next(data_generator(x,targets,10,shuffle=True))

In [None]:
# Hold out 20% of the examples for evaluation. random_state pins the shuffle so
# the same split, and therefore comparable metrics, is produced on every run.
x_train,x_test,y_train,y_test = train_test_split(x,targets,test_size=0.2,shuffle=True,random_state=42)

In [None]:
# Shapes of the training and evaluation input matrices.
print(x_train.shape)
print(x_test.shape)

In [None]:
def get_model(vocab_size=75000,d_model=512):
    """Build the classifier: embedding, mean over axis 1, 2-unit dense, log-softmax.

    Args:
        vocab_size: Passed to ``tl.Embedding`` as ``vocab_size``.
        d_model: Passed to ``tl.Embedding`` as ``d_feature``.

    Returns:
        A ``tl.Serial`` combinator holding the four layers in order.
    """
    layers = [
        tl.Embedding(vocab_size=vocab_size, d_feature=d_model),
        tl.Mean(axis=1),
        tl.Dense(n_units=2),
        tl.LogSoftmax(),
    ]
    return tl.Serial(*layers)

In [None]:
# Instantiate the classifier with the default sizes and print its layer structure.
model =get_model()
print(model)

In [None]:
from trax.supervised import training

In [None]:
# Show the kernel's working directory; the checkpoint is later loaded from a
# path relative to it ("./model.pkl.gz"). An explicit call replaces the bare
# `pwd` automagic, which only works while IPython automagic is enabled and no
# variable named `pwd` exists.
os.getcwd()

In [None]:
def train_model(model, data_generator, batch_size=32, x_train=x_train, y_train=y_train,x_test=x_test,y_test=y_test, n_steps=1, output_dir='/kaggle/working'):
    """Train ``model`` with a trax training loop and return the loop.

    Args:
        model: trax model to train.
        data_generator: Callable invoked as ``data_generator(x, y, batch_size=...)``
            that returns an iterator of batches.
        batch_size: Batch size forwarded to ``data_generator``.
        x_train, y_train: Training inputs and labels. The defaults are the
            notebook globals as they were when this ``def`` was executed;
            re-run this cell after changing the split.
        x_test, y_test: Evaluation inputs and labels (same caveat).
        n_steps: Number of steps passed to ``training_loop.run``.
        output_dir: Directory handed to ``training.Loop`` as ``output_dir``.

    Returns:
        The ``training.Loop`` instance after ``run`` has returned.
    """

    def _repeat_batches(features, labels):
        # Endless batch stream WITHOUT itertools.cycle: cycle keeps a reference
        # to every item it has passed on, so wrapping an already endless
        # generator in it never replays anything and only grows memory with
        # each step. Here the stream is simply re-created if it ever runs dry.
        while True:
            produced = False
            for batch in data_generator(features, labels, batch_size=batch_size):
                produced = True
                yield batch
            if not produced:
                # An empty stream ends immediately, as cycle() would.
                return

    # NOTE(review): the streams are created without a shuffle argument, so with
    # this notebook's data_generator every pass visits the samples in the same order.
    train_stream = _repeat_batches(x_train, y_train)
    eval_stream = _repeat_batches(x_test, y_test)

    train_task = training.TrainTask(
        labeled_data=train_stream,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.005),
        n_steps_per_checkpoint=100,
    )

    eval_task = training.EvalTask(
        labeled_data=eval_stream,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=30,
    )

    training_loop = training.Loop(model,
                                  train_task,
                                  eval_tasks=[eval_task],
                                  output_dir=output_dir)

    training_loop.run(n_steps=n_steps)

    return training_loop

In [None]:
# Train for 300 steps with the default batch size, data splits and output
# directory; keep the returned loop object.
training_loop = train_model(model,data_generator,n_steps=300)

In [None]:
# Build a fresh copy of the architecture and initialise it from model.pkl.gz,
# loading weights only. The path is relative to the current working directory.
# NOTE(review): presumably this is the checkpoint the training loop wrote into
# its output_dir; confirm that directory is the working directory.
new_model = get_model()
new_model.init_from_file(file_name="./model.pkl.gz", weights_only=True) 

In [None]:
# Classify one article from the dataset and compare the prediction with its label.
our_input = data.loc[2000,'text']
actual_output = data.loc[2000,'isfake']
# Encode exactly as the training data was encoded: token ids, then pad/truncate
# at the end to 600. Without this step the mean-pooled embedding is computed
# over a different mix of real and padding tokens than the model saw in training.
token_ids = tokenizer.texts_to_sequences([our_input])
tokenized_input = numpy.array(pad_sequences(token_ids,maxlen=600,padding='post',truncating='post'))
# The model ends in LogSoftmax, so its output is log-probabilities.
sentiment_log_probs = new_model(tokenized_input)
norm_log_probs = np.exp(sentiment_log_probs)  # back to probabilities
sentiment = np.argmax(norm_log_probs[0])  # predicted class index: 1 = fake, 0 = real
print('Input :\n"{}"\nThe result is: {}'.format(our_input, "fake" if sentiment else "real"))
print('Actual result is : {}'.format('fake' if actual_output else 'real'))