In [1018]:
import re

import numpy as np
import pandas as pd

import tensorflow as rf  # original alias (looks like a typo for `tf`); kept so nothing that references `rf` breaks
import tensorflow as tf  # alias the notebook code actually uses (e.g. tf.convert_to_tensor)
from tensorflow.keras.layers import Dense, Input,Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

import transformers
from transformers import BertTokenizer,TFBertModel
from transformers import DistilBertTokenizer,TFDistilBertModel,DistilBertConfig,TFBertForPreTraining
from transformers import PreTrainedTokenizer

from sklearn.model_selection import train_test_split

In [1013]:
#removes HTML tags and backslashes;
#converts to lower case; 
#converts 'positive' -> 1 , 'negative' -> 0


class PreProcess:
    """Clean a review dataframe before tokenization.

    The frame is expected to carry a text column ``'review'`` and a label
    column ``'sentiment'``; both are rewritten by :meth:`pre_process`.
    """

    def __init__(self, df):
        # Stored by reference: pre_process() overwrites columns of this very frame.
        self.df = df

    def pre_process(self):
        """Normalize the review text and binarize the sentiment labels.

        Steps, applied to ``self.df`` in place:
          1. strip HTML tags from ``'review'``
          2. drop backslashes from ``'review'``
          3. lower-case ``'review'``
          4. map ``'sentiment'`` to 1 for ``'positive'`` and 0 for anything else

        Returns
        -------
        The same dataframe object that was passed to the constructor.

        Note
        ----
        Not idempotent: on a second call the labels are already integers,
        none equals ``'positive'``, so every label becomes 0.
        """
        reviews = self.df['review']

        # regex=True is spelled out because the default of Series.str.replace
        # is regex=False in pandas >= 2.0, which would treat these patterns as
        # literal text and leave the tags in place.
        reviews = reviews.str.replace(r'<[^<]+?>', '', regex=True)
        reviews = reviews.str.replace(r"\\", '', regex=True)  # regex for one literal backslash
        self.df['review'] = reviews.str.lower()

        # Vectorized equivalent of "1 if x == 'positive' else 0".
        self.df['sentiment'] = self.df['sentiment'].eq('positive').astype('int64')
        return self.df
    

In [1037]:
#Tokenizes the 'review' column and returns the input_ids / attention_mask tensors


class Tokenizer:
    """Encode the ``'review'`` column of a dataframe with a DistilBERT tokenizer.

    Parameters
    ----------
    df : dataframe with a ``'review'`` column.
    model : checkpoint name or path handed to ``DistilBertTokenizer.from_pretrained``.
    max_length : number of tokens every encoded review is truncated/padded to.
    """

    def __init__(self, df, model='distilbert-base-uncased', max_length=128):
        self.df = df
        self.max_length = max_length
        self.model = model
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model)

    def get_dict(self):
        """Encode each review and return a Series with one plain dict per row."""

        def helper(sentence):
            # add_special_tokens    : add the special tokens ([CLS], [SEP], ...)
            # max_length            : target length of every encoded sentence
            # padding='max_length'  : pad shorter sentences up to max_length
            #                         (replaces the deprecated pad_to_max_length=True)
            # truncation=True       : cut longer sentences down to max_length explicitly
            #                         instead of relying on the tokenizer's fallback
            # return_attention_mask : also return the mask that tells real tokens
            #                         from padding
            encoding = self.tokenizer.encode_plus(
                sentence,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
            )
            return dict(encoding)

        # str(x) keeps non-string cells (e.g. NaN) from breaking the tokenizer.
        return self.df['review'].apply(lambda x: helper(str(x)))

    def tokenize(self):
        """Return ``(input_ids, attention_mask)`` as two int32 tensors.

        Each tensor has one row per review; row length is whatever the
        tokenizer produced (``max_length`` with the padding/truncation
        settings used in :meth:`get_dict`).
        """
        encodings = self.get_dict()
        input_ids = [enc['input_ids'] for enc in encodings]
        attention_mask = [enc['attention_mask'] for enc in encodings]

        # dtype is pinned so an empty frame does not silently yield float tensors.
        return (
            tf.convert_to_tensor(input_ids, dtype=tf.int32),
            tf.convert_to_tensor(attention_mask, dtype=tf.int32),
        )
        


In [1023]:
def get_model(max_length=128, model_name='distilbert-base-uncased'):
    """Build a binary classifier on top of a pretrained DistilBERT encoder.

    Parameters
    ----------
    max_length : int, default 128
        Sequence length of both inputs. Must equal the ``max_length`` used by
        ``Tokenizer`` (whose default is also 128). Previously this was read
        from an undefined global, so the function failed on a fresh kernel.
    model_name : str, default 'distilbert-base-uncased'
        Checkpoint passed to ``TFDistilBertModel.from_pretrained``.

    Returns
    -------
    An uncompiled Keras ``Model`` taking ``[input_ids, attention_mask]`` and
    producing a single sigmoid output per example.
    """
    trans = TFDistilBertModel.from_pretrained(model_name)

    # Input names mirror the keys produced by the tokenizer.
    input_ids = Input(shape=(max_length,), dtype='int32', name='input_ids')
    input_attention_masks = Input(shape=(max_length,), dtype='int32', name='attention_mask')

    # First element of the transformer output -- presumably the per-token
    # hidden states; confirm against the installed transformers version.
    bert_layer = trans([input_ids, input_attention_masks])[0]

    # Collapse the encoder output into one vector per example for the dense head.
    flat_layer = Flatten()(bert_layer)

    output_layer = Dense(1, activation='sigmoid')(flat_layer)

    return Model(inputs=[input_ids, input_attention_masks], outputs=output_layer)

In [1056]:
# Small sample for quick iteration; raise N_ROWS (or set it to None to read everything).
N_ROWS = 10
# Fixed seed so the train/test split is the same on every Restart & Run All.
SEED = 42

df=pd.read_csv('IMDB Dataset.csv',nrows=N_ROWS)

#Preprocess Dataframe
pp=PreProcess(df)
df=pp.pre_process()

#train test split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=SEED)

#Labels
y_train,y_test=df_train.sentiment,df_test.sentiment

#Tokenize: each call returns an (input_ids, attention_mask) pair of tensors
token_train=Tokenizer(df_train).tokenize()
token_test=Tokenizer(df_test).tokenize()

In [None]:
model=get_model()
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras optimizers.
model.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
model.fit(token_train,y_train,validation_split=0.05,epochs=1,batch_size=1)

In [1052]:
# Sigmoid outputs of the model -- presumably shape (n_samples, 1) given the Dense(1) head.
y_prob=model.predict(token_test)
# Threshold at 0.5 in one vectorized step and flatten to a 1-D array of 0/1 labels.
y_pred=(np.asarray(y_prob)>=0.5).astype(int).ravel()