In [None]:
def find_sentiment(sentence, pos, neg):
    """
    Label a sentence by counting how many of its distinct words appear
    in a positive and in a negative word list.

    The sentence is split on whitespace only, so case and attached
    punctuation are kept as-is ("good," does not match "good").

    :param sentence: sentence, a string
    :param pos: set of positive words
    :param neg: set of negative words
    :return: "positive", "negative" or "neutral"
    """
    # distinct whitespace-separated tokens; repeated words count once
    unique_tokens = set(sentence.split())

    # number of distinct tokens shared with each word list
    pos_hits = len(unique_tokens.intersection(pos))
    neg_hits = len(unique_tokens.intersection(neg))

    # a tie (including no hits at all) is neutral
    if pos_hits == neg_hits:
        return "neutral"
    return "positive" if pos_hits > neg_hits else "negative"
    

In [None]:
# tokenization splits a sentence into a list of words
from nltk.tokenize import word_tokenize

sentence = "hi, how are you?"

# plain whitespace split: punctuation stays attached to the words ("hi,", "you?")
print(sentence.split())

# NLTK tokenizer on the same sentence, printed for comparison with the split above
print(word_tokenize(sentence))

In [None]:
# Classification problem: Bag of Words
# With bag of words we create a sparse matrix that stores the counts of all words
# in our corpus (corpus = all the documents = all the sentences).
# scikit-learn's CountVectorizer is used for this.
from sklearn.feature_extraction.text import CountVectorizer

# create a corpus of sentences
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works",
    "YES!!!!"
]

# default settings: no custom tokenizer is passed here
ctv = CountVectorizer()

# fit the vectorizer on corpus
ctv.fit(corpus)

# transform the same corpus with the fitted vectorizer
corpus_transformed = ctv.transform(corpus)



In [None]:
# show the result of ctv.transform(corpus) from the previous cell
print(corpus_transformed)

In [None]:
# show the vocabulary learned by ctv.fit(corpus)
# (presumably a token -> column index mapping; confirm against scikit-learn docs)
print(ctv.vocabulary_)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

# create a corpus of sentences
corpus = [
 "hello, how are you?",
 "im getting bored at home. And you? What do you think?",
 "did you know about counts",
 "let's see if this works!",
 "YES!!!!"
]
# initialize CountVectorizer with word_tokenize from nltk
# as the tokenizer; token_pattern is set to None because a custom
# tokenizer is supplied (presumably to silence the unused-pattern
# warning; confirm against scikit-learn docs)
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
# fit the vectorizer on corpus
ctv.fit(corpus)
# transform the same corpus with the fitted vectorizer
corpus_transformed = ctv.transform(corpus)
# print the learned vocabulary to compare with the default tokenizer above
print(ctv.vocabulary_)


In [None]:
# import what we need 
import pandas as pd

from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == "__main__":
    # load the reviews, add a placeholder fold id (-1) and shuffle the rows
    df = (
        pd.read_csv("../input/cleaned-imdb/imdb_clean.csv")
        .assign(kfold=-1)
        .sample(frac=1)
        .reset_index(drop=True)
    )
    y = df.sentiment.values

    # stratified 5-fold split: tag every row with the fold in which it is held out
    kf = model_selection.StratifiedKFold(n_splits=5)
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, "kfold"] = f

    for fold_ in range(5):
        # rows of the current fold are the test split, all others the train split
        held_out = df.kfold == fold_
        train_df = df[~held_out].reset_index(drop=True)
        test_df = df[held_out].reset_index(drop=True)

        # bag-of-words counts tokenized with NLTK; the vocabulary is learned
        # from the training split only
        # NOTE(review): this cell reads the `reviews` column while other cells
        # read `review` from the same CSV; confirm the column name
        count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
        count_vec.fit(train_df.reviews)
        xtrain = count_vec.transform(train_df.reviews)
        xtest = count_vec.transform(test_df.reviews)

        # logistic regression on the sparse count features
        model = linear_model.LogisticRegression()
        model.fit(xtrain, train_df.sentiment)

        # score the held-out fold
        preds = model.predict(xtest)
        accuracy = metrics.accuracy_score(test_df.sentiment, preds)

        print(f"Fold: {fold_}")
        print(f"Accuracy = {accuracy}")
        print("")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
# create a corpus of sentences
corpus = [
    "hello, how are you?",
 "im getting bored at home. And you? What do you think?",
 "did you know about counts",
 "let's see if this works!",
 "YES!!!!"
]
# initialize TfidfVectorizer with word_tokenize from nltk
# as the tokenizer (token_pattern=None because a custom tokenizer is given)
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# fit the vectorizer on corpus
tfv.fit(corpus)
# transform the same corpus with the fitted vectorizer
corpus_transformed = tfv.transform(corpus)
# print the transformed corpus (the next cell notes the values are floats)
print(corpus_transformed)


In [None]:
"""
We see that instead of integer values, this time we get floats. Replacing
CountVectorizer with TfidfVectorizer is also a piece of cake. Scikit-learn also offers
TfidfTransformer. If you have count values, you can use TfidfTransformer and get
the same behaviour as TfidfVectorizer. 
"""
# import what we need
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer

 # we go over the folds created
for fold_ in range(5):
    # rows of the current fold are the test split, all others the train split
    # (relies on `df` with a `kfold` column created in an earlier cell)
    held_out = df.kfold == fold_
    train_df = df[~held_out].reset_index(drop=True)
    test_df = df[held_out].reset_index(drop=True)

    # TF-IDF features tokenized with NLTK's word_tokenize; fitted on the
    # training reviews only, then applied to both splits
    # NOTE(review): reads `reviews` while other cells read `review`; confirm
    tfidf_vec = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
    tfidf_vec.fit(train_df.reviews)
    xtrain = tfidf_vec.transform(train_df.reviews)
    xtest = tfidf_vec.transform(test_df.reviews)

    # logistic regression on the TF-IDF features
    model = linear_model.LogisticRegression()
    model.fit(xtrain, train_df.sentiment)

    # class predictions for the held-out fold and their accuracy
    preds = model.predict(xtest)
    accuracy = metrics.accuracy_score(test_df.sentiment, preds)

    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")


In [None]:
"""
Another interesting concept in NLP is n-grams. N-grams are combinations of
words in order. N-grams are easy to create. You just need to take care of the order.
To make things even more comfortable, we can use n-gram implementation from
NLTK.
"""

In [None]:
from nltk import ngrams
from nltk.tokenize import word_tokenize
# let's see 3-grams: every run of N consecutive tokens
N = 3
# input sentence
sentence = "hi, how are you?"
# tokenized sentence
tokenized_sentence = word_tokenize(sentence)
# generate n_grams; wrapped in list() so the result can be printed
n_grams = list(ngrams(tokenized_sentence, N))
print(n_grams)


In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer

 # we go over the folds created
for fold_ in range(5):
    # rows of the current fold are the test split, all others the train split
    # (relies on `df` with a `kfold` column created in an earlier cell)
    held_out = df.kfold == fold_
    train_df = df[~held_out].reset_index(drop=True)
    test_df = df[held_out].reset_index(drop=True)

    # TF-IDF over unigrams, bigrams and trigrams (ngram_range=(1, 3)),
    # tokenized with NLTK's word_tokenize and fitted on the training reviews only
    # NOTE(review): reads `reviews` while other cells read `review`; confirm
    tfidf_vec = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None,
        ngram_range=(1, 3),
    )
    tfidf_vec.fit(train_df.reviews)
    xtrain = tfidf_vec.transform(train_df.reviews)
    xtest = tfidf_vec.transform(test_df.reviews)

    # logistic regression on the n-gram TF-IDF features
    model = linear_model.LogisticRegression()
    model.fit(xtrain, train_df.sentiment)

    # class predictions for the held-out fold and their accuracy
    preds = model.predict(xtest)
    accuracy = metrics.accuracy_score(test_df.sentiment, preds)

    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

In [None]:
# Stemming and lemmatization
# Both reduce a word to a smaller base form: stemming produces a stem,
# lemmatization produces a lemma. The loop below prints both for each word.
# NOTE(review): the original note here said lemmatization is the more
# aggressive of the two; stemming is usually described as the cruder,
# rule-based one. Confirm before relying on either statement.
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
# initialize lemmatizer
lemmatizer = WordNetLemmatizer()
# initialize stemmer for English
stemmer = SnowballStemmer("english")
words = ["fishing", "fishes", "fished"]
for word in words:
    print(f"word={word}")
    print(f"stemmed_word={stemmer.stem(word)}")
    # lemmatize() is called without a part-of-speech argument
    print(f"lemma={lemmatizer.lemmatize(word)}")
    print("")



In [None]:
## One more topic that you should be aware of is topic extraction. Topic extraction
## can be done using non-negative matrix factorization (NMF) or latent semantic
## analysis (LSA), which is also popularly known as singular value decomposition or
## SVD. These are decomposition techniques that reduce the data to a given number
## of components. You can fit any of these on sparse matrix obtained from
## CountVectorizer or TfidfVectorizer. 
##
##
##
##
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer
# create a corpus of sentences
# we read only 10k samples from training data
# for this example
corpus = pd.read_csv("../input/cleaned-imdb/imdb_clean.csv", nrows=10000)
corpus = corpus.review.values
# initialize TfidfVectorizer with word_tokenize from nltk
# as the tokenizer
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# fit the vectorizer on corpus
tfv.fit(corpus)
# transform the corpus using tfidf
corpus_transformed = tfv.transform(corpus)
# initialize SVD with 10 components
svd = decomposition.TruncatedSVD(n_components=10)
# fit SVD (fit returns the fitted estimator itself)
corpus_svd = svd.fit(corpus_transformed)
# index of the SVD component to inspect
# (it indexes components_, not rows of the corpus)
sample_index = 0
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back for older releases
feature_names = (
    tfv.get_feature_names_out()
    if hasattr(tfv, "get_feature_names_out")
    else tfv.get_feature_names()
)
# map every vocabulary term to its weight in the chosen component
feature_scores = dict(
    zip(
        feature_names,
        corpus_svd.components_[sample_index]
    )
)
# print the N terms with the largest weight in that component
N = 5
print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])


In [None]:
# number of top terms to show per component
N = 5
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back for older releases.
# The vocabulary is the same for every component, so look it up once.
feature_names = (
    tfv.get_feature_names_out()
    if hasattr(tfv, "get_feature_names_out")
    else tfv.get_feature_names()
)
# inspect the first five SVD components
for sample_index in range(5):
    # map every vocabulary term to its weight in this component
    feature_scores = dict(
        zip(
            feature_names,
            corpus_svd.components_[sample_index]
        )
    )
    # print the N terms with the largest weight
    print(
        sorted(
            feature_scores,
            key=feature_scores.get,
            reverse=True
        )[:N]
    )

In [None]:
# clean data to make sense
import re
import string
def clean_text(s):
    """
    Normalize whitespace and strip punctuation from a string.

    :param s: string
    :return: cleaned string
    """
    # splitting on any whitespace and re-joining with one space removes
    # tabs, newlines, repeated spaces and leading/trailing whitespace
    normalized = " ".join(s.split())

    # delete every character listed in string.punctuation; this runs after
    # the whitespace step, so "a - b" keeps both surrounding spaces
    # you can add more cleaning here if you want
    return normalized.translate(str.maketrans("", "", string.punctuation))


In [None]:
# “hi, how are you????” to “hi how are you”

In [None]:
import pandas as pd

# read the first 10k rows of the dataset
corpus = pd.read_csv("../input/cleaned-imdb/imdb_clean.csv", nrows=10000)
# run clean_text on every value of the `review` column and store the
# result in the `reviews` column
corpus.loc[:, "reviews"] = corpus.review.apply(clean_text)

In [None]:
# You can make it even better by removing stopwords in your cleaning function. What are
# stopwords? These are high-frequency words that exist in every language. For
# example, in the English language, these words are “a”, “an”, “the”, “for”, etc.

## word embeddings

So far, we have converted tokens into numbers: if there are N unique tokens in a
given corpus, they can be represented by integers ranging from 0 to N-1. Now we will
represent these integer tokens with vectors. This representation of words as
vectors is known as word embeddings or word vectors. Google’s Word2Vec is one
of the oldest approaches to converting words into vectors. We also have FastText from
Facebook and GloVe (Global Vectors for Word Representation) from Stanford.
These approaches are quite different from each other.

In [None]:
import numpy as np
def sentence_to_vec(s, embedding_dict, stop_words, tokenizer):
    """
    Given a sentence and other information,
    this function returns embedding for the whole sentence.

    The sentence vector is the sum of the embeddings of all kept tokens,
    scaled to unit L2 norm. A zero vector is returned when no token has an
    embedding, or when the embeddings sum to zero (the original divided by
    zero there and returned NaNs).

    :param s: sentence, string (anything else is passed through str())
    :param embedding_dict: dictionary word:vector
    :param stop_words: list of stop words, if any (None is treated as empty)
    :param tokenizer: a tokenization function, string -> list of tokens
    :return: 1-d numpy array with the sentence embedding
    """
    # convert sentence to string, lowercase it and tokenize it
    words = tokenizer(str(s).lower())

    # a set makes the stop-word membership test cheap
    stop_words = set(stop_words) if stop_words else set()

    # keep purely alphabetic tokens (str.isalpha rejects digits and
    # punctuation) that are not stop words and have a known embedding
    vectors = [
        embedding_dict[w]
        for w in words
        if w.isalpha() and w not in stop_words and w in embedding_dict
    ]

    # if we dont have any vectors, return zeros with the embedding width;
    # 300 is kept as the fallback when the width cannot be read
    if not vectors:
        try:
            dim = len(next(iter(embedding_dict.values())))
        except (AttributeError, StopIteration, TypeError):
            dim = 300
        return np.zeros(dim)

    # sum the embeddings over the token axis
    v = np.asarray(vectors).sum(axis=0)

    # L2 norm of the summed vector
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # embeddings cancelled out; avoid 0/0 -> NaN
        return np.zeros_like(v, dtype=float)

    # return normalized vector
    return v / norm


In [None]:
# fasttext.py
import io
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
def load_vectors(fname):
    """
    Load word vectors from a text file into a dictionary.

    The first line is a header holding two integers; every following line
    is a token followed by its space-separated float components.

    :param fname: path to the vectors file
    :return: dictionary token:list of floats
    :raises ValueError: if the header or a component cannot be parsed
    """
    # taken from: https://fasttext.cc/docs/en/english-vectors.html
    data = {}
    # the context manager closes the file even if parsing fails
    with io.open(
        fname,
        'r',
        encoding='utf-8',
        newline='\n',
        errors='ignore'
    ) as fin:
        # header: two integers; they are only parsed, not used afterwards
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [None]:
# sentence_to_vec(s, embedding_dict, stop_words, tokenizer) is defined in an
# earlier cell of this notebook. The body-less `def` stub that stood here was
# a syntax error and has been removed.

if __name__ == "__main__":
    # read the training data
    df = pd.read_csv("../input/cleaned-imdb/imdb_clean.csv")
    # map positive to 1 and negative to 0
    df.sentiment = df.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )
    # the next step is to randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)
    # load embeddings into memory
    print("Loading embeddings")
    embeddings = load_vectors("../input/crawl-300d-2M.vec")
    # create sentence embeddings, one vector per review
    print("Creating sentence vectors")
    vectors = []
    for review in df.review.values:
        vectors.append(
            sentence_to_vec(
                s=review,
                embedding_dict=embeddings,
                stop_words=[],
                tokenizer=word_tokenize
            )
        )

    # stack the per-review vectors into one 2-d array
    vectors = np.array(vectors)
    # fetch labels
    y = df.sentiment.values

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # iterate over the folds; t_ and v_ are train / validation row indices
    for fold_, (t_, v_) in enumerate(kf.split(X=vectors, y=y)):
        print(f"Training fold: {fold_}")
        # train and test arrays for this fold
        xtrain = vectors[t_, :]
        ytrain = y[t_]
        xtest = vectors[v_, :]
        ytest = y[v_]
        # initialize logistic regression model
        model = linear_model.LogisticRegression()
        # fit the model on training sentence vectors and sentiment
        model.fit(xtrain, ytrain)
        # make class predictions on test data
        preds = model.predict(xtest)
        # calculate accuracy
        accuracy = metrics.accuracy_score(ytest, preds)
        print(f"Accuracy = {accuracy}")
        print("")


# LSTM

In [None]:
# import pandas and model_selection module of scikit-learn
import pandas as pd
from sklearn import model_selection
if __name__ == "__main__":
    # Read training data
    df = pd.read_csv("../input/cleaned-imdb/imdb_clean.csv")

    # map positive to 1 and negative to 0
    df.sentiment = df.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )

    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1

    # the next step is to randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)

    # fetch labels
    y = df.sentiment.values

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column: rows in validation split v_ get fold id f
    # (the loop body was not indented in the original, which is a syntax error)
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f

    # save the new csv with kfold column
    df.to_csv("../input/imdb_folds.csv", index=False)


In [None]:
import torch
class IMDBDataset:
    """
    Dataset wrapper that serves (review, target) pairs as torch tensors.

    Implements __len__ and __getitem__ and is passed to
    torch.utils.data.DataLoader by the training script.
    """

    def __init__(self, reviews, targets):
        """
        :param reviews: this is a numpy array; it is indexed as
            reviews[item, :], so it must be 2-d
        :param targets: a vector, numpy array
        """
        self.reviews = reviews
        self.target = targets

    def __len__(self):
        # returns length of the dataset
        return len(self.reviews)

    def __getitem__(self, item):
        # for any given item, which is an int,
        # return review and targets as torch tensor
        # item is the index of the item in concern
        review = self.reviews[item, :]
        target = self.target[item]
        return {
            "review": torch.tensor(review, dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.float),
        }


In [None]:
import torch
import torch.nn as nn
class LSTM(nn.Module):
    """
    Bidirectional LSTM classifier on top of a frozen, pre-trained embedding
    layer. Returns one raw (pre-sigmoid) score per input sequence.
    """

    def __init__(self, embedding_matrix):
        """
        :param embedding_matrix: numpy array with vectors for all words
        """
        super(LSTM, self).__init__()
        # number of words = number of rows in embedding matrix
        num_words = embedding_matrix.shape[0]
        # dimension of embedding is num of columns in the matrix
        embed_dim = embedding_matrix.shape[1]

        # we define an input embedding layer
        self.embedding = nn.Embedding(
            num_embeddings=num_words,
            embedding_dim=embed_dim
        )
        # embedding matrix is used as weights of
        # the embedding layer
        self.embedding.weight = nn.Parameter(
            torch.tensor(
                embedding_matrix,
                dtype=torch.float32
            )
        )
        # we dont want to train the pretrained embeddings
        self.embedding.weight.requires_grad = False

        # a simple bidirectional LSTM with
        # hidden size of 128
        self.lstm = nn.LSTM(
            embed_dim,
            128,
            bidirectional=True,
            batch_first=True,
        )

        # output layer which is a linear layer
        # we have only one output
        # input (512) = 128 + 128 for mean and same for max pooling
        self.out = nn.Linear(512, 1)

    def forward(self, x):
        """
        :param x: tensor of token indices; batch_first=True is set on the
            LSTM and pooling is over dim 1, so (batch, sequence) is expected
        :return: tensor produced by the final linear layer (one column)
        """
        # pass data through embedding layer
        # the input is just the tokens
        x = self.embedding(x)

        # move embedding output to lstm
        x, _ = self.lstm(x)

        # apply mean and max pooling on lstm output
        avg_pool = torch.mean(x, 1)
        max_pool, _ = torch.max(x, 1)

        # concatenate mean and max pooling
        # this is why size is 512
        # 128 for each direction = 256
        # avg_pool = 256 and max_pool = 256
        out = torch.cat((avg_pool, max_pool), 1)

        # pass through the output layer and return the linear output
        out = self.out(out)
        return out


In [None]:
import torch
import torch.nn as nn
def train(data_loader, model, optimizer, device):
    """
    Train the model for one pass over the data loader.

    :param data_loader: iterable of dicts with "review" and "target" tensors
    :param model: torch model returning one raw score per review
    :param optimizer: torch optimizer built over the model parameters
    :param device: torch device the batches are moved to
    """
    # set model to training mode
    model.train()

    # go through batches of data in data loader
    for data in data_loader:
        # fetch review and target from the dict
        reviews = data["review"]
        targets = data["target"]

        # move the data to device that we want to use
        reviews = reviews.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        # clear the gradients
        optimizer.zero_grad()

        # make predictions from the model
        predictions = model(reviews)

        # calculate the loss; targets are reshaped to a column
        # so they match the (batch, 1) predictions
        loss = nn.BCEWithLogitsLoss()(
            predictions,
            targets.view(-1, 1)
        )

        # compute gradient of loss w.r.t.
        # all parameters of the model that are trainable
        loss.backward()

        # single optimization step
        optimizer.step()
def evaluate(data_loader, model, device):
    """
    Run the model over the data loader without computing gradients.

    :param data_loader: iterable of dicts with "review" and "target" tensors
    :param model: torch model returning one raw score per review
    :param device: torch device the reviews are moved to
    :return: (predictions, targets) as python lists; predictions are the
        raw model outputs, nested one list per review
    """
    # initialize empty lists to store predictions
    # and targets
    final_predictions = []
    final_targets = []

    # put the model in eval mode
    model.eval()

    # disable gradient calculation
    with torch.no_grad():
        for data in data_loader:
            reviews = data["review"].to(device, dtype=torch.long)

            # make predictions
            predictions = model(reviews)

            # move predictions and targets to list
            # predictions have to come back to cpu first; targets are read
            # straight from the batch, so they never need to visit the device
            predictions = predictions.cpu().numpy().tolist()
            targets = data["target"].cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(targets)

    # return final predictions and targets
    return final_predictions, final_targets
    

In [None]:
import io
import torch
import numpy as np
import pandas as pd
# yes, we use tensorflow
# but not for training the model!
import tensorflow as tf
from sklearn import metrics
import config
import dataset
import engine
import lstm
def load_vectors(fname):
    """
    Load word vectors from a text file into a dictionary.

    The first line is a header holding two integers; every following line
    is a token followed by its space-separated float components.

    :param fname: path to the vectors file
    :return: dictionary token:list of floats
    :raises ValueError: if the header or a component cannot be parsed
    """
    # taken from: https://fasttext.cc/docs/en/english-vectors.html
    data = {}
    # the context manager closes the file even if parsing fails
    with io.open(
        fname,
        'r',
        encoding='utf-8',
        newline='\n',
        errors='ignore'
    ) as fin:
        # header: two integers; they are only parsed, not used afterwards
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data
def create_embedding_matrix(word_index, embedding_dict, embedding_dim=300):
    """
    This function creates the embedding matrix.
    :param word_index: a dictionary with word:index_value
    :param embedding_dict: a dictionary with word:embedding_vector
    :param embedding_dim: width of the embedding vectors (default 300,
        the value that was hard-coded before)
    :return: a numpy array with embedding vectors for all known words
    """
    # initialize matrix with zeros; one more row than there are words,
    # so index values up to len(word_index) are valid rows
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    # loop over all the words
    for word, i in word_index.items():
        # if word is found in pre-trained embeddings,
        # update the matrix. if the word is not found,
        # the vector is zeros!
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]

    # return embedding matrix
    return embedding_matrix
def run(df, fold):
    """
    Run training and validation for a given fold
    and dataset
    :param df: pandas dataframe with kfold column
    :param fold: current fold, int
    """
    # fetch training dataframe
    train_df = df[df.kfold != fold].reset_index(drop=True)

    # fetch validation dataframe
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    print("Fitting tokenizer")
    # we use tf.keras for tokenization
    # you can use your own tokenizer and then you can
    # get rid of tensorflow
    # note: the tokenizer is fitted on all reviews in df, not only on train_df
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    # convert training data to sequences
    # for example : "bad movie" gets converted to
    # [24, 27] where 24 is the index for bad and 27 is the
    # index for movie
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)

    # similarly convert validation data to
    # sequences
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

    # zero pad the training sequences given the maximum length
    # this padding is done on left hand side
    # if sequence is > MAX_LEN, it is truncated on left hand side too
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(
        xtrain, maxlen=config.MAX_LEN
    )

    # zero pad the validation sequences
    xtest = tf.keras.preprocessing.sequence.pad_sequences(
        xtest, maxlen=config.MAX_LEN
    )

    # initialize dataset class for training
    train_dataset = dataset.IMDBDataset(
        reviews=xtrain,
        targets=train_df.sentiment.values
    )

    # create torch dataloader for training
    # torch dataloader loads the data using dataset
    # class in batches specified by batch size
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=2
    )

    # initialize dataset class for validation
    valid_dataset = dataset.IMDBDataset(
        reviews=xtest,
        targets=valid_df.sentiment.values
    )

    # create torch dataloader for validation
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    print("Loading embeddings")
    # load embeddings as shown previously
    embedding_dict = load_vectors("../input/crawl-300d-2M.vec")
    embedding_matrix = create_embedding_matrix(
        tokenizer.word_index, embedding_dict
    )

    # create torch device: use the gpu (cuda) when one is available,
    # otherwise fall back to cpu instead of failing
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # fetch our LSTM model
    model = lstm.LSTM(embedding_matrix)

    # send model to device
    model.to(device)

    # initialize Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print("Training Model")
    # set best accuracy to zero
    best_accuracy = 0
    # set early stopping counter to zero
    early_stopping_counter = 0

    # train and validate for all epochs
    for epoch in range(config.EPOCHS):
        # train one epoch
        engine.train(train_data_loader, model, optimizer, device)

        # validate
        outputs, targets = engine.evaluate(
            valid_data_loader, model, device
        )

        # the model ends in a linear layer with no sigmoid, so outputs are
        # raw logits. A probability threshold of 0.5 after the sigmoid is
        # the same as a threshold of 0 on the logit: sigmoid(z) >= 0.5
        # exactly when z >= 0. The original compared the logits to 0.5.
        outputs = np.array(outputs) >= 0

        # calculate accuracy
        accuracy = metrics.accuracy_score(targets, outputs)
        print(
            f"FOLD:{fold}, Epoch: {epoch}, Accuracy Score = {accuracy}"
        )

        # simple early stopping: count epochs without a new best accuracy
        # (the counter is never reset) and stop after the third one
        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1

        if early_stopping_counter > 2:
            break
if __name__ == "__main__":
    # load the csv that already carries the kfold column
    df = pd.read_csv("../input/imdb_folds.csv")

    # train and validate on each of the five folds, in order
    for fold_id in range(5):
        run(df, fold=fold_id)


In [None]:
# Training settings. They are read as config.MAX_LEN etc. by the training
# cell, so this is presumably the content of config.py (confirm).
# sequence length passed as maxlen to pad_sequences
MAX_LEN = 128
# batch size of the training DataLoader
TRAIN_BATCH_SIZE = 16
# batch size of the validation DataLoader
VALID_BATCH_SIZE = 8
# upper bound on training epochs per fold (early stopping may end sooner)
EPOCHS = 10