In [None]:
!pip install ktrain

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import ktrain
from ktrain import text
import tensorflow as tf
import seaborn as sns

In [None]:
# Show the installed TensorFlow version so the run can be reproduced.
tf.__version__

In [None]:
# Load the IMDB 50k movie-review dataset from the Kaggle input directory
# and preview the first rows.
data = pd.read_csv(
    "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
)
data.head()

In [None]:
# Class balance of the full dataset. The column is passed by keyword because
# seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(data=data, x="sentiment")

In [None]:
# Positional split: the first TRAIN_SIZE rows are used for training and the
# following TEST_SIZE rows for validation/testing.
TRAIN_SIZE = 40000
TEST_SIZE = 10000

# Use explicit positional indexing and give both splits a fresh 0-based index.
# TEST_SIZE bounds the test split so its size matches the declared constant.
data_train = data.iloc[:TRAIN_SIZE].reset_index(drop=True)
data_test = data.iloc[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE].reset_index(drop=True)

In [None]:
# Preview the first rows of the training split.
data_train.head()

In [None]:
# Preview the first rows of the test split.
data_test.head()

In [None]:
# Class balance of the training split. The column is passed by keyword because
# seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(data=data_train, x="sentiment")

In [None]:
# Class balance of the test split. The column is passed by keyword because
# seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(data=data_test, x="sentiment")

In [None]:
# Report the (rows, columns) shape of each split.
for caption, frame in (
    ("Size of train dataset: ", data_train),
    ("Size of test dataset: ", data_test),
):
    print(caption, frame.shape)

### Preprocessing the train and test sets for BERT

In [None]:
# Convert both DataFrames into model-ready arrays plus a fitted preprocessor.
# maxlen: number of tokens kept per review; anything beyond that is truncated.
# preprocess_mode="bert": tokenize and encode the text the way the BERT model expects.
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(
    train_df=data_train,
    text_column="review",
    label_columns="sentiment",
    val_df=data_test,
    maxlen=500,
    # NOTE(review): n-gram features presumably have no effect in BERT mode — confirm.
    ngram_range=2,
    preprocess_mode="bert",
)

In [None]:
# Length of the second element of the preprocessed training input
# (presumably one of the BERT input arrays — confirm against ktrain's output format).
len(X_train[1])

In [None]:
# Shape of the first element of the preprocessed training input
# (presumably (n_samples, maxlen) — confirm).
X_train[0].shape

## Model Building

In [None]:
# Build the classifier; name="bert" selects ktrain's BERT text-classification model.
model = text.text_classifier(
    name="bert",
    train_data=(X_train, y_train),
    preproc=preproc,
)

In [None]:
# Wrap the model and data in a ktrain learner.
# Batch size 6 is used because the documentation recommends it for maxlen = 500.
learner = ktrain.get_learner(
    model=model,
    train_data=(X_train, y_train),
    val_data=(X_test, y_test),
    batch_size=6,
)

In [None]:
# To find a good learning rate, uncomment the lines below (note: this can take about a day to run).
# learner.lr_find()
# learner.lr_plot()

### Fitting The Model

In [None]:
# `fit` is a basic training loop, whereas `fit_onecycle` trains with the
# one-cycle learning-rate policy. Train for one epoch with lr = 2e-5.

learner.fit_onecycle(lr = 2e-5, epochs = 1)

### Saving Model

In [None]:
# Bundle the trained model with its preprocessor into a predictor and save it
# under ./models/sentiment_analysis.
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save("./models/sentiment_analysis")

## Prediction From Model

In [None]:
# Hand-written reviews used to smoke-test the trained predictor.
# NOTE(review): this rebinds `data`, which earlier held the IMDB DataFrame, so the
# earlier DataFrame cells cannot be re-run after this one without reloading the CSV.
data = [
    "this movie was horrible, the plot was really boring. acting was okay",
    "the fild is really sucked. there is not plot and acting was bad",
    "what a beautiful movie. great plot. acting was good. will see it again",
]

In [None]:
# Predict a label for each sample review.
predictor.predict(data)

In [None]:
# return_proba=True returns the predicted probability for each class instead of a label.

predictor.predict(data, return_proba=True)

In [None]:
# Class labels known to the predictor.

predictor.get_classes()

### Load the model

In [None]:
# Reload the predictor that was saved to ./models/sentiment_analysis.

predictor_load = ktrain.load_predictor("./models/sentiment_analysis")

In [None]:
# Run the reloaded predictor on the same sample reviews.

predictor_load.predict(data)

In [None]:
# Score a few unseen sentences with the reloaded predictor and print each one
# with its class probabilities and the winning label.
new_data = [
    "this movie is shit, feels like i have wasted my time",
    "best movie i have seen",
    "you are a good man",
]
new_prediction = predictor_load.predict(new_data, return_proba=True)

# NOTE(review): assumes class index 0 is "negative" — confirm with predictor_load.get_classes().
for review, proba in zip(new_data, new_prediction):
    is_negative = np.argmax(proba) == 0
    if is_negative:
        print(f"{review} => {proba} => negative")
    else:
        print(f"{review} => {proba}=> positive")

### Download Model

In [None]:
import os
os.chdir(r'./models/sentiment_analysis')

In [None]:
os.listdir()

### Model File Size

In [None]:
for file in os.listdir():
    print(f"{file}: {round(os.path.getsize(file)/1e+6,2)} MB")

#### Links to download model files
<a href="./models/sentiment_analysis/tf_model.h5"> Download h5 Model </a><br>
<a href="./models/sentiment_analysis/tf_model.preproc"> Download preproc Model </a>