In [1]:
!pip install ktrain

Collecting ktrain
  Downloading ktrain-0.27.2.tar.gz (25.3 MB)
[K     |████████████████████████████████| 25.3 MB 96 kB/s 
[?25hCollecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 57.5 MB/s 
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 47.0 MB/s 
Collecting cchardet
  Downloading cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263 kB)
[K     |████████████████████████████████| 263 kB 58.8 MB/s 
Collecting syntok
  Downloading syntok-1.3.1.tar.gz (23 kB)
Collecting seqeval==0.0.19
  Downloading seqeval-0.0.19.tar.gz (30 kB)
Collecting transformers<=4.3.3,>=4.0.0
  Downloading transformers-4.3.3-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 47.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import ktrain
from ktrain import text

In [10]:
# Colab-only step: open the file-upload prompt so train.xlsx and test.xlsx can be
# provided from the local machine. The value returned by the upload call is kept
# in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving test.xlsx to test.xlsx
Saving train.xlsx to train.xlsx


In [11]:
# Load the two uploaded workbooks; dtype=str makes pandas read every column as text.
data_train = pd.read_excel(io="/content/train.xlsx", dtype=str)
data_test = pd.read_excel(io="/content/test.xlsx", dtype=str)

In [13]:
# Dimensions of each dataset, printed as a (rows, columns) tuple.
print("Size of Train dataset:",data_train.shape)
print("Size of Test dataset:",data_test.shape)

Size of Train dataset: (25000, 2)
Size of Test dataset: (25000, 2)


In [14]:
# Preview the last rows of the training dataset.
data_train.tail()

Unnamed: 0,Reviews,Sentiment
24995,Everyone plays their part pretty well in this ...,pos
24996,It happened with Assault on Prescient 13 in 20...,neg
24997,My God. This movie was awful. I can't complain...,neg
24998,"When I first popped in Happy Birthday to Me, I...",neg
24999,"So why does this show suck? Unfortunately, tha...",neg


In [16]:
# Preview the first rows of the test dataset.
data_test.head()

Unnamed: 0,Reviews,Sentiment
0,Who would have thought that a movie about a ma...,pos
1,After realizing what is going on around us ......,pos
2,I grew up watching the original Disney Cindere...,neg
3,David Mamet wrote the screenplay and made his ...,pos
4,"Admittedly, I didn't have high expectations of...",neg


Preparing the Train and Test Data for BERT

In [19]:
# text.texts_from_df returns three things: the (X, y) training pair, the (X, y)
# validation pair, and the preprocessor object that produced them.
# maxlen is the maximum number of words kept per review; anything longer is truncated.
# preprocess_mode='bert' selects the tokenization and text transformation that the
# BERT model expects.
# The test dataframe is passed as val_df, so it serves as the validation set here.
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(
    train_df=data_train,
    val_df=data_test,
    text_column='Reviews',
    label_columns='Sentiment',
    preprocess_mode='bert',
    maxlen=500,
)

['neg', 'pos']
   neg  pos
0  1.0  0.0
1  1.0  0.0
2  1.0  0.0
3  1.0  0.0
4  1.0  0.0
['neg', 'pos']
   neg  pos
0  0.0  1.0
1  0.0  1.0
2  1.0  0.0
3  0.0  1.0
4  1.0  0.0
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


Observation:

1. The language of the reviews is detected as English.
2. This is not a multi-label classification problem.

In [20]:
# Build the classifier. name='bert' selects the BERT model; the training data and
# the preprocessor come from the preprocessing step above.
model = text.text_classifier(
    name='bert',
    train_data=(X_train, y_train),
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 500
done.


In [21]:
# Wrap the model together with its training and validation data in a ktrain learner.
# A batch size of 6 is used because the ktrain documentation recommends it for
# BERT when maxlen is 500.
learner = ktrain.get_learner(
    model=model,
    train_data=(X_train, y_train),
    val_data=(X_test, y_test),
    batch_size=6,
)

In [22]:
# fit() is a plain training loop, whereas fit_onecycle() trains with the one-cycle
# policy callback. Train for a single epoch with a maximum learning rate of 2e-5.
learner.fit_onecycle(epochs=1, lr=2e-5)





begin training using onecycle policy with max lr of 2e-05...

KeyboardInterrupt: ignored

In [23]:
# Pair the learner's model with the preprocessor so that raw review strings can be
# passed straight to predict().
predictor = ktrain.get_predictor(model=learner.model, preproc=preproc)
# Optional alternative save target (a Google Drive path), left disabled:
# predictor.save('/content/drive/My Drive/bert')

In [24]:
# Three hand-written sample reviews used to sanity-check the predictor.
# The strings are model input and are kept verbatim, typos included.
data = [
    "this movie was horrible, the plot was really boring. acting was okay",
    "the fild is really sucked. there is not plot and acting was bad",
    "what a beautiful movie. great plot. acting was good. will see it again",
]

In [25]:
# Predict a class label for each of the sample reviews.
predictor.predict(data)

['neg', 'neg', 'pos']

In [27]:
# Save the predictor to /content/bert; it is loaded back from this path further down.
predictor.save('/content/bert')



Interpretation of the above results:

1. ‘this movie was horrible, the plot was really boring. acting was okay’ – neg
2. ‘the fild is really sucked. there is not plot and acting was bad’ – neg
3. ‘what a beautiful movie. great plot. acting was good. will see it again’ – pos

In [28]:
# return_proba=True returns the predicted probability of each class instead of
# only the predicted label.

predictor.predict(data, return_proba=True)

array([[0.9960681 , 0.00393183],
       [0.9965828 , 0.00341717],
       [0.00181624, 0.9981838 ]], dtype=float32)

In [29]:
# List the class labels the predictor knows about.
predictor.get_classes()

['neg', 'pos']

In [30]:
# Load the predictor back from the path it was saved to above (/content/bert).
predictor_load = ktrain.load_predictor('/content/bert')

In [31]:
# Classify a new review with the reloaded predictor to confirm it still works.
predictor_load.predict(['Movie was awesome i Just loved it.'])

['pos']