<a href="https://colab.research.google.com/github/serdarbozoglan/My_NLP/blob/master/imdb_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt install --quiet tree
!pip install tensorflow-gpu==2.0.0 -q

Reading package lists...
Building dependency tree...
Reading state information...
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 32 not upgraded.
Need to get 40.7 kB of archives.
After this operation, 105 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]
Fetched 40.7 kB in 0s (87.1 kB/s)
Selecting previously unselected package tree.
(Reading database ... 145605 files and directories currently installed.)
Preparing to unpack .../tree_1.7.0-5_amd64.deb ...
Unpacking tree (1.7.0-5) ...
Setting up tree (1.7.0-5) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
[K     |████████████████████████████████| 380.8MB 39kB/s 
[K     |████████████████████████████████| 450kB 18.0MB/s 
[K     |███████████████████████████████

In [3]:
import tensorflow as tf
print(tf.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

2.0.0
GPU is NOT AVAILABLE


In [0]:
import pandas as pd
import glob
import os
import time
import string

# Data Acquisition

In [4]:
!wget -q http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar zxf aclImdb_v1.tar.gz
!tree -d aclImdb

aclImdb
├── test
│   ├── neg
│   └── pos
└── train
    ├── neg
    ├── pos
    └── unsup

7 directories


### Define Global Parameters

In [0]:
NUM_WORDS = 8000
SEQ_LEN = 128
EMBEDDING_SIZE = 128
BATCH_SIZE = 128
EPOCHS = 5
THRESHOLD = 0.5

In [0]:
def get_dfs(start_path):
    
    df = pd.DataFrame(columns=['text', 'sentiment'])
    text  = []
    sentiment = []
    
    for p in ['pos', 'neg']:
        path = os.path.join(start_path, p)
        files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
        
        for f in files:
            with open(os.path.join(path, f), "r") as myfile:
            
                text.append(myfile.read().replace("\n", " ").replace("\r", " "))
                #convert positive reviews to 1  and negatives to 0
                sentiment.append(1 if p == 'pos' else 0)
                
    df['text'] = text
    df['sentiment'] = sentiment
     
    ## this is used to shuffle data to avoid getting sequentially of pos or neg
    
    df = df.sample(frac=1).reset_index(drop=True)
    return df

In [0]:
train_df = get_dfs("aclImdb/train/")
test_df  = get_dfs("aclImdb/test/")

In [48]:
train_df.head()

Unnamed: 0,text,sentiment
0,I was not entirely impressed by this film. It ...,0
1,Kurt Russell is so believable and the action s...,1
2,This movie was horrible. I swear they didn't e...,0
3,Just don't bother. I thought I would see a mov...,0
4,I regret that I've seen this movie. Can't beli...,0


In [0]:
train_df = train_df.sample(frac=1).reset_index(drop=True)

In [50]:
train_df.head()

Unnamed: 0,text,sentiment
0,"Ah, the best and funniest movie about female f...",1
1,Leave it to Braik to put on a good show. Final...,1
2,"I recently got the chance to view ""The Waterda...",1
3,I rented this movie tonight because it looked ...,0
4,The first Cube movie was an art movie. It set ...,0


### Tokenize the data

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [0]:
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token="<UNK>")

In [0]:
tokenizer.fit_on_texts(train_df['text'])

In [0]:
# convert text data to numeric form

train_seqs = tokenizer.texts_to_sequences(train_df['text'])
test_seqs = tokenizer.texts_to_sequences(test_df['text'])

In [55]:
print(train_seqs[0])

[3790, 2, 116, 3, 1526, 18, 42, 666, 2316, 449, 62, 1073, 126, 72, 2, 6437, 4288, 5, 1665, 1, 268, 9, 2388, 1312, 1, 32, 2, 94, 11, 41, 294, 12, 21, 20, 468, 5035, 1686, 6355, 3, 242, 1, 59, 5121, 6, 268, 2, 1, 8, 8, 12, 7, 712, 59, 331, 1637, 2316, 18, 101, 1947, 1, 1823, 3023, 19, 12, 56, 17, 2, 1281, 1006, 5, 265, 38, 4, 1, 416, 5, 18, 229, 37, 4, 766, 35, 682, 399, 229, 100, 61, 24, 4011, 9, 2, 702, 9, 61, 34, 24, 91, 41, 102, 42, 2, 225, 119, 2, 537, 24, 621, 37, 2, 7365, 41, 15, 2, 2736, 7, 954, 700, 3384, 30, 2155, 8, 8, 10, 7, 1279, 16, 4, 18, 6, 95, 70, 460, 44, 1290, 19, 52, 2, 3951, 1628, 1, 29, 5, 2, 537, 81, 2, 1, 135, 5746, 39, 6, 2634, 33, 1, 2374, 91, 37, 4, 3735, 5, 1, 1, 11, 424, 568, 7577, 2, 406, 9, 2, 311, 17, 59, 1, 259, 52, 2, 183, 1628, 7181, 2, 1, 2007, 16, 7917, 8, 8, 1384, 5, 2, 1, 11, 41, 655, 11, 98, 1126, 1, 36, 13, 11, 98, 155, 44, 2, 485, 21, 2, 3476, 3397, 420, 48, 69, 4, 169, 1, 9, 4078, 227, 19, 34, 667, 5400, 6, 1, 296, 736, 4717, 8, 8, 80, 21, 2, 39

In [0]:
# pad data up to SEQ_LEN (we will truncate if there are more than SEQ_LEN tokens)
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_seqs = pad_sequences(train_seqs, maxlen=SEQ_LEN, padding='post')
test_seqs = pad_sequences(test_seqs, maxlen=SEQ_LEN, padding='post')

## **Model Selection**

we will be creating an extremely simple Neural Network containing one layer. It's always best to start simple and we can improve upon it later.

In [0]:
from tensorflow.keras.layers import Dense, Embedding, GlobalAvgPool1D, Dropout

In [58]:
model = tf.keras.Sequential([
                             Embedding(NUM_WORDS, EMBEDDING_SIZE),
                             GlobalAvgPool1D(),
                             Dense(1, activation='sigmoid')
])

model.summary()
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         1024000   
_________________________________________________________________
global_average_pooling1d_2 ( (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 1,024,129
Trainable params: 1,024,129
Non-trainable params: 0
_________________________________________________________________


## Train Model
Here we'll feed our processed data into our model to train it.

In [59]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max')
callbacks = [es]
history = model.fit(train_seqs, train_df['sentiment'].values, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2, callbacks=callbacks)

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluate Model
Here, we'll feed our trained model data that it hasn't seen before (our test set we created earlier). Even our simple model was able to predict sentiment with 85% accuracy.

In [60]:
model.evaluate(test_seqs, test_df['sentiment'].values)[1]



0.85732

## **Save Model**
Now that we have created and train our model, we can save it for use on new incoming data

In [0]:
import pickle
model.save('model.h5')

#saving
with open('tokenizer.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

del model
del tokenizer

# Step 6: Evaluate Next Steps
## In this case, our accuracy is pretty good, but there are still some things we can do to improve it.  Here are some ideas(All of which concretely.ai can assist with):
1. Feed more training data.
2. Tweak the global variables we defined above.  For example, use a bigger vocabulary or change our positive/negative threshold.
3. Add more nodes/layers to our model.
4. Utilize other NLP libraries such as Spacy or NLTK to optimize our Tokenization using lemmatization and stemming.
5. Explore more complex models such as Bert, CNNs and RNNs.

## The most import thing to remember is that you now have a process in place and you should feel free to experiment.

In [0]:
loaded_model = tf.keras.models.load_model('model.h5')
with open('tokenizer.pickle', 'rb') as file:
  loaded_tokenizer = pickle.load(file)

In [0]:
def prepare_predict_data(tokenizer, reviews):
  seqs = tokenizer.texts_to_sequences(reviews)
  seqs = pad_sequences(seqs, maxlen=SEQ_LEN, padding='post')
  return seqs

In [0]:
my_reviews=['this movie was awesome',
           'this movie was the worst movie ive ever seen',
           'i hated everything about this movie',
           'this is my favorite movie of the year']

my_seqs = prepare_predict_data(loaded_tokenizer, my_reviews)

In [0]:
preds = loaded_model.predict(my_seqs)

In [0]:
pred_df = pd.DataFrame(columns=['text', 'sentiment'])
pred_df['text'] = my_reviews
pred_df['sentiment'] = preds

In [74]:
pred_df

Unnamed: 0,text,sentiment
0,this movie was awesome,0.594764
1,this movie was the worst movie ive ever seen,0.392715
2,i hated everything about this movie,0.50787
3,this is my favorite movie of the year,0.688303


In [0]:
pred_df['sentiment'] = pred_df['sentiment'].apply(lambda x: 'pos' if x > THRESHOLD else 'neg')

In [79]:
pred_df

Unnamed: 0,text,sentiment
0,this movie was awesome,pos
1,this movie was the worst movie ive ever seen,neg
2,i hated everything about this movie,pos
3,this is my favorite movie of the year,pos


# Experimental
### Clean up our data before feeding model

In [0]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [0]:
punctuations = string.punctuation
stopwords = nlp.Defaults.stop_words

In [84]:
len(stopwords)

326

In [0]:
# Define function to cleanup text by removing PERSONAL PRONOUNS, stopwords, and puncuation
def cleanup_text(docs, logging=False):
  texts = []
  counter = 1

  for doc in docs:
    if counter % 1000 == 0 and logging:
      print (f"Processed {counter} out of {len(docs)}")
    counter += 1
    doc = nlp(doc, disable=['parser', 'ner'])
    tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
    tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
    tokens = ' '.join(tokens)
    texts.append(tokens)
  return pd.Series(texts)


In [0]:
train_X = cleanup_text(train_df['text'])
test_X = cleanup_text(test_df['text'])

In [90]:
train_X.head()

0    ah good funniest movie female football fan sli...
1    leave braik good finally zorak live life outsi...
2    recently chance view waterdance like understan...
3    rent movie tonight look like fun movie figure ...
4    cube movie art movie set world major archetype...
dtype: object

In [0]:
## create tokenizer

tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token='<UNK>')
tokenizer.fit_on_texts(train_X)

In [0]:
## convert text data to numerical indexes
train_seqs = tokenizer.texts_to_sequences(train_X)
test_seqs = tokenizer.texts_to_sequences(test_X)


In [0]:
#pad data up to SEQ_LEN (note that we truncate if there are more than SEQ_LEN tokens)
train_seqs = pad_sequences(train_seqs, maxlen=SEQ_LEN, padding='post')
test_seqs = pad_sequences(test_seqs, maxlen=SEQ_LEN, padding='post')

In [0]:
model = tf.keras.Sequential([
                             Embedding(NUM_WORDS, EMBEDDING_SIZE),
                             GlobalAvgPool1D(),
                             Dense(1024),
                             Dropout(0.5),
                             Dense(1, activation='sigmoid')
])

In [98]:
model.summary()
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         1024000   
_________________________________________________________________
global_average_pooling1d_3 ( (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              132096    
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 1025      
Total params: 1,157,121
Trainable params: 1,157,121
Non-trainable params: 0
_________________________________________________________________


In [0]:
embed_size = 128


In [99]:
embed_size = 128
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences = True)))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dense(20, activation="relu"))
model.add(tf.keras.layers.Dropout(0.45))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 128)         1024000   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 64)          41216     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 21        
Total params: 1,066,537
Trainable params: 1,066,537
Non-trainable params: 0
____________________________________________

In [101]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max')
#callbacks=[es]
callbacks=[]
history = model.fit(train_seqs, train_df['sentiment'].values, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2, callbacks=callbacks)

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [103]:
model.evaluate(test_seqs, test_df['sentiment'].values)[1]



0.83676

In [0]:
preds = model.predict(test_seqs)
ipreds = (preds > 0.5)

In [0]:
from sklearn.metrics import confusion_matrix


In [111]:
cm = confusion_matrix(ipreds, test_df['sentiment'])
cm

array([[10975,  2556],
       [ 1525,  9944]])

In [110]:
accuracy = (cm[0][0] + cm[1][1]) / (cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1]) 
precision = cm[1][1] / (cm[1][1]+cm[0][1])
recall = cm[1][1] / (cm[1][1] + cm[1][0])
print("ACCURACY - We got the right answer {:.2%} of the time".format(accuracy))
print("PRECISION - Of the items we predictied to be positive {:.2%} actually were".format(precision))
print("RECALL - We identified {:.2%} of the items that were positive".format(recall))

ACCURACY - We got the right answer 83.68% of the time
PRECISION - Of the items we predictied to be positive 79.55% actually were
RECALL - We identified 86.70% of the items that were positive
