In [42]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import pickle

In [2]:
!pip install -q tf-models-official==2.7.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.9/238.9 KB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m352.1/352.1 KB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.9 MB/s[0m eta

In [3]:
import tensorflow_text as text
from keras.models import load_model
from official.nlp import optimization

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
class model:
  def __init__(self , file="data/airline_sentiment_analysis.csv"):
    temd = pd.read_csv(file)
    temd['airline_sentiment'].loc[temd['airline_sentiment']=='positive']=1
    temd['airline_sentiment'].loc[temd['airline_sentiment']=='negative']=0
    self.data = temd
    self.data['text'] = self.data['text'].apply(self.text_cleaner)

  def print_confusion(self,tp,tn,fp,fn):
    recall = tp/(tp+fn)
    precision = tp/(tp+fp)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    f1_score = (2*recall*precision)/(recall+precision)

    print("recall: ", recall)
    print("precision: ",precision)
    print("accuracy ",accuracy)
    print("f1_score ",f1_score)

  def vectoriser_initialise(self, estimators, random_seed):
    cv=CountVectorizer(stop_words='english')
    self.X=cv.fit_transform(self.data['text']).toarray()
    self.y=self.data['airline_sentiment'].astype('int64')
    self.vectoriser_model = RandomForestClassifier(n_estimators = estimators, criterion = 'entropy', random_state = random_seed)
    self.cv = cv

  def vectoriser_fit(self):
    self.vectoriser_model.fit(self.X_train, self.y_train)
    
  def vectoriser_metrics(self):
    y_pred = self.vectoriser_model.predict(self.X_test)
    cm = confusion_matrix(self.y_test, y_pred)
    tp=cm[0][0]
    tn=cm[1][1]
    fp=cm[0][1]
    fn=cm[1][0]
    self.print_confusion(tp=tp,tn=tn,fp=fp,fn=fn)

  def bert_initialise(self):
    self.X = self.data['text']
    self.y = self.data['airline_sentiment'].astype('int64')

    bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8' 

    map_name_to_handle = {
        'small_bert/bert_en_uncased_L-4_H-512_A-8':
            'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
    }

    map_model_to_preprocess = {
        'small_bert/bert_en_uncased_L-4_H-512_A-8':
            'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
    }

    tfhub_handle_encoder = map_name_to_handle[bert_model_name]
    tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

    bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
    bert_model = hub.KerasLayer(tfhub_handle_encoder)   

    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessed_text = bert_preprocess_model(text_input)
    outputs = bert_model(preprocessed_text)
    self.tfhub_handle_preprocess = tfhub_handle_preprocess
    self.tfhub_handle_encoder = tfhub_handle_encoder
    # Neural network layers
    l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
    l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
    # Use inputs and outputs to construct a final model
    self.bert_model = tf.keras.Model(inputs=[text_input], outputs = [l])


  def text_cleaner(self, tweet):
    tweet = re.sub('@[\w]+','',tweet)   #removes username handles
    tweet = re.sub(r"http\S+", "",tweet) #removes links/urls
    tweet = re.sub(r'#', '', tweet)       #removes "#"
    tweet = re.sub(r'([A-Za-z])\1{2,}', r'\1', tweet) #removes repeating characters and replaces with single character

    tweet = re.sub(r' 0 ', 'zero', tweet)        #only number allowed is zero in alphabet form, all other omitted
    tweet = re.sub(r'[^A-Za-z ]', '', tweet)

    tweet = tweet.lower()
    return tweet

  def split(self, test_size , random_seed):
    X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size= test_size, random_state=random_seed)
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test

  def bert_fit(self, epochs, batches):
    self.bert_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = self.bert_model.fit(self.X_train, self.y_train, epochs= epochs, batch_size = batches)


  def bert_metrics(self):
    y_pred = self.bert_model.predict(self.X_test)
    y_pred = y_pred.flatten()
    tp =0
    fp =0
    tn =0
    fn=0
    print(len(y_pred))
    print(len(self.y_test))
    for i in range(len(y_pred)):

      if(y_pred[i]>0.5):
        if(self.y_test.iloc[i]==0):
          fp+=1
        else:
          tp+=1
      else:
        if(self.y_test.iloc[i]==0):
          tn+=1
        else:
          fn+=1
    self.print_confusion(tp,tn,fp,fn)

In [41]:
data_file = "data/airline_sentiment_analysis.csv"
bert_model = model(data_file)
bert_model.bert_initialise()
bert_model.split(0.2 , 42)                  #(test_size , random_seed)
bert_model.bert_fit(4 , 75)                 #(epochs , batch_size)
bert_model.bert_metrics()

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
2309
2309
recall:  0.5503355704697986
precision:  0.8945454545454545
accuracy  0.9003897791251624
f1_score  0.6814404432132964


In [31]:
data_file = "data/airline_sentiment_analysis.csv"
randomf_model = model(data_file)
randomf_model.vectoriser_initialise(150, 0) #(n_estimators , random_seed)
randomf_model.split(0.2 , 42)               #(test_size , random_seed)
randomf_model.vectoriser_fit()
randomf_model.vectoriser_metrics()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


recall:  0.9174782162993337
precision:  0.9613319011815252
accuracy  0.899090515374621
f1_score  0.938893259900341


In [32]:
pickle_out = open("trained_models/randomf_model.pkl" , "wb")
pickle.dump(randomf_model.vectoriser_model , pickle_out )
pickle_out.close()

In [33]:
cv_out = open("trained_models/vectors.pkl" , "wb")
pickle.dump(randomf_model.cv , cv_out)
pickle_out.close()

In [44]:
bert_out = open("trained_models/bert_model.pkl" , "wb")
pickle.dump(bert_model.bert_model , bert_out )
bert_out.close()

