In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
new_model = tf.keras.models.load_model('WeightedModel2\\tf_cnnmodel')
new_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 390, 64)           1280064   
_________________________________________________________________
conv1d (Conv1D)              (None, 389, 128)          16512     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
flatten (Flatten)            (None, 128)               0         
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0

In [4]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [8]:
test_data = pd.read_csv('data_test.csv', converters={'reviews' : str})
# test_data.drop(["Date", "Content"], axis=1, inplace=True)
# test_data.fillna(0)
# test_data = test_data.fillna("")
test_data.sample(5)

Unnamed: 0,Date,Content,review,scores,compound,sentiment
1309,16-Aug,We arrived to the beach around 4;30pm and the ...,arrive around 430pm buzzing pull beanbag drank...,"{'neg': 0.0, 'neu': 0.628, 'pos': 0.372, 'comp...",0.7964,1
1545,15-Nov,i love the seminyak beach...the sands...the co...,love seminyak beachthe sandsthe colourful plac...,"{'neg': 0.0, 'neu': 0.513, 'pos': 0.487, 'comp...",0.8591,1
1145,16-Mar,Fantastic beach. Cleaner after the Grand Hyatt...,fantastic clean grand hyatt unfortunately see ...,"{'neg': 0.284, 'neu': 0.401, 'pos': 0.315, 'co...",0.2732,1
1693,16-Sep,Seminyak beach was always packed with tourists...,seminyak always pack tourist water cold hire c...,"{'neg': 0.0, 'neu': 0.616, 'pos': 0.384, 'comp...",0.6808,1
2048,17-May,Such a lovely beach to relax and watch the sun...,lovely relax watch sun go near busy kuta vibe ...,"{'neg': 0.019, 'neu': 0.649, 'pos': 0.333, 'co...",0.979,1


In [9]:
try:
    x_test  = np.array( tokenizer.texts_to_sequences(test_data['review'].tolist()) )
    # x_test = tokenizer
    x_test = pad_sequences(x_test, padding='post', maxlen=1210)
except AttributeError:
    pass

  


In [10]:
# Generate predictions (probabilities -- the output of the last layer)
# on test  data using `predict`
print("Generate predictions for all samples")
predictions = new_model.predict(x_test)
print(predictions)
predict_results = predictions.argmax(axis=1)

Generate predictions for all samples
[[0.09216475 0.8810382 ]
 [0.05293751 0.928354  ]
 [0.06831741 0.90956306]
 ...
 [0.10228038 0.87140906]
 [0.6566642  0.34539357]
 [0.20985025 0.7640997 ]]


In [11]:
test_data['pred_sentiment'] = predict_results
test_data['pred_sentiment'] = np.where((test_data.pred_sentiment == 0),'0',test_data.pred_sentiment)
test_data['pred_sentiment'] = np.where((test_data.pred_sentiment == '1'),'1',test_data.pred_sentiment)

In [12]:
labels = ['0', '1']
    
print(classification_report(test_data['sentiment'].values.astype(int).astype(str),test_data['pred_sentiment'].values,labels=labels))

              precision    recall  f1-score   support

           0       0.54      0.56      0.55       222
           1       0.95      0.94      0.95      1905

    accuracy                           0.90      2127
   macro avg       0.74      0.75      0.75      2127
weighted avg       0.91      0.90      0.90      2127



In [13]:
confusion_matrix = pd.crosstab(test_data['sentiment'], test_data['pred_sentiment'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Predicted    0     1
Actual              
0          124    98
1          107  1798


In [14]:
roc_auc = roc_auc_score(test_data['sentiment'], test_data['pred_sentiment'])

print("ROC AUC Score")
print(roc_auc)

ROC AUC Score
0.751195289777967
