In [3]:
# <-- Import Libraries -->

import pandas as pd
import numpy as np

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, precision_score, recall_score

from kerastuner import HyperModel
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from tensorflow.keras.models import load_model

import json
import pickle 
import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
df = pd.read_csv('airline_sentiment_analysis.csv')  # Read Dataset

In [5]:
df

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
...,...,...,...
11536,14633,negative,@AmericanAir my flight was Cancelled Flightled...
11537,14634,negative,@AmericanAir right on cue with the delays👌
11538,14635,positive,@AmericanAir thank you we got on a different f...
11539,14636,negative,@AmericanAir leaving over 20 minutes Late Flig...


In [6]:
df.drop(labels='Unnamed: 0', axis=1, inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11541 entries, 0 to 11540
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   airline_sentiment  11541 non-null  object
 1   text               11541 non-null  object
dtypes: object(2)
memory usage: 180.5+ KB


### Check if data set is imbalanced or not

In [8]:
df['airline_sentiment'].value_counts()

negative    9178
positive    2363
Name: airline_sentiment, dtype: int64

In [9]:
df['airline_sentiment'].value_counts(normalize=True)

negative    0.795252
positive    0.204748
Name: airline_sentiment, dtype: float64

### Handling Imbalanced Data set

In [10]:
mask = (df['airline_sentiment'] == 'positive')
temp_df = df[mask]

temp_df.head()

Unnamed: 0,airline_sentiment,text
0,positive,@VirginAmerica plus you've added commercials t...
4,positive,"@VirginAmerica yes, nearly every time I fly VX..."
5,positive,"@virginamerica Well, I didn't…but NOW I DO! :-D"
6,positive,"@VirginAmerica it was amazing, and arrived an ..."
7,positive,@VirginAmerica I &lt;3 pretty graphics. so muc...


In [11]:
# Basic Text Cleaning

texts = list()

for review in temp_df['text']:
    
    # Remove Emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    review = emoji_pattern.sub(r'', review)
    
    patterns = ['(http:\/\/[a-z0-9]+\.[a-z]+\/[a-zA-Z0-9#]+\/?)',  # Links
                '@[A-za-z]+', # Mail
                '[^A-Za-z]', # Punctuation
               ]
    
    for patt in patterns:
        review = re.sub(pattern=patt, 
                        repl=' ', 
                        string=review)
    
    review = review.lower()
    
    texts.append(review)

### Text Augmentation (By finding Synonyms)

In [12]:
# Find Synonyms of the words

new_positive_reviews_list1 = list()
new_positive_reviews_list2 = list()

for text in texts:
    text = text.strip().split(' ')
    
    t_ls1 = list()
    t_ls2 = list()
    
    for character in text:
        if character != '':
            try:
                syns = wordnet.synsets(character)
                synonym1 = syns[0].lemmas()[0].name()
                synonym2 = syns[0].lemmas()[1].name()
                t_ls1.append(synonym1)
                t_ls2.append(synonym2)
            except Exception as e:
                t_ls1.append(character)
                t_ls2.append(character)

    review1 = ' '.join(t_ls1)
    review2 = ' '.join(t_ls2)
    
    new_positive_reviews_list1.append(review1)
    new_positive_reviews_list2.append(review2)

In [13]:
print(texts[0])
print(new_positive_reviews_list1[0])
print(new_positive_reviews_list2[0])

  plus you ve added commercials to the experience    tacky 
asset you ve added commercial to the experience tacky
plus you ve added commercial_message to the experience tacky


In [14]:
print(len(new_positive_reviews_list1))
print(len(new_positive_reviews_list2))

2363
2363


In [15]:
merged_dict = {
    'airline_sentiment': ['positive' for i in range(0, 2363+2363)],
    'text': new_positive_reviews_list1 + new_positive_reviews_list2
}

temp_d = pd.DataFrame(merged_dict) 
temp_d

Unnamed: 0,airline_sentiment,text
0,positive,asset you ve added commercial to the experienc...
1,positive,yes about every time iodine fly vx this ear wo...
2,positive,well iodine didn thymine merely now iodine bas...
3,positive,information_technology Washington amaze and ar...
4,positive,iodine lt pretty artwork sol much better than ...
...,...,...
4721,positive,love the new aeroplane for the Jack_Kennedy sl...
4722,positive,flight Evergreen_State great fantastic cabin c...
4723,positive,give_thanks you client dealings will revaluati...
4724,positive,thanks He is


In [16]:
# Merged the data

df = pd.concat([df, temp_d], ignore_index = True) 
df.reset_index()

Unnamed: 0,index,airline_sentiment,text
0,0,positive,@VirginAmerica plus you've added commercials t...
1,1,negative,@VirginAmerica it's really aggressive to blast...
2,2,negative,@VirginAmerica and it's a really big bad thing...
3,3,negative,@VirginAmerica seriously would pay $30 a fligh...
4,4,positive,"@VirginAmerica yes, nearly every time I fly VX..."
...,...,...,...
16262,16262,positive,love the new aeroplane for the Jack_Kennedy sl...
16263,16263,positive,flight Evergreen_State great fantastic cabin c...
16264,16264,positive,give_thanks you client dealings will revaluati...
16265,16265,positive,thanks He is


In [17]:
df.airline_sentiment.value_counts()

negative    9178
positive    7089
Name: airline_sentiment, dtype: int64

In [18]:
df['airline_sentiment'].value_counts(normalize=True)

negative    0.56421
positive    0.43579
Name: airline_sentiment, dtype: float64

In [19]:
# Saved Balanced Augmented Data

df.to_csv('Balanced_Data.csv')

## Text Cleaning

In [20]:
corpus = list()

for review in df['text']:
    
    # Remove Emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    review = emoji_pattern.sub(r'', review)
    
    patterns = ['(http:\/\/[a-z0-9]+\.[a-z]+\/[a-zA-Z0-9#]+\/?)',  # Links
                '@[A-za-z]+', # Mail
                '[^A-Za-z]', # Punctuation
               ]
    
    for patt in patterns:
        review = re.sub(pattern=patt, 
                        repl=' ', 
                        string=review)
    
    review = review.lower()  # Lower case 
    review = review.split()  # List of each words
        
    # StopWords
    all_stopwords = set(stopwords.words('english'))  # Set of all Stopwords
    all_stopwords.remove('not')
    
    # Stemming
    ps = PorterStemmer()
    
    review = [ps.stem(word) for word in review if word not in all_stopwords]
    review = ' '.join(review)
    
    corpus.append(review)

In [21]:
df.replace({'negative':0, 'positive':1}, inplace=True)  # Encode Categorical Dependent Feature

In [22]:
new_df = pd.DataFrame(list(zip(corpus, df['airline_sentiment'])),
                      index = range(0, len(df['airline_sentiment'])),
                      columns=['Text', 'Sentiment'])

In [23]:
# <-- Save Cleaned Data -->

new_df.to_csv('Cleaned_Data.csv')

# Model Making

## Bag of Words

In [24]:
# <-- Convert text into numeric vector -->

cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus)

In [25]:
with open('CountVectorizer.pkl','wb') as f:
    pickle.dump(cv, f)

In [26]:
X

<16267x1500 sparse matrix of type '<class 'numpy.int64'>'
	with 128317 stored elements in Compressed Sparse Row format>

In [27]:
X = X.toarray()

In [28]:
X  # Dependent Feature

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
Y = df['airline_sentiment']  # Independent Feature

### Train Test Split

In [30]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.80, test_size=0.20, random_state=42)

## Machine Learning Models

In [70]:
# All Model Details

model_names = list()
acc = list()
tp = list()
tn = list()
fp = list()
fn = list()
f1 = list()
preci = list()
recal = list()

### Radnom Forest Classifier

In [32]:
rf_classifier = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=0) 
rf_classifier.fit(x_train, y_train)

RandomForestClassifier(n_estimators=10, random_state=0)

In [33]:
# <<-- Hyper Parameter Tuning -->>

parameters = [{'criterion':['entropy', 'gini'], 'min_samples_split':[2, 4, 8], 'n_estimators':[10, 15, 20, 25, 30]}]

grid_search = GridSearchCV(estimator = rf_classifier,
                           param_grid = parameters,
                           scoring = 'accuracy', 
                           cv = 10,
                           n_jobs = -1, 
                           verbose=10)

grid_search = grid_search.fit(x_train, y_train)

best_accuracy = grid_search.best_score_   
best_parameters = grid_search.best_params_  

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6

Best Accuracy: 92.24 %
Best Parameters: {'criterion': 'entropy', 'min_samples_split': 2, 'n_estimators': 25}


In [71]:
# Make new Random Forest model with best parameters

rf_classifier = RandomForestClassifier(n_estimators=25, criterion='entropy', min_samples_split=2, random_state=0) 
rf_classifier.fit(x_train, y_train)
predicted_values = rf_classifier.predict(x_test)

In [72]:
# Evaluation of model

accuracy = accuracy_score(y_test, predicted_values)*100
cf = confusion_matrix(y_test, predicted_values)
report = classification_report(y_test, predicted_values)

print(f'Accuracy: {accuracy}%\n')
print(f'Confusion Matrix: \n{cf}\n')
print(report)

Accuracy: 92.77811923786109%

Confusion Matrix: 
[[1769   98]
 [ 137 1250]]

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1867
           1       0.93      0.90      0.91      1387

    accuracy                           0.93      3254
   macro avg       0.93      0.92      0.93      3254
weighted avg       0.93      0.93      0.93      3254



In [73]:
# Save Model

with open('Random_Forest_Model.pkl','wb') as f:
    pickle.dump(rf_classifier, f)

In [74]:
# Save Details

algorithm = 'Random Forest'
accuracy = round(accuracy_score(y_test, predicted_values), 3)
true_positive = cf[0][0]
true_negative = cf[1][1]
false_positive = cf[0][1]
false_negative = cf[1][0]
F1_score = round(f1_score(y_test, predicted_values), 3) 
Precision_score = round(precision_score(y_test, predicted_values), 3)
Recall_score = round(recall_score(y_test, predicted_values), 3)

model_names.append(algorithm)
acc.append(accuracy)
tp.append(true_positive)
tn.append(true_negative)
fp.append(false_positive)
fn.append(false_negative)
f1.append(F1_score)
preci.append(Precision_score)
recal.append(Recall_score)

### XGBoost Classifier

In [38]:
xgboost_classifier = XGBClassifier()
xgboost_classifier.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [40]:
# <<-- Hyper Parameter Tuning -->>

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 600, num = 20)]
max_depth = [int(x) for x in np.linspace(5, 20, num = 6)]

parameters = {'n_estimators': n_estimators,
               'max_depth': max_depth}

random_search = RandomizedSearchCV(estimator = xgboost_classifier,
                           param_distributions  = parameters,
                           scoring = 'accuracy', 
                           cv = 5,
                           n_jobs=-1,
                           verbose=10)

random_search = random_search.fit(x_train, y_train)

best_accuracy = random_search.best_score_   
best_parameters = random_search.best_params_  

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 23.1min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 64.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 80.7min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed: 117.4min remaining: 25.8min
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed: 138.1min remaining:  8.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 141.6min finished


Best Accuracy: 92.86 %
Best Parameters: {'n_estimators': 389, 'max_depth': 8}


In [75]:
# Make new XGBoost model with best parameters

xgboost_classifier = XGBClassifier(n_estimators=389, max_depth=8)
xgboost_classifier.fit(x_train, y_train)
predicted_values = xgboost_classifier.predict(x_test)

In [76]:
# Evaluation of model

accuracy = accuracy_score(y_test, predicted_values)*100
cf = confusion_matrix(y_test, predicted_values)
report = classification_report(y_test, predicted_values)

print(f'Accuracy: {accuracy}%\n')
print(f'Confusion Matrix: \n{cf}\n')
print(report)

Accuracy: 93.76152427781193%

Confusion Matrix: 
[[1780   87]
 [ 116 1271]]

              precision    recall  f1-score   support

           0       0.94      0.95      0.95      1867
           1       0.94      0.92      0.93      1387

    accuracy                           0.94      3254
   macro avg       0.94      0.93      0.94      3254
weighted avg       0.94      0.94      0.94      3254



In [77]:
# Save Model

with open('XGBoost_Model.pkl','wb') as f:
    pickle.dump(xgboost_classifier, f)

In [78]:
# Save Details

algorithm = 'XGBoost'
accuracy = round(accuracy_score(y_test, predicted_values), 3)
true_positive = cf[0][0]
true_negative = cf[1][1]
false_positive = cf[0][1]
false_negative = cf[1][0]
F1_score = round(f1_score(y_test, predicted_values), 3) 
Precision_score = round(precision_score(y_test, predicted_values), 3)
Recall_score = round(recall_score(y_test, predicted_values), 3)

model_names.append(algorithm)
acc.append(accuracy)
tp.append(true_positive)
tn.append(true_negative)
fp.append(false_positive)
fn.append(false_negative)
f1.append(F1_score)
preci.append(Precision_score)
recal.append(Recall_score)

### Light GBM

In [45]:
lightgbm_classifier = LGBMClassifier()
lightgbm_classifier.fit(x_train, y_train)

LGBMClassifier()

In [46]:
# <<-- Hyper Parameter Tuning -->>

n_estimators = [int(x) for x in np.linspace(start = 300, stop = 600, num = 25)]
max_depth = [int(x) for x in np.linspace(5, 20, num = 6)]

parameters = {'n_estimators': n_estimators,
               'max_depth': max_depth}

random_search = RandomizedSearchCV(estimator = lightgbm_classifier,
                           param_distributions  = parameters,
                           scoring = 'accuracy', 
                           cv = 5,
                           n_jobs=-1,
                           verbose=10)

random_search = random_search.fit(x_train, y_train)

best_accuracy = random_search.best_score_   
best_parameters = random_search.best_params_   

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:   39.6s remaining:    8.6s
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:   41.4s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   42.7s finished


Best Accuracy: 92.73 %
Best Parameters: {'n_estimators': 475, 'max_depth': 14}


In [79]:
# Make new LGBM model with best parameters

lightgbm_classifier = LGBMClassifier(n_estimators=475, max_depth=14)
lightgbm_classifier.fit(x_train, y_train)
predicted_values = lightgbm_classifier.predict(x_test)



In [80]:
# Evaluation of model

accuracy = accuracy_score(y_test, predicted_values)*100
cf = confusion_matrix(y_test, predicted_values)
report = classification_report(y_test, predicted_values)

print(f'Accuracy: {accuracy}%\n')
print(f'Confusion Matrix: \n{cf}\n')
print(report)

Accuracy: 93.4542102028273%

Confusion Matrix: 
[[1774   93]
 [ 120 1267]]

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1867
           1       0.93      0.91      0.92      1387

    accuracy                           0.93      3254
   macro avg       0.93      0.93      0.93      3254
weighted avg       0.93      0.93      0.93      3254



In [81]:
# Save Model

with open('LGBM_Model.pkl','wb') as f:
    pickle.dump(lightgbm_classifier, f)

In [82]:
# Save Details

algorithm = 'LGBM'
accuracy = round(accuracy_score(y_test, predicted_values), 3)
true_positive = cf[0][0]
true_negative = cf[1][1]
false_positive = cf[0][1]
false_negative = cf[1][0]
F1_score = round(f1_score(y_test, predicted_values), 3) 
Precision_score = round(precision_score(y_test, predicted_values), 3)
Recall_score = round(recall_score(y_test, predicted_values), 3)

model_names.append(algorithm)
acc.append(accuracy)
tp.append(true_positive)
tn.append(true_negative)
fp.append(false_positive)
fn.append(false_negative)
f1.append(F1_score)
preci.append(Precision_score)
recal.append(Recall_score)

## Deep Learning Model

### ANN Classifier

In [53]:
class MyHyperModel(HyperModel):

    def __init__(self, num_classes):
        self.num_classes = num_classes

    def build(self, hp):
        model = keras.Sequential()
        for i in range(hp.Int('num_layers', 2, 20)):
            model.add(layers.Dense(units=hp.Int('units_' + str(i),
                                                min_value=32,
                                                max_value=512,
                                                step=32),
                                   activation='relu'))
        model.add(layers.Dense(self.num_classes, activation='sigmoid'))
        model.compile(
                    optimizer=keras.optimizers.Adam(
                          hp.Choice('learning_rate',
                          values=[1e-2, 1e-3, 1e-4])),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
        return model


hypermodel = MyHyperModel(num_classes=1)

tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=10,
    directory='my_dir',
    project_name='helloworld')

tuner.search(x_train, y_train,
             epochs=100,
             validation_data=(x_test, y_test))

Trial 10 Complete [00h 06m 40s]
val_accuracy: 0.9342347979545593

Best val_accuracy So Far: 0.9440688490867615
Total elapsed time: 01h 07m 29s
INFO:tensorflow:Oracle triggered exit


In [54]:
model = tuner.get_best_models(num_models=2)[0] # Best model







In [55]:
tuner.results_summary()  # Summary of the tuner that experimented deifferent models

Results summary
Results in my_dir\helloworld
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
num_layers: 16
units_0: 384
units_1: 256
learning_rate: 0.001
units_2: 64
units_3: 32
units_4: 224
units_5: 256
units_6: 96
units_7: 224
units_8: 32
units_9: 448
units_10: 352
units_11: 288
units_12: 384
units_13: 448
units_14: 416
units_15: 32
Score: 0.9440688490867615
Trial summary
Hyperparameters:
num_layers: 8
units_0: 160
units_1: 160
learning_rate: 0.001
units_2: 384
units_3: 192
units_4: 160
units_5: 160
units_6: 64
units_7: 192
units_8: 480
units_9: 384
units_10: 320
units_11: 352
units_12: 352
units_13: 192
units_14: 416
units_15: 416
Score: 0.9431468844413757
Trial summary
Hyperparameters:
num_layers: 2
units_0: 192
units_1: 416
learning_rate: 0.0001
units_2: 352
units_3: 288
units_4: 288
units_5: 288
units_6: 352
units_7: 96
units_8: 64
units_9: 352
units_10: 128
units_11: 352
units_12: 352
units_13: 384
units_14: 128
units_15: 32

In [56]:
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x1a6b802ba88>

In [57]:
# Evaluate the model on testing data

loss, accuracy = model.evaluate(x_test, y_test)



In [58]:
model.metrics_names

['loss', 'accuracy']

In [59]:
model.summary()  # Best model summary

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 384)               576384    
_________________________________________________________________
dense_1 (Dense)              (None, 256)               98560     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 224)               7392      
_________________________________________________________________
dense_5 (Dense)              (None, 256)               57600     
_________________________________________________________________
dense_6 (Dense)              (None, 96)                2

In [83]:
# Preidict values

predicted_values = model.predict(x_test)

In [84]:
thresh = 0.5  # Threshold

predicted_values = [1 if i[0] > thresh else 0 for i in predicted_values]

In [85]:
# Evaluation of model

accuracy = accuracy_score(y_test, predicted_values)*100
cf = confusion_matrix(y_test, predicted_values)
report = classification_report(y_test, predicted_values)

print(f'Accuracy: {accuracy}%\n')
print(f'Confusion Matrix: \n{cf}\n')
print(report)

Accuracy: 94.40688383527966%

Confusion Matrix: 
[[1777   90]
 [  92 1295]]

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1867
           1       0.94      0.93      0.93      1387

    accuracy                           0.94      3254
   macro avg       0.94      0.94      0.94      3254
weighted avg       0.94      0.94      0.94      3254



In [86]:
# Save Details

algorithm = 'ANN'
accuracy = round(accuracy_score(y_test, predicted_values), 3)
true_positive = cf[0][0]
true_negative = cf[1][1]
false_positive = cf[0][1]
false_negative = cf[1][0]
F1_score = round(f1_score(y_test, predicted_values), 3) 
Precision_score = round(precision_score(y_test, predicted_values), 3)
Recall_score = round(recall_score(y_test, predicted_values), 3)

model_names.append(algorithm)
acc.append(accuracy)
tp.append(true_positive)
tn.append(true_negative)
fp.append(false_positive)
fn.append(false_negative)
f1.append(F1_score)
preci.append(Precision_score)
recal.append(Recall_score)

In [65]:
# <<-- Save Model in HDF5 file format -->>

model.save("ANN_Model.h5")

In [66]:
# <<-- Load the saved model -->>

loaded_model = load_model("ANN_Model.h5") # Load the model
loss, accuracy = loaded_model.evaluate(x_test, y_test) # Test the accuracy on test data



## Save all Model Details

In [87]:
all_models_summary_df = pd.DataFrame(list(zip(model_names, acc, tp, tn, fp, fn, f1, preci, recal)),
            index = range(1, 5, 1), 
            columns=['Model', 'Accuracy', 'TP', 'TN', 'FP', 'FN', 'f1', 'precision', 'recall'])
all_models_summary_df

Unnamed: 0,Model,Accuracy,TP,TN,FP,FN,f1,precision,recall
1,Random Forest,0.928,1769,1250,98,137,0.914,0.927,0.901
2,XGBoost,0.938,1780,1271,87,116,0.926,0.936,0.916
3,LGBM,0.935,1774,1267,93,120,0.922,0.932,0.913
4,ANN,0.944,1777,1295,90,92,0.934,0.935,0.934


In [88]:
all_models_summary_df.to_csv('Summary.csv')  # Save data to csv file

## Single Prediction (Using OOPS Concept)

In [90]:
class SentimentAnalysis:
    def __init__(self, text):
        self.review = text
        self.copy_review = self.review

        self.emoji_pattern = re.compile("["
                                        u"\U0001F600-\U0001F64F"  # emoticons
                                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                        "]+", flags=re.UNICODE)

        self.patterns = ['(http:\/\/[a-z0-9]+\.[a-z]+\/[a-zA-Z0-9#]+\/?)',  # Links
                         '@[A-za-z]+',  # Mail
                         '[^A-Za-z]',  # Punctuation
                         ]

    def clean_text(self):
        for patt in self.patterns:
            self.review = re.sub(pattern=patt,
                                 repl=' ',
                                 string=self.review)

        self.review = self.review.lower()  # Lower case
        self.review = self.review.split()  # List of each words

        # StopWords
        all_stopwords = set(stopwords.words('english'))  # Set of all Stopwords
        all_stopwords.remove('not')

        # Stemming
        ps = PorterStemmer()

        self.review = [ps.stem(word) for word in self.review if word not in all_stopwords]
        self.review = ' '.join(self.review)

    def bag_of_words(self):
        new_corpus = [self.review]

        with open('CountVectorizer.pkl', 'rb') as f:
            new_cv = pickle.load(f)

        self.review = new_cv.transform(new_corpus).toarray()

    def predict(self):
        loaded_model = load_model("ANN_Model.h5")  # Load the model
        prediction = loaded_model.predict(self.review)

        thresh = 0.5  # Threshold
        prediction = [1 if i[0] > thresh else 0 for i in prediction]

        text = self.copy_review

        if prediction[0] == 1:
            sentiment = 'Positive'
        else:
            sentiment = 'Negative'

        # Model Summary
        stringlist = []
        loaded_model.summary(print_fn=lambda x: stringlist.append(x))
        model_summary = " ".join(stringlist)

        data = {'Text': text,
                'Sentiment': sentiment,
                'Model': 'ANN',
                'Model_Summary': model_summary
                }

        json_object = json.dumps(data, indent=4)

        return sentiment, json_object

In [91]:
obj = SentimentAnalysis('It was a good experience')
obj.clean_text()
obj.bag_of_words()
obj.predict()

('Positive',

In [94]:
obj = SentimentAnalysis('It was a awful experience')
obj.clean_text()
obj.bag_of_words()
obj.predict()



('Negative',