# Data Modeling
***

## Data Preparation

In [2]:
import os
import sys
# Resolve the project root from the notebook's working directory instead of a
# machine-specific absolute path, so the notebook runs from any checkout.
# Jupyter starts the kernel in the notebook's own directory, which the relative
# '../data/...' paths used further down already rely on.
notebook_dir = os.getcwd()

# The parent directory is expected to hold the `src` package imported below.
parent_dir = os.path.dirname(notebook_dir)

# Make `src` importable; the membership check avoids duplicate entries on re-runs.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, GRU, TextVectorization
from tensorflow.keras.models import Sequential
from scikeras.wrappers import KerasClassifier
from nltk.tokenize import RegexpTokenizer
from src import code
%load_ext autoreload
%autoreload 2

In [3]:
# Load the reviews and keep only the columns used for modelling.
reviews_path = '../data/Airline_review.csv'
df = pd.read_csv(reviews_path)[['Review_Title', 'Review', 'Recommended']]

# Feature text: review title and body joined by a single space.
titles, bodies = df['Review_Title'], df['Review']
X = titles + ' ' + bodies

# Binary target: 'yes' -> 1, 'no' -> 0.
label_codes = {'yes': 1, 'no': 0}
y = df['Recommended'].map(label_codes)

# Hold out 10% as a test set, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

## Bag of Words Prep

In [4]:
# TF-IDF over unigrams and bigrams. Terms appearing in more than 95% of the
# documents or in fewer than 2 documents are dropped; no stop-word list is
# applied at this stage.
tfidf_options = {
    'decode_error': 'replace',
    'strip_accents': 'unicode',
    'stop_words': None,
    'ngram_range': (1, 2),
    'max_df': 0.95,
    'min_df': 2,
}
tf_idf = TfidfVectorizer(**tfidf_options)

# Keep the 20,000 highest-scoring features (SelectKBest's default score function).
k_best = SelectKBest(k=20000)

def to_dense(x):
    """Convert a sparse matrix to a dense numpy array.

    Parameters
    ----------
    x : scipy sparse matrix or array-like
        Feature matrix produced by the preceding pipeline steps.

    Returns
    -------
    numpy.ndarray
        Dense array with the same shape and values as ``x``.
    """
    # `toarray()` yields an ndarray directly, avoiding the intermediate
    # `np.matrix` that `todense()` creates.
    if hasattr(x, "toarray"):
        return x.toarray()
    # Input that is already dense passes through unchanged instead of raising.
    return np.asarray(x)
    
# Expose `to_dense` as a pipeline step; accept_sparse=True declares that the
# wrapped function takes sparse input.
to_dense_transformer = FunctionTransformer(func=to_dense, accept_sparse=True)

# Bag-of-words preprocessing: TF-IDF -> feature selection -> dense array.
bow_steps = [
    ('tf_idf', tf_idf),
    ('feature_selection', k_best),
    ('to_dense', to_dense_transformer),
]
bow_pipe = Pipeline(steps=bow_steps)

# Word tokenizer: runs of ASCII letters, optionally followed by a typographic
# apostrophe (’) and lowercase letters (e.g. contractions).
# NOTE(review): the pattern does not match the straight apostrophe (') — confirm
# whether code.preprocess_texts normalises apostrophes or whether this is intended.
tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")

# Clean both splits for the bag-of-words models: no stop-word list, lemmatized.
bow_clean_options = {'stop_words': None, 'lemmatize': True}
X_train_bow, X_test_bow = (
    code.preprocess_texts(texts, tokenizer, **bow_clean_options)
    for texts in (X_train, X_test)
)

In [6]:
# Cleaning text for sequence models: same tokenizer, with every other
# preprocess_texts option left at its default.
X_train_seq, X_test_seq = (
    code.preprocess_texts(texts, tokenizer)
    for texts in (X_train, X_test)
)

In [7]:
# Prep for RNN: map each cleaned review to a fixed-length sequence of token ids.
# standardize=None disables the layer's own text standardization, the vocabulary
# is capped at 20,000 tokens, and every sequence is padded/truncated to 200 ids.
vectorizer_options = {
    'standardize': None,
    'max_tokens': 20000,
    'output_mode': 'int',
    'output_sequence_length': 200,
}
text_vectorization = TextVectorization(**vectorizer_options)

# The vocabulary is learned from the training split only, then applied to both splits.
text_vectorization.adapt(X_train_seq)

X_train_seq_clean = text_vectorization(X_train_seq)
X_test_seq_clean = text_vectorization(X_test_seq)

In [8]:
# Prep for GloVe: vocabulary learned by the text vectorizer and its size.
vocabulary = text_vectorization.get_vocabulary()
vocab_size = len(vocabulary)

# Parse the pre-trained GloVe file into {token: float32 vector}. Each line is
# split on whitespace: the first field is the token, the rest are its components.
glove_embeddings = {}
with open('../data/glove.6B.300d.txt', 'r', encoding='utf-8') as glove_file:
    for entry in glove_file:
        fields = entry.split()
        glove_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

# One 300-dimensional row per vocabulary index, starting from all zeros so that
# tokens without a GloVe vector keep a zero embedding.
embedding_matrix = np.zeros((vocab_size, 300))

# Copy the GloVe vector into the row of every vocabulary token that has one.
for row, token in enumerate(vocabulary):
    if token in glove_embeddings:
        embedding_matrix[row] = glove_embeddings[token]

## Data Modeling

In [9]:
# Shared 5-fold stratified splitter (shuffled, fixed seed) used by every
# cross-validation call below.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Fresh text vectorizer, not adapted here, with the same settings as
# `text_vectorization`; it is handed to code.keras_cv.
# NOTE(review): presumably adapted per fold inside keras_cv — verify.
cv_vectorizer_options = {
    'standardize': None,
    'max_tokens': 20000,
    'output_mode': 'int',
    'output_sequence_length': 200,
}
text_vec_cv = TextVectorization(**cv_vectorizer_options)

# Stop training once val_loss has not improved by at least 0.001 for 5 epochs,
# and roll back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
    verbose=0,
)
CALLBACKS = [early_stopping]

### Baseline Models

#### Dummy Model

In [11]:
# Chance-level reference: predicts the classes uniformly at random (seeded).
dummy_model = DummyClassifier(random_state=42, strategy='uniform')

code.bag_of_words_CV(
    'Dummy',
    dummy_model,
    bow_pipe,
    X_train_bow,
    y_train,
    cv=skf,
)

Unnamed: 0,Accuracy,AUC
Dummy,0.499832,0.5


#### Baseline Model

In [12]:
# Untuned logistic regression; only the iteration cap is raised above the default.
baseline_iterations = 1000
baseline_model = LogisticRegression(max_iter=baseline_iterations)

code.bag_of_words_CV(
    'Baseline',
    baseline_model,
    bow_pipe,
    X_train_bow,
    y_train,
    cv=skf,
)

Unnamed: 0,Accuracy,AUC
Baseline,0.904762,0.961384


### Bag of Words Models

#### Logistic Regression

In [13]:
# Logistic regression with a fixed regularisation strength and the newton-cg solver.
log_reg_options = {
    'C': 9.42012179027564,
    'max_iter': 100,
    'solver': 'newton-cg',
}
log_reg = LogisticRegression(**log_reg_options)

code.bag_of_words_CV(
    'Logistic_Regression',
    log_reg,
    bow_pipe,
    X_train_bow,
    y_train,
    cv=skf,
)

Unnamed: 0,Accuracy,AUC
Logistic_Regression,0.916079,0.967629


#### MultinomialNB

In [14]:
# Multinomial naive Bayes with light smoothing (alpha=0.01); class priors are
# not learned from the data (fit_prior=False) and none are supplied.
mnb_options = {
    'fit_prior': False,
    'class_prior': None,
    'alpha': 0.01,
}
mnb_model = MultinomialNB(**mnb_options)

code.bag_of_words_CV(
    'MultinomialNB',
    mnb_model,
    bow_pipe,
    X_train_bow,
    y_train,
    cv=skf,
)

Unnamed: 0,Accuracy,AUC
MultinomialNB,0.887738,0.948371


#### Gradient Boosting Classifier

In [15]:
# Gradient boosting with row sub-sampling (subsample=0.8) and per-split feature
# sub-sampling (max_features='sqrt'). Both are random, so the seed is fixed to
# make the cross-validated scores reproducible across runs.
gbc_model = GradientBoostingClassifier(subsample=0.8,
                                       n_estimators=300,
                                       min_samples_split=2,
                                       max_features='sqrt',
                                       max_depth=6,
                                       learning_rate=0.1,
                                       random_state=42)

code.bag_of_words_CV('Gradient_Boosting_Classifier', gbc_model, bow_pipe, X_train_bow, y_train, cv=skf)

Unnamed: 0,Accuracy,AUC
Gradient_Boosting_Classifier,0.901932,0.958396


#### Random Forest Classifier

In [16]:
# Random forest of 500 fully grown trees. Bootstrap sampling and per-split
# feature sub-sampling (max_features='sqrt') are random, so the seed is fixed to
# make the cross-validated scores reproducible across runs.
rfc_model = RandomForestClassifier(n_estimators=500,
                                   min_samples_split=10,
                                   min_samples_leaf=1,
                                   max_features='sqrt',
                                   max_depth=None,
                                   bootstrap=True,
                                   random_state=42)

code.bag_of_words_CV('Random_Forest_Classifier', rfc_model, bow_pipe, X_train_bow, y_train, cv=skf)

Unnamed: 0,Accuracy,AUC
Random_Forest_Classifier,0.888553,0.949225


#### Multi-Layer Perceptron Model

In [17]:
def mlp_model_builder():
    """Build and compile the bag-of-words multi-layer perceptron.

    The network takes a 20,000-wide input, applies one 128-unit ReLU layer and
    80% dropout, and ends in a single sigmoid unit.

    Returns
    -------
    A compiled Keras ``Sequential`` model using the Adam optimizer, binary
    cross-entropy loss, and accuracy/AUC metrics.
    """
    network = Sequential()
    network.add(Dense(128, activation='relu', input_shape=(20000,)))
    network.add(Dropout(0.8))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', 'AUC'],
    )
    return network

# Wrap the Keras builder as a scikit-learn style classifier so it can go through
# the same bag-of-words cross-validation helper as the other models.
mlp_training_options = {
    'batch_size': 256,
    'epochs': 100,
    'validation_split': 0.1,
    'shuffle': True,
    'callbacks': CALLBACKS,
    'verbose': 0,
}
mlp_model = KerasClassifier(
    model=mlp_model_builder,
    random_state=42,
    **mlp_training_options,
)

code.bag_of_words_CV(
    'Multi-Layer_Perceptrons',
    mlp_model,
    bow_pipe,
    X_train_bow,
    y_train,
    cv=skf,
)

2024-04-08 13:59:37.840207: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_10' with dtype float and shape [15013,20000]
	 [[{{node Placeholder/_10}}]]
2024-04-08 13:59:37.842242: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_11' with dtype float and shape [15013,1]
	 [[{{node Placeholder/_11}}]]
2024-04-08 13:59:41.381236: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_11' with dtype float and

INFO:tensorflow:Assets written to: ram://be73bd3788c24ae18173573adf75bc17/assets


INFO:tensorflow:Assets written to: ram://be73bd3788c24ae18173573adf75bc17/assets


Unnamed: 0,Accuracy,AUC
Multi-Layer_Perceptrons,0.918908,0.968847


### Sequence Models

#### Recurrent Neural Network

In [20]:
# Bidirectional GRU classifier over a trainable 32-dimensional embedding of the
# 200-token id sequences.
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=20000, output_dim=32, input_length=200))
rnn_model.add(Bidirectional(GRU(16)))
rnn_model.add(Dense(8, activation='relu'))
rnn_model.add(Dense(1, activation='sigmoid'))

rnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'AUC'],
)

# Train on the vectorized training split. 10% of it is held out as validation
# data, which supplies the val_loss that the early-stopping callback monitors.
rnn_fit_options = {
    'batch_size': 256,
    'epochs': 100,
    'verbose': 0,
    'validation_split': 0.1,
    'callbacks': CALLBACKS,
    'shuffle': True,
}
rnn_model.fit(x=X_train_seq_clean, y=y_train, **rnn_fit_options)

# Persist the trained model to the 'rnn_model' directory.
rnn_model.save('rnn_model')

# Cross-validated evaluation of the RNN configuration via the project helper,
# starting from the raw training text.
code.keras_cv(
    'RNN',
    X_train, y_train,
    skf,
    tokenizer,
    text_vec_cv,
    CALLBACKS,
)

2024-04-08 14:10:04.849189: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-08 14:10:04.850888: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-08 14:10:04.852779: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

INFO:tensorflow:Assets written to: rnn_model/assets


INFO:tensorflow:Assets written to: rnn_model/assets
2024-04-08 14:11:36.007257: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-08 14:11:36.010109: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-08 14:11:36.012542: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you

Unnamed: 0,loss,val_loss,val_accuracy,val_auc
0,0.210897,0.273321,0.889512,0.949174


#### GloVe

In [22]:
# Bidirectional GRU classifier on frozen, pre-trained 300-dimensional GloVe
# embeddings, with dropout applied after the recurrent layer and after the
# hidden dense layer.
glv_model = Sequential()
glv_model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=300,
        input_length=200,
        weights=[embedding_matrix],
        trainable=False,
    )
)
glv_model.add(Bidirectional(GRU(32)))
glv_model.add(Dropout(0.4))
glv_model.add(Dense(16, activation='relu'))
glv_model.add(Dropout(0.4))
glv_model.add(Dense(1, activation='sigmoid'))

glv_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'AUC'],
)

# Train on the vectorized training split.
# validation_split is required here: the early-stopping callback in CALLBACKS
# monitors 'val_loss', which only exists when validation data is provided.
# Without it the callback can neither stop training nor restore the best
# weights, and the model always runs the full 100 epochs.
glv_model.fit(x=X_train_seq_clean,
              y=y_train,
              batch_size=64,
              epochs=100,
              verbose=0,
              validation_split=0.1,
              callbacks=CALLBACKS,
              shuffle=True)

# Persist the trained model to the 'glv_model' directory.
glv_model.save('glv_model')

# Cross-validated evaluation of the GloVe configuration via the project helper;
# the GloVe flag and the path to the vectors file are passed to it.
code.keras_cv(
    'GloVe',
    X_train, y_train,
    skf,
    tokenizer,
    text_vec_cv,
    CALLBACKS,
    glove=True,
    glove_path='../data/glove.6B.300d.txt',
)

2024-04-08 14:19:31.274229: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-08 14:19:31.276181: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-08 14:19:31.278361: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus





























































































































































KeyboardInterrupt: 