## Pre-Experiment 5: Implementation of MSDS 458 Assignment 3 Code (RNN, LSTM, and 1D-CNN Models) for Text Classifiction Applied to the Golf Course Reviews Dataset

**i. Import the necessary libraries for the experiment**

In [1]:
# Import necesary libraries
import datetime
from packaging import version
from collections import Counter
import numpy as np
import pandas as pd
import time
import os
import re
import string

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import nltk
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
# %pip install tensorflow_datasets
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as k

In [2]:
# Set plots to appear inline
%matplotlib inline
# Set the default precision for numpy
np.set_printoptions(precision=3, suppress=True)

# Enable display of multiple outputs per Jupyter Notebook cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**ii. Load in the golf course reviews dataset**

In [16]:
# Import the dataset
file_path = "top_and_non_golf_course_reviews.csv"
df = pd.read_csv(file_path)

**1. Examine the Dataset**

In [17]:
# Examine the dataset
df.head()

Unnamed: 0,review_id,course_name,label,location,architect,year_built,review_title,review_author,file_name,review_text
0,1,Pine Valley,top100,"Pine Valley, NJ",George Crump / Harry S. Colt,1918,PINE VALLEY GOLF CLUB - 19 POINTS,David Jones,rev01_pine_valley_1,There’s not much point trying to do a hole-by-...
1,2,Pine Valley,top100,"Pine Valley, NJ",George Crump / Harry S. Colt,1918,"Pine Valley Golf Club (Clementon, New Jersey)",Bill Satterfield,rev02_pine_valley_2,What to Expect: Pine Valley is the finest gol...
2,3,Cypress Point,top100,"Pebble Beach, CA",Alister MacKenzie,1928,CYPRESS POINT REVIEW,Graylyn Loomis,rev03_cypress_point_1,“No one but a poet should be allowed to write ...
3,4,Cypress Point,top100,"Pebble Beach, CA",Alister MacKenzie,1928,"Cypress Point Golf Club (Pebble Beach, Califor...",Bill Satterfield,rev04_cypress_point_2,What to Expect: I don't even feel worthy to w...
4,5,Shinnecock Hills,top100,"Southampton, NY",William Flynn,1931,Review: Shinnecock Hills Golf Club,Andrew Harvie,rev05_shinnecock_1,"There’s not many courses as acclaimed, sought ..."


In [18]:
# Create a new label column that indicates whether the review is a top100 course or not
df["top100"] = df["label"].apply(lambda x: 1 if x == "top100" else 0)

# Examine the dataset
df.head()

Unnamed: 0,review_id,course_name,label,location,architect,year_built,review_title,review_author,file_name,review_text,top100
0,1,Pine Valley,top100,"Pine Valley, NJ",George Crump / Harry S. Colt,1918,PINE VALLEY GOLF CLUB - 19 POINTS,David Jones,rev01_pine_valley_1,There’s not much point trying to do a hole-by-...,1
1,2,Pine Valley,top100,"Pine Valley, NJ",George Crump / Harry S. Colt,1918,"Pine Valley Golf Club (Clementon, New Jersey)",Bill Satterfield,rev02_pine_valley_2,What to Expect: Pine Valley is the finest gol...,1
2,3,Cypress Point,top100,"Pebble Beach, CA",Alister MacKenzie,1928,CYPRESS POINT REVIEW,Graylyn Loomis,rev03_cypress_point_1,“No one but a poet should be allowed to write ...,1
3,4,Cypress Point,top100,"Pebble Beach, CA",Alister MacKenzie,1928,"Cypress Point Golf Club (Pebble Beach, Califor...",Bill Satterfield,rev04_cypress_point_2,What to Expect: I don't even feel worthy to w...,1
4,5,Shinnecock Hills,top100,"Southampton, NY",William Flynn,1931,Review: Shinnecock Hills Golf Club,Andrew Harvie,rev05_shinnecock_1,"There’s not many courses as acclaimed, sought ...",1


In [19]:

import tensorflow as tf
import tensorflow_text as tf_text

# Define stopwords
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words("english"))

def custom_stopwords(input_text):
    # Convert to lowercase
    lower_text = tf_text.case_fold_utf8(input_text)
    
    # Tokenize the text
    words = tf_text.words_and_offsets(lower_text)
    
    # Remove punctuation and filter stop words
    words = tf.strings.regex_replace(words.words, r'[^\w\s]', '')
    mask = tf.reduce_all(tf.not_equal(tf.expand_dims(words, -1), tf.constant(list(STOPWORDS))), axis=-1)
    filtered_words = tf.boolean_mask(words, mask)
    
    # Join the words back into a string
    return tf.strings.reduce_join(filtered_words, separator=' ', axis=-1)

# Apply the custom_stopwords function to the DataFrame
df['cleaned_review_text'] = df['review_text'].apply(lambda x: custom_stopwords(tf.constant(x)).numpy().decode())

# Create and adapt TextVectorization layer
max_tokens = None
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    standardize=custom_stopwords
)

# Convert the pandas Series to a TensorFlow dataset
text_dataset = tf.data.Dataset.from_tensor_slices(df['cleaned_review_text'].values)

# Adapt the TextVectorization layer
text_vectorization.adapt(text_dataset)

ModuleNotFoundError: No module named 'tensorflow_text'

**2. Create training, validation, and test datasets**

In [12]:
# Split the data into 70% training, 15% validation, and 15% testing
from sklearn.model_selection import train_test_split

# First split: 70% training, the remaining 30% for validation and testing
train_df, remaining = train_test_split(df, test_size=0.3, stratify=df['top100'], random_state=42)

# Second split: 50% of the remaining data for validation, the other 50% for testing
val_df, test_df = train_test_split(remaining, test_size=0.5, stratify=remaining['top100'], random_state=42)

**3. Preprocess the textual data**

In [24]:
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words("english"))

# Define a function to clean the text and remove stopwords
# def custom_stopwords(input_text):
#     lowercase = input_text.lower()
#     stripped_punct = tf.strings.regex_replace(lowercase
#                                   ,'[%s]' % re.escape(string.punctuation)
#                                   ,'')
#     return tf.strings.regex_replace(stripped_punct, r'\b(' + r'|'.join(STOPWORDS) + r')\b\s*',"")

def custom_stopwords(input_text):
    lowercase = input_text.lower()
    stripped_punct = re.sub(f'[{re.escape(string.punctuation)}]', '', lowercase)
    return ' '.join([word for word in stripped_punct.split() if word not in STOPWORDS])


# Apply the preprocessing function to the 'review_text' column of the df DataFrame
df['cleaned_review_text'] = df['review_text'].apply(custom_stopwords)

# Display the first few rows from the DataFrame
df[['review_text', 'cleaned_review_text']].head()


True

Unnamed: 0,review_text,cleaned_review_text
0,There’s not much point trying to do a hole-by-...,there’s much point trying holebyhole guide pin...
1,What to Expect: Pine Valley is the finest gol...,expect pine valley finest golf course planet h...
2,“No one but a poet should be allowed to write ...,“no one poet allowed write beauties cypress po...
3,What to Expect: I don't even feel worthy to w...,expect dont even feel worthy write review cypr...
4,"There’s not many courses as acclaimed, sought ...",there’s many courses acclaimed sought document...


In [32]:
# Convert the input text to a string before applying the lower() method
def custom_stopwords(input_text):
    lowercase = tf.strings.as_string(input_text).lower()
    stripped_punct = re.sub(f'[{re.escape(string.punctuation)}]', '', lowercase)
    return ' '.join([word for word in stripped_punct.split() if word not in STOPWORDS])

# Convert the input text to a string before applying the lower() method
df['review_text'] = df['review_text'].apply(lambda x: str(x))

# Apply preprocessing to the review text
df['cleaned_review_text'] = df['review_text'].apply(custom_stopwords)

# Create and adapt TextVectorization layer
max_tokens = None
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    standardize=custom_stopwords)

# Adapt the TextVectorization layer with the updated custom_stopwords function
text_vectorization.adapt(df['cleaned_review_text'])

AttributeError: 'tensorflow.python.framework.ops.EagerTensor' object has no attribute 'lower'