# Importing

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.probability import FreqDist

import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

print(f'Libraries imported')

Libraries imported


In [3]:
nltk.download('all', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

True

In [4]:
stop_words = set(stopwords.words('english'))
stop_word_additions = ['gym', 'club']
stop_words.update(stop_word_additions)

lemmatizer = WordNetLemmatizer()
tokenizer = word_tokenize

In [5]:
def extract(file_path: str) -> pd.DataFrame:
    df = pd.read_csv(file_path)
    print(f"Loaded {file_path}: {df.shape[0]} rows, {df.shape[1]} columns")
    return df

In [6]:
google_rename_mappings = {
            "Social Media Source": "source",
            "Club's Name": "location",
            "Creation Date": "date_created",
            "Comment": "review",
            "Overall Score": "score",
        }


trustpilot_rename_mappings = {
            "Source Of Review": "source",
            "Location Name": "location",
            "Review Created (UTC)": "date_created",
            "Review Content": "review",
            "Review Stars": "score",
        }

def rename_and_select(df, mappings):
    df = df.rename(columns=mappings)
    cols = list(mappings.values())
    return df[cols].copy()

google_path = '../data/raw/google_reviews.csv'
google_df = extract(google_path)
google_df = rename_and_select(google_df, google_rename_mappings)
google_df.head()

Loaded ../data/raw/google_reviews.csv: 1000 rows, 7 columns


Unnamed: 0,source,location,date_created,review,score
0,Google,Bellview Gym,2023-06-23,Can get crowded during peak hours. Staff are v...,5
1,Google,Adamsfurt Gym,2023-11-05,Great equipment and clean facilities. Classes ...,5
2,Google,West Becky Gym,2022-12-01,Can get crowded during peak hours. Staff are v...,3
3,Google,Susanview Gym,2024-03-26,Great equipment and clean facilities. Locker r...,2
4,Google,Lake Jerry Gym,2023-08-06,Parking is limited but manageable. Great equip...,1


In [7]:
schema = {
        "source": "string",
        "location": "string",
        "date_created": "datetime",
        "review": "string",
        "score": "int64",
    }


def cast_types(df, schema):
    for col, dtype in schema.items():
        if dtype == "datetime":
            df[col] = pd.to_datetime(df[col], errors="coerce")
        else:
            df[col] = df[col].astype(dtype, errors="ignore")
    return df

google_df = cast_types(google_df, schema)
print(google_df.dtypes)
google_df.head()

source                  string
location                string
date_created    datetime64[us]
review                  string
score                    int64
dtype: object


Unnamed: 0,source,location,date_created,review,score
0,Google,Bellview Gym,2023-06-23,Can get crowded during peak hours. Staff are v...,5
1,Google,Adamsfurt Gym,2023-11-05,Great equipment and clean facilities. Classes ...,5
2,Google,West Becky Gym,2022-12-01,Can get crowded during peak hours. Staff are v...,3
3,Google,Susanview Gym,2024-03-26,Great equipment and clean facilities. Locker r...,2
4,Google,Lake Jerry Gym,2023-08-06,Parking is limited but manageable. Great equip...,1


In [9]:
max_score = 3

def filter_rows(df, max_score):
    return df[df["score"] <= max_score].copy()

google_df = filter_rows(google_df, max_score)
print(f'Max: {google_df["score"].max()}, Min: {google_df["score"].min()}')
google_df.head()

Max: 3, Min: 1


Unnamed: 0,source,location,date_created,review,score
2,Google,West Becky Gym,2022-12-01,Can get crowded during peak hours. Staff are v...,3
3,Google,Susanview Gym,2024-03-26,Great equipment and clean facilities. Locker r...,2
4,Google,Lake Jerry Gym,2023-08-06,Parking is limited but manageable. Great equip...,1
7,Google,Powellview Gym,2024-01-24,Great equipment and clean facilities. Atmosphe...,2
8,Google,West Jorgemouth Gym,2023-06-23,Staff are very friendly and helpful. Good vari...,2


- All helper functions working

In [10]:
def transform(df, mappings, max_score):
    # 1. Rename + keep expected columns
    df = rename_and_select(df, mappings)

    # 2. Remove rows without reviews
    df = df.dropna(subset=["review"])

    # 3. Type casting
    df = cast_types(df, schema)

    # 4. Drop duplicates
    df = df.drop_duplicates()

    # 5. Rating filtering
    df = filter_rows(df, max_score)
    df = df[df['review'].notna()].copy()
    return df

In [11]:
google_df = extract(google_path)
transformed_google_df = transform(google_df, google_rename_mappings, max_score)
transformed_google_df.head()

Loaded ../data/raw/google_reviews.csv: 1000 rows, 7 columns


Unnamed: 0,source,location,date_created,review,score
2,Google,West Becky Gym,2022-12-01,Can get crowded during peak hours. Staff are v...,3
3,Google,Susanview Gym,2024-03-26,Great equipment and clean facilities. Locker r...,2
4,Google,Lake Jerry Gym,2023-08-06,Parking is limited but manageable. Great equip...,1
7,Google,Powellview Gym,2024-01-24,Great equipment and clean facilities. Atmosphe...,2
8,Google,West Jorgemouth Gym,2023-06-23,Staff are very friendly and helpful. Good vari...,2


In [12]:
trustpilot_path = '../data/raw/trustpilot_reviews.csv'
trustpilot_df = extract(trustpilot_path)
transformed_trustpilot_df = transform(trustpilot_df, trustpilot_rename_mappings, max_score)
transformed_trustpilot_df.head()

Loaded ../data/raw/trustpilot_reviews.csv: 1000 rows, 15 columns


Unnamed: 0,source,location,date_created,review,score
0,Trustpilot,New Daniel Gym,2023-07-17,Atmosphere is motivating and energetic. Easy t...,1
4,Trustpilot,Morsestad Gym,2023-06-21,Classes are fun and motivating. Can get crowde...,1
5,Trustpilot,Lake Adammouth Gym,2023-09-18,Great equipment and clean facilities. Good var...,1
6,Trustpilot,East Angela Gym,2024-03-16,Good variety of machines and weights. Classes ...,1
7,Trustpilot,Madelineville Gym,2023-07-12,Great equipment and clean facilities. Classes ...,2


In [13]:
def combine_datasets(dfs):
    combined = pd.concat(dfs, ignore_index=True)
    return combined

final_df = combine_datasets([transformed_google_df, transformed_google_df])
final_df.head()

Unnamed: 0,source,location,date_created,review,score
0,Google,West Becky Gym,2022-12-01,Can get crowded during peak hours. Staff are v...,3
1,Google,Susanview Gym,2024-03-26,Great equipment and clean facilities. Locker r...,2
2,Google,Lake Jerry Gym,2023-08-06,Parking is limited but manageable. Great equip...,1
3,Google,Powellview Gym,2024-01-24,Great equipment and clean facilities. Atmosphe...,2
4,Google,West Jorgemouth Gym,2023-06-23,Staff are very friendly and helpful. Good vari...,2


In [14]:
final_df['source'].unique()

<StringArray>
['Google']
Length: 1, dtype: string

In [15]:
final_df['review'].isna().sum()

np.int64(0)

# Feature engineering

In [16]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [17]:
def tokenize_text(text):
    tokens = tokenizer(text)
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

In [18]:
def lemmatize_tokens(tokens):
    # convert POS tag to WordNet format
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

    return lemmas

In [19]:
def text_processing(df, column_name):
    df['clean_review'] = df[column_name].apply(clean_text)
    df['clean_review'] = df['clean_review'].apply(tokenize_text)
    df['clean_review'] = df['clean_review'].apply(lemmatize_tokens)
    return df

In [20]:
final_df_clean = text_processing(final_df, 'review')
final_df_clean.head()

Unnamed: 0,source,location,date_created,review,score,clean_review
0,Google,West Becky Gym,2022-12-01,Can get crowded during peak hours. Staff are v...,3,"[get, crowd, peak, hour, staff, friendly, help..."
1,Google,Susanview Gym,2024-03-26,Great equipment and clean facilities. Locker r...,2,"[great, equipment, clean, facility, locker, ro..."
2,Google,Lake Jerry Gym,2023-08-06,Parking is limited but manageable. Great equip...,1,"[parking, limited, manageable, great, equipmen..."
3,Google,Powellview Gym,2024-01-24,Great equipment and clean facilities. Atmosphe...,2,"[great, equipment, clean, facility, atmosphere..."
4,Google,West Jorgemouth Gym,2023-06-23,Staff are very friendly and helpful. Good vari...,2,"[staff, friendly, helpful, good, variety, mach..."
