Feature Preprocess

In [1]:
import os
import tempfile
import re
from collections import Counter
from datetime import datetime
import numpy as np
import pandas as pd
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [2]:
DATA_DIR = "/Users/kezia/Documents/UniMelb/SML/flight_delay_prediction/src/data"                     # where your CSVs live; change if needed
LISTINGS_CSV = "listings.csv"
REVIEWS_CSV  = "reviews.csv"
NEIGH_CSV    = "neighbourhoods.csv"   # optional
CALENDAR_CSV  = "calendar.csv"        # optional


In [3]:
# Tuning parameter
TFIDF_MAX_FEATURES = 20000     # reduce to e.g. 10000 or 5000 if memory constrained
SVD_COMPONENTS = 100           # 50,100,200 are common choices
TOP_AMENITIES = 30
GEO_CLUSTERS = 10
CHUNKSIZE = None               # None => read full reviews at once; or set e.g. 200000 for streaming mode


In [None]:
# Helpers Function
def get_writable_outdir(preferred=None):
    if preferred is None:
        preferred = [
            os.path.join(os.getcwd(), "processed_out"),
            os.path.expanduser("~/processed_out"),
            "/mnt/data",
            tempfile.gettempdir(),
            os.getcwd()
        ]
    for p in preferred:
        try:
            p = os.path.expanduser(p)
            if not os.path.isabs(p):
                p = os.path.abspath(p)
            os.makedirs(p, exist_ok=True)
            if os.access(p, os.W_OK):
                return p
        except Exception:
            continue
    # last resort
    fallback = tempfile.gettempdir()
    return fallback

OUT_DIR = get_writable_outdir()
PIPE_DIR = os.path.join(OUT_DIR, "pipeline_joblib")
os.makedirs(PIPE_DIR, exist_ok=True)
print("Saving outputs to:", OUT_DIR)

def parse_price(x):
    if pd.isna(x):
        return np.nan
    s = str(x).replace(",", "").strip()
    s = re.sub(r"[^\d\.-]", "", s)
    try:
        return float(s)
    except:
        return np.nan

def clean_text(s):
    if pd.isna(s):
        return ""
    s = str(s)
    s = re.sub(r"<.*?>", " ", s)          # strip HTML tags
    s = re.sub(r"http\S+", " ", s)        # strip URLs
    s = re.sub(r"\s+", " ", s)            # collapse whitespace
    return s.strip()

def parse_amenities_field(s):
    if pd.isna(s):
        return []
    s = str(s).strip()
    s = s.strip("{}[]")
    parts = [p.strip().strip('"').strip("'") for p in re.split(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', s) if p.strip()]
    return [p for p in parts if p]


Saving outputs to: /Users/kezia/Documents/UniMelb/SML/processed_out


In [None]:
# Read listings.csv (with dtype hints and parse_dates)
listings_path = os.path.join(DATA_DIR, LISTINGS_CSV)
print(listings_path)
if not os.path.exists(listings_path):
    raise FileNotFoundError(f"Could not find {listings_path} - put listings.csv in {DATA_DIR} or change DATA_DIR.")


/Users/kezia/Documents/UniMelb/SML/flight_delay_prediction/src/data/listings.csv


In [None]:
# dtype suggestions to save memory
dtype_map = {
    "id": "Int64",
    "host_id": "Int64",
    "accommodates": "Int64",
    "beds": "float32",
    "bedrooms": "float32",
    "host_listings_count": "Int64",
    "number_of_reviews": "Int64",
    # add others; use "Int64" instead of "int64" for columns that may have NA
}
# parse dates that exist
date_cols = []
for c in ["host_since", "first_review", "last_review", "calendar_last_scraped"]:
    date_cols.append(c) if c in pd.read_csv(listings_path, nrows=0).columns else None

print("Reading listings.csv ...")

listings = pd.read_csv(listings_path, dtype=dtype_map, parse_dates=[c for c in date_cols if c], low_memory=False)
print("Listings shape:", listings.shape)

# unify id column name for merges
if "id" in listings.columns:
    listings = listings.rename(columns={"id":"listing_id"})

# basic conversions
if "price" in listings.columns:
    listings["price_clean"] = listings["price"].map(parse_price)
else:
    listings["price_clean"] = np.nan

# parse amenities list into python list
if "amenities" in listings.columns:
    listings["amenity_list"] = listings["amenities"].apply(parse_amenities_field)
else:
    listings["amenity_list"] = [[] for _ in range(len(listings))]


Reading listings.csv ...
Listings shape: (25801, 79)


In [None]:
# Precompute listing-level small table used for merging into reviews
listing_small_cols = [
    "listing_id", "price_clean", "accommodates", "beds", "bedrooms", "bathrooms",
    "host_listings_count", "number_of_reviews", "reviews_per_month",
    "review_scores_rating", "latitude", "longitude", "host_is_superhost", "first_review", "last_review"
]
listing_small_cols = [c for c in listing_small_cols if c in listings.columns]
listings_small = listings[listing_small_cols].copy().reset_index(drop=True)
print("listings_small columns:", listings_small.columns.tolist())

# amenities top-K across all listings
all_amen = Counter()
for lst in listings_small.index:
    # use original listings df mapping
    amen = listings.loc[listings["listing_id"] == listings_small.loc[lst, "listing_id"], "amenity_list"]
    if len(amen):
        all_amen.update(amen.iloc[0])
top_amenities = [a for a,_ in all_amen.most_common(TOP_AMENITIES)]
print("Top amenities:", top_amenities)

# create amenity binaries per listing and merge into listings_small
for amen in top_amenities:
    listings_small[f"amenity__{amen}"] = listings["amenity_list"].apply(lambda L: 1 if amen in L else 0).values[:len(listings_small)]

# host_is_superhost to binary
if "host_is_superhost" in listings_small.columns:
    listings_small["host_is_superhost_bin"] = listings_small["host_is_superhost"].map({"t":1,"f":0}).fillna(0).astype(int)
else:
    listings_small["host_is_superhost_bin"] = 0


listings_small columns: ['listing_id', 'price_clean', 'accommodates', 'beds', 'bedrooms', 'bathrooms', 'host_listings_count', 'number_of_reviews', 'reviews_per_month', 'review_scores_rating', 'latitude', 'longitude', 'host_is_superhost', 'first_review', 'last_review']
Top amenities: ['Kitchen', 'Smoke alarm', 'Wifi', 'Hangers', 'Hot water', 'Essentials', 'Hair dryer', 'Iron', 'Dishes and silverware', 'TV', 'Washer', 'Shampoo', 'Cooking basics', 'Microwave', 'Bed linens', 'Air conditioning', 'Refrigerator', 'Heating', 'Hot water kettle', 'Dishwasher', 'Toaster', 'Free parking on premises', 'Self check-in', 'Long term stays allowed', 'Dining table', 'Shower gel', 'Dedicated workspace', 'Oven', 'Wine glasses', 'Cleaning products']


In [None]:
# Read reviews.csv
reviews_path = os.path.join(DATA_DIR, REVIEWS_CSV)
if not os.path.exists(reviews_path):
    raise FileNotFoundError(f"Could not find {reviews_path} - put reviews.csv in {DATA_DIR} or change DATA_DIR.")

print("Reading reviews.csv ... (CHUNKSIZE=%s)" % str(CHUNKSIZE))
if CHUNKSIZE is None:
    # read whole file (fastest if it fits in memory)
    reviews = pd.read_csv(reviews_path, parse_dates=["date"], low_memory=False)
    print("Reviews shape:", reviews.shape)
else:
    # chunked read example (if file is too big)
    print("Using chunked read. Building reviews dataframe from chunks ...")
    chunks = []
    for chunk in pd.read_csv(reviews_path, parse_dates=["date"], chunksize=CHUNKSIZE):
        chunks.append(chunk)
    reviews = pd.concat(chunks, ignore_index=True)
    print("Reviews shape (after concat):", reviews.shape)

# merge listing small table into reviews on listing_id
reviews = reviews.merge(listings_small, left_on="listing_id", right_on="listing_id", how="left", suffixes=("","_lst"))
print("Merged reviews+listings shape:", reviews.shape)

# drop reviews with missing target (listing-level review_scores_rating)
if "review_scores_rating" not in reviews.columns:
    raise RuntimeError("review_scores_rating not found in listings_small. Ensure listing file contains ratings.")
reviews = reviews[~reviews["review_scores_rating"].isna()].reset_index(drop=True)
print("After dropping missing target, rows:", len(reviews))


Reading reviews.csv ... (CHUNKSIZE=None)
Reviews shape: (940190, 6)
Merged reviews+listings shape: (940190, 51)
After dropping missing target, rows: 940190


In [None]:
# Data & labeling: target = listing-level numeric rating (regression)
y = reviews["review_scores_rating"].astype(float).values


In [None]:
# Text cleaning + token stats
print("Cleaning text and computing text stats ...")
reviews["comments_clean"] = reviews["comments"].apply(clean_text)

def text_stats_series(s):
    words = s.split()
    wc = len(words)
    char_count = len(s)
    avg_word_len = (sum(len(w) for w in words) / wc) if wc>0 else 0.0
    exclam = s.count("!")
    question = s.count("?")
    uppercase_words = sum(1 for w in words if any(c.isupper() for c in w))
    uppercase_ratio = uppercase_words / wc if wc>0 else 0.0
    stopwords = sum(1 for w in words if w.lower() in ENGLISH_STOP_WORDS)
    stopword_ratio = stopwords / wc if wc>0 else 0.0
    return pd.Series({
        "word_count": wc,
        "char_count": char_count,
        "avg_word_len": avg_word_len,
        "exclam_count": exclam,
        "question_count": question,
        "uppercase_ratio": uppercase_ratio,
        "stopword_ratio": stopword_ratio
    })

text_stats_df = reviews["comments_clean"].apply(text_stats_series)
reviews = pd.concat([reviews.reset_index(drop=True), text_stats_df.reset_index(drop=True)], axis=1)


Cleaning text and computing text stats ...


In [None]:
# Merge amenity binaries from listings_small
amen_cols = [f"amenity__{a}" for a in top_amenities if f"amenity__{a}" in listings_small.columns]
if amen_cols:
    reviews = reviews.merge(listings_small[["listing_id"] + amen_cols], on="listing_id", how="left", suffixes=("","_lst"))
    # fill NaN amen columns with 0
    for c in amen_cols:
        reviews[c] = reviews[c].fillna(0).astype(int)


In [None]:
# Numeric features, price parsing and price_per_person
print("Constructing numeric features ...")
if "price_clean" not in reviews.columns and "price_clean" in listings_small.columns:
    reviews = reviews.merge(listings_small[["listing_id","price_clean"]], on="listing_id", how="left")

numeric_cols = []
for c in ["price_clean", "accommodates", "beds", "bedrooms", "bathrooms", "host_listings_count", "number_of_reviews", "reviews_per_month"]:
    if c in reviews.columns:
        reviews[f"{c}_num"] = pd.to_numeric(reviews[c], errors="coerce")
        numeric_cols.append(f"{c}_num")

if "price_clean" in reviews.columns and "accommodates" in reviews.columns:
    reviews["price_per_person"] = reviews["price_clean"] / reviews["accommodates"].replace(0, pd.NA)
    numeric_cols.append("price_per_person")

# host superhost binary
if "host_is_superhost" in reviews.columns:
    reviews["host_is_superhost_bin"] = reviews["host_is_superhost"].map({"t":1,"f":0}).fillna(0).astype(int)
    numeric_cols.append("host_is_superhost_bin")
else:
    reviews["host_is_superhost_bin"] = 0
    numeric_cols.append("host_is_superhost_bin")

# temporal features from review date
reviews["review_date"] = pd.to_datetime(reviews["date"], errors="coerce")
max_date = reviews["review_date"].max()
reviews["review_age_days"] = (max_date - reviews["review_date"]).dt.days
reviews["review_month"] = reviews["review_date"].dt.month
reviews["review_weekday"] = reviews["review_date"].dt.weekday
reviews["review_year"] = reviews["review_date"].dt.year
numeric_cols += ["review_age_days", "review_month", "review_weekday", "review_year"]


Constructing numeric features ...


In [None]:
# Geo clustering
if "latitude" in listings.columns and "longitude" in listings.columns and listings[["latitude","longitude"]].dropna().shape[0] >= GEO_CLUSTERS:
    coords = listings[["latitude","longitude"]].dropna()
    km = KMeans(n_clusters=GEO_CLUSTERS, random_state=42)
    km.fit(coords)
    # map listing_id -> cluster
    listings["geo_cluster"] = -1
    listings.loc[coords.index, "geo_cluster"] = km.labels_
    # merge into reviews
    reviews = reviews.merge(listings[["listing_id","geo_cluster"]], on="listing_id", how="left")
    # one-hot encode
    geo_dummies = pd.get_dummies(reviews["geo_cluster"].fillna(-1).astype(int), prefix="geo_cluster")
    reviews = pd.concat([reviews.reset_index(drop=True), geo_dummies.reset_index(drop=True)], axis=1)
    geo_cols = [c for c in geo_dummies.columns]
else:
    geo_cols = []


In [None]:
# TF-IDF + TruncatedSVD
print("Building TF-IDF (max_features=%d) ..." % TFIDF_MAX_FEATURES)
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=TFIDF_MAX_FEATURES, stop_words="english", min_df=2)
# fit on all comments
tfidf_matrix = tfidf.fit_transform(reviews["comments_clean"].fillna("").values)
print("TF-IDF shape:", tfidf_matrix.shape)

svd_cols = []
tfidf_summary_cols = []
if tfidf_matrix.shape[1] >= 2 and SVD_COMPONENTS and SVD_COMPONENTS > 0:
    n_components = min(SVD_COMPONENTS, tfidf_matrix.shape[1]-1)
    print("Applying TruncatedSVD with n_components=", n_components)
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    svd_feats = svd.fit_transform(tfidf_matrix)
    svd_cols = [f"svd_text_{i}" for i in range(svd_feats.shape[1])]
    svd_df = pd.DataFrame(svd_feats, columns=svd_cols, index=reviews.index)
    reviews = pd.concat([reviews.reset_index(drop=True), svd_df.reset_index(drop=True)], axis=1)
else:
    # fallback: aggregated tfidf row stats
    print("TF-IDF vocabulary small; creating tfidf summary stats instead of SVD.")
    row_sums = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
    row_means = np.asarray(tfidf_matrix.mean(axis=1)).ravel()
    try:
        row_max = np.asarray(tfidf_matrix.max(axis=1).toarray()).ravel()
    except Exception:
        row_max = np.zeros_like(row_sums)
    row_nnz = np.diff(tfidf_matrix.tocsr().indptr)
    reviews["tfidf_sum"] = row_sums
    reviews["tfidf_mean"] = row_means
    reviews["tfidf_max"] = row_max
    reviews["tfidf_nnz"] = row_nnz
    tfidf_summary_cols = ["tfidf_sum","tfidf_mean","tfidf_max","tfidf_nnz"]
    svd = None


Building TF-IDF (max_features=20000) ...
TF-IDF shape: (940190, 20000)
Applying TruncatedSVD with n_components= 100


In [None]:
# Assemble final feature list
text_stat_cols = ["word_count","char_count","avg_word_len","exclam_count","question_count","uppercase_ratio","stopword_ratio"]
amen_cols = [f"amenity__{a}" for a in top_amenities if f"amenity__{a}" in reviews.columns]
num_cols = [c for c in numeric_cols if c in reviews.columns]
svd_cols = [c for c in reviews.columns if c.startswith("svd_text_")]
tfidf_summary_cols = [c for c in tfidf_summary_cols if c in reviews.columns]

feature_cols = []
for group in (text_stat_cols, amen_cols, geo_cols, num_cols, svd_cols, tfidf_summary_cols):
    feature_cols += [c for c in group if c in reviews.columns]

# drop duplicates if any
feature_cols = list(dict.fromkeys(feature_cols))
print("Number of engineered features:", len(feature_cols))

X = reviews[feature_cols].copy()
# coerce all to numeric, create missingness flags 
X = X.apply(pd.to_numeric, errors="coerce")


Number of engineered features: 161


In [None]:
# Impute & scale
print("Imputing & scaling features ...")
all_nan_cols = X.columns[X.isna().all()].tolist()
if all_nan_cols:
    print("Columns with all NaN (will be re-added as zeros):", all_nan_cols)
X_drop = X.drop(columns=all_nan_cols)
imputer = SimpleImputer(strategy="median")
X_imputed = pd.DataFrame(imputer.fit_transform(X_drop), columns=X_drop.columns, index=X_drop.index)

scaler = StandardScaler()
X_scaled_arr = scaler.fit_transform(X_imputed)
X_scaled = pd.DataFrame(X_scaled_arr, columns=X_imputed.columns, index=X_imputed.index)

# re-add all-NaN cols as zeros
for c in all_nan_cols:
    X_scaled[c] = 0.0

# reorder to original
X_scaled = X_scaled[feature_cols]


Imputing & scaling features ...


In [None]:
# Save outputs & pipeline objects
np.savez_compressed(os.path.join(OUT_DIR, "processed_features.npz"), X=X_scaled.values, y=y, feature_cols=np.array(X_scaled.columns))
joblib.dump(tfidf, os.path.join(PIPE_DIR, "tfidf.joblib"))
if 'svd' in locals() and svd is not None:
    joblib.dump(svd, os.path.join(PIPE_DIR, "svd.joblib"))
joblib.dump(imputer, os.path.join(PIPE_DIR, "imputer.joblib"))
joblib.dump(scaler, os.path.join(PIPE_DIR, "scaler.joblib"))
joblib.dump(feature_cols, os.path.join(PIPE_DIR, "feature_cols.joblib"))

print("Saved processed_features.npz and pipeline artifacts to:", OUT_DIR)
print("Final shapes: X:", X_scaled.shape, " y:", y.shape)


Saved processed_features.npz and pipeline artifacts to: /Users/kezia/Documents/UniMelb/SML/processed_out
Final shapes: X: (940190, 161)  y: (940190,)


In [None]:
# Check the output data
# SAhape checks
print("listings shape:", listings.shape)
print("reviews shape:", reviews.shape)
print("merged reviews rows:", len(reviews))
print("Feature matrix shape:", X_scaled.shape)   # X_scaled produced by preprocessing
print("Target shape:", y.shape)

# Feature names & preview
print("\nNumber of engineered features:", len(X_scaled.columns))
print("First 30 feature names:", list(X_scaled.columns)[:30])
display(X_scaled.head(10))   # Jupyter-friendly

# Target distribution and stats
import numpy as np
print("\nTarget (review_scores_rating) stats:")
print("min, median, mean, max:", np.min(y), np.median(y), np.mean(y), np.max(y))
try:
    import pandas as pd
    print(pd.Series(y).describe())
except:
    pass

# Sample raw and cleaned text
print("\nSample raw comments and cleaned text:")
for i,row in reviews[["comments","comments_clean"]].dropna().head(8).iterrows():
    print("---")
    print("RAW:", row["comments"][:250])
    print("CLEAN:", row["comments_clean"][:250])

# TF-IDF / SVD quick summary
if 'tfidf' in globals():
    try:
        feat_names = tfidf.get_feature_names_out()
        print("\nTF-IDF vocab size:", len(feat_names))
        # top 15 terms by idf (rarest terms)
        idf = np.array(tfidf.idf_)
        top_idx = idf.argsort()[::-1][:15]
        top_terms = [(feat_names[i], idf[i]) for i in top_idx]
        print("Top 15 TF-IDF terms by IDF (rarest):", top_terms)
    except Exception as e:
        print("TF-IDF inspection failed:", e)

if 'svd' in globals() and svd is not None:
    try:
        print("SVD components shape:", svd.components_.shape)
        # explained variance if available
        if hasattr(svd, "explained_variance_ratio_"):
            print("SVD explained variance sum (approx):", svd.explained_variance_ratio_.sum())
    except Exception as e:
        print("SVD inspection failed:", e)

# Summary stats of engineered features
print("\nFeature statistics (first 8 cols):")
print(X_scaled.iloc[:, :8].describe().T)


listings shape: (25801, 82)
reviews shape: (940190, 214)
merged reviews rows: 940190
Feature matrix shape: (940190, 161)
Target shape: (940190,)

Number of engineered features: 161
First 30 feature names: ['word_count', 'char_count', 'avg_word_len', 'exclam_count', 'question_count', 'uppercase_ratio', 'stopword_ratio', 'amenity__Kitchen', 'amenity__Smoke alarm', 'amenity__Wifi', 'amenity__Hangers', 'amenity__Hot water', 'amenity__Essentials', 'amenity__Hair dryer', 'amenity__Iron', 'amenity__Dishes and silverware', 'amenity__TV', 'amenity__Washer', 'amenity__Shampoo', 'amenity__Cooking basics', 'amenity__Microwave', 'amenity__Bed linens', 'amenity__Air conditioning', 'amenity__Refrigerator', 'amenity__Heating', 'amenity__Hot water kettle', 'amenity__Dishwasher', 'amenity__Toaster', 'amenity__Free parking on premises', 'amenity__Self check-in']


Unnamed: 0,word_count,char_count,avg_word_len,exclam_count,question_count,uppercase_ratio,stopword_ratio,amenity__Kitchen,amenity__Smoke alarm,amenity__Wifi,...,svd_text_90,svd_text_91,svd_text_92,svd_text_93,svd_text_94,svd_text_95,svd_text_96,svd_text_97,svd_text_98,svd_text_99
0,0.36012,0.321937,-0.14336,0.505254,-0.046228,-0.069668,0.999518,0.277494,0.147235,-2.517791,...,-0.262361,0.389755,-0.173751,0.431762,1.120842,0.379482,1.670625,-0.218726,-0.014563,-0.624396
1,1.346454,1.066159,-0.215714,0.505254,-0.046228,-0.750644,0.400833,0.277494,0.147235,-2.517791,...,0.128746,0.159128,-0.191099,-0.141062,1.977687,-1.765708,1.746953,1.19352,1.512762,1.039548
2,0.186061,0.148984,-0.142385,0.505254,-0.046228,-0.470839,0.896388,0.277494,0.147235,-2.517791,...,-0.056136,-0.121517,0.278067,-0.07029,-0.119836,-0.157187,-0.098225,-0.181877,0.189352,-0.125347
3,-0.684235,-0.642407,0.037788,0.505254,-0.046228,-0.522782,-0.177683,0.277494,0.147235,-2.517791,...,-1.292225,-0.045886,1.724226,-0.39715,-0.774681,1.418677,-1.731105,-1.534692,-0.055488,-0.035129
4,0.418139,0.400552,-0.133411,3.614245,-0.046228,-0.117137,0.60581,0.277494,0.147235,-2.517791,...,-0.685737,-0.608923,-0.191403,0.125161,-0.597587,1.304392,-1.256887,0.775842,-0.572558,-0.341312
5,0.766258,0.793627,-0.118205,-0.531076,-0.046228,-0.592606,0.516467,0.277494,0.147235,-2.517791,...,-0.55149,-0.148464,-0.001332,-0.295961,0.669915,-0.945143,-0.851717,0.518326,-0.748017,-0.46654
6,0.186061,0.196153,-0.114901,0.505254,-0.046228,-0.08993,0.441973,0.277494,0.147235,-2.517791,...,0.409728,-1.343645,-0.426292,-0.41293,-0.247368,0.020862,-1.306192,-0.232358,0.507033,-0.377759
7,0.998336,0.940375,-0.150927,1.541585,-0.046228,0.012198,0.354164,0.277494,0.147235,-2.517791,...,1.256621,-0.915412,-0.476128,-0.355721,-0.517972,-0.892206,-0.465257,1.262762,0.213512,-0.070285
8,0.36012,0.311455,-0.148688,1.541585,-0.046228,-0.568093,-0.057561,0.277494,0.147235,-2.517791,...,-0.739884,-0.042524,-0.42296,-0.58537,0.304679,0.370399,-0.16223,0.796518,0.669663,0.025875
9,-0.336116,-0.422285,-0.190848,0.505254,-0.046228,-0.214139,-0.005856,0.277494,0.147235,-2.517791,...,0.10153,-1.006217,-1.483212,-0.681329,-0.440524,-0.967513,0.273685,1.682137,0.416893,-0.018243



Target (review_scores_rating) stats:
min, median, mean, max: 1.0 4.85 4.806347887129196 5.0
count    940190.000000
mean          4.806348
std           0.179606
min           1.000000
25%           4.740000
50%           4.850000
75%           4.930000
max           5.000000
dtype: float64

Sample raw comments and cleaned text:
---
RAW: It was very convenient  to stay in Lindsay's appartment in Brunswick for a couple of weeks, not only is Lindsay a great host giving you valuable tips the appartment also is in a very interesting part of Melbourne and the CBD is not far away. Highly r
CLEAN: It was very convenient to stay in Lindsay's appartment in Brunswick for a couple of weeks, not only is Lindsay a great host giving you valuable tips the appartment also is in a very interesting part of Melbourne and the CBD is not far away. Highly re
---
RAW: Perfect isnt enough! Lindsay was the best host we could ever imagine help us a lot to handle our new start in australia. His place have a perf