In [None]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

nltk.download("stopwords")
nltk.download("punkt")

SEED = 42
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peizhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/peizhi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
cwd = os.getcwd()
print(cwd)

/Users/peizhi


In [None]:
d = pd.read_csv("Downloads/wiki.csv")
d.head()

Unnamed: 0,City,Text
0,Clearwater_Beach,"Beach in Pinellas County, Florida\n Sunset on ..."
1,Chicago,"Largest city in Illinois, United States\nFor t..."
2,New_York_City,"Most populous city in the United States\n""NYC""..."
3,Providenciales,Island and the largest city of Turks and Caico...
4,Los_Angeles,"Largest city in California, United States\nFor..."


In [None]:
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

In [None]:
custom_stopwords = set(stopwords.words("english") + ["news", "new", "top","room","one","stop","minutes","nearby",
                                                     "listing","br","equipped","building","just","can","size",
                                                     "prior","welcome","ave","away","will","aside","except","month",
                                                     "home","bedroom","bathroom","two","three","apartment","unit",
                                                     "area","'ll","place","located","blocks","couple","plus",
                                                     "restaurants","shops","must","like","surrounded","daily",
                                                     "will","provide","without","within","spacebbr","bthe","nycbr",
                                                     "nyc","years","looking","information","give","makes","street",
                                                     "ny","city","quite","come","also","likebr","people","giving",
                                                     "five","behind","need","allow","space","next","world","stay",
                                                     "single","kitchen","guest","ride","min","citi","cit", "cinemat", 
                                                     "cinco", "chunk","circl","access","accessbbr","accomodation",
                                                     "accomod","air","amaz","footnote","citation","moreover",
                                                     "beside","on","also","including","if","article","then","likely",
                                                     "such","as","requires","Chinese", "youre", "present", "past", 
                                                     "Chinas", "China's", "India", "number", "Maria", "Although", 
                                                     "studies","solar","energy","hence","third","section","study",
                                                     "yesterday","comes","thank","earlier","came","doc","talk","tell",
                                                     "found","part","way","thats","therefore","energies","yet",
                                                     "morning","done","theyre","bri","suggests","lets","taking",
                                                     "believe","job", "think","know","much","said","however","second",
                                                     "thus","country", "china", "market","industry","United","States",
                                                     "chinese","countries","january","february","march","april","may",
                                                     "june","july","august","september","october","november","december",
                                                     "original","buildings","pp-","according","days","york","cities",
                                                     "among","since","known","united","states","became","average",
                                                     "name","state","public","see","now","around","built","region",
                                                     "los","retrieved","archived","population","area","needed",
                                                     "tourism","research","line","many","san","company","based","due",
                                                     "per","km","pp","-","held","john","annual","include","century",
                                                     "several","list","mayor","economy","culture","isbn","park",
                                                     "data","police","house","department","site","furthermore","ikeja",
                                                     "mwana","lic","sa","jse","productive","holbox"," xelha"," xcalak",
                                                    "xcaret","cancun","aire","noord","preferred","dehesa","iwo","vtegreater",
                                                     "ʻewa"," dania"," dpompano","pompano","designation","concurrently"," originates",
                                                    "jasper","aiea","wilton","alongside","derive","oyo","n3","referenced",
                                                     "toward","iwo","node","ulsan","aalborg"])

text_columns = ["City", "content"]

df_raw = pd.read_csv("Downloads/wiki.csv")
df = df_raw.copy()
df["content"] = df["Text"].fillna("")

for col in text_columns:
    df[col] = df[col].astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove duplicated after preprocessing
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]

# Remove empty values and keep relevant columns
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

docs = df["text"].values
tokenized_docs = df["tokens"].values

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (180, 2)
Pre-processed dataframe: (180, 2)


In [None]:
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=SEED)

In [None]:
model.wv.most_similar("travel")

[('wikinewstravel', 0.7814754247665405),
 ('guide', 0.7768462896347046),
 ('wikivoyage', 0.774678647518158),
 ('fast', 0.769598662853241),
 ('related', 0.7688405513763428),
 ('wikimedia', 0.7605570554733276),
 ('external', 0.757722020149231),
 ('links', 0.757706880569458),
 ('boracay', 0.7478778958320618),
 ('commonstravel', 0.7459885478019714)]

In [None]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(180, 100)

In [None]:
def mbkmeans_clusters(
	X, 
    k, 
    mb, 
    print_silhouette_values, 
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [None]:
clustering, cluster_labels = mbkmeans_clusters(
	X=vectorized_docs,
    k=50,
    mb=500,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

For n_clusters = 50
Silhouette coefficient: 0.12
Inertia:37.6976214532132
Silhouette values:
    Cluster 35: Size:2 | Avg:0.46 | Min:0.44 | Max: 0.48
    Cluster 44: Size:2 | Avg:0.44 | Min:0.41 | Max: 0.48
    Cluster 7: Size:8 | Avg:0.39 | Min:0.16 | Max: 0.49
    Cluster 46: Size:2 | Avg:0.34 | Min:0.32 | Max: 0.36
    Cluster 24: Size:7 | Avg:0.31 | Min:0.21 | Max: 0.42
    Cluster 5: Size:2 | Avg:0.27 | Min:0.19 | Max: 0.35
    Cluster 48: Size:2 | Avg:0.27 | Min:0.25 | Max: 0.29
    Cluster 41: Size:2 | Avg:0.26 | Min:0.24 | Max: 0.28
    Cluster 1: Size:4 | Avg:0.24 | Min:0.10 | Max: 0.33
    Cluster 20: Size:2 | Avg:0.22 | Min:0.12 | Max: 0.32
    Cluster 37: Size:4 | Avg:0.22 | Min:0.13 | Max: 0.30
    Cluster 2: Size:7 | Avg:0.22 | Min:-0.02 | Max: 0.34
    Cluster 22: Size:3 | Avg:0.20 | Min:0.11 | Max: 0.24
    Cluster 9: Size:3 | Avg:0.19 | Min:0.08 | Max: 0.26
    Cluster 36: Size:4 | Avg:0.18 | Min:0.15 | Max: 0.20
    Cluster 40: Size:4 | Avg:0.16 | Min:0.11 | Max: 0.20

In [None]:
print("Most representative terms per cluster (based on centroids):")
for i in range(50):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: contoy ōshima kifissia subprefecture emiliano 
Cluster 1: bounds flatlands greenpoint sevierville scottsdales 
Cluster 2: calgarys ngos albertas rust pusan 
Cluster 3: sø ueno brusselsinfo belgiums holmen 
Cluster 4: louisvilles northland dearborn counters cruiser 
Cluster 5: dania midwood miamis beachs miramar 
Cluster 6: catalogue flourish friedrichshain britains novgorod 
Cluster 7: hyderabads aucklands doesnt vietnams investor 
Cluster 8: antiguo cabos vila jalisco rozas 
Cluster 9: asserted mun gobierno ley flourish 
Cluster 10: cabos ōshima nicoya mule badagry 
Cluster 11: ngos merge calgarys gyle albertas 
Cluster 12: charlottes louisvilles ngos spectrumlocalnewscom gigiri 
Cluster 13: belgiums brusselsinfo johannesburgs accommodates aerodrome 
Cluster 14: buoy annex flatbush miamis flatlands 
Cluster 15: henares gobierno vanguardia bruxelloise ardoz 
Cluster 16: superseded johannesburgs vietnams belgiums cal

In [None]:
test_cluster = 29
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:3]:
    print(docs[d])
    print("-------------")

Punta_Cana | Resort town in La Altagracia Province, Dominican Republic
Place in La Altagracia Province, Dominican RepublicPunta CanaCap Cana Marina area in Punta Cana, Dominican Republic

SealPunta CanaPunta Cana in the Dominican RepublicCoordinates: 18°32′N 068°22′W﻿ / ﻿18.533°N 68.367°W﻿ / 18.533; -68.367Coordinates: 18°32′N 068°22′W﻿ / ﻿18.533°N 68.367°W﻿ / 18.533; -68.367CountryDominican RepublicProvinceLa Altagracia ProvinceMunicipalityHigüeyIncorporated (town)27 June 2006[1]Government[2][3] • Mayor of HigüeyRafael Barón Duluc (Social Democratic Institutional Bloc, 2020–2024) • Director of Verón–Punta CanaRamón Antonio Ramírez (Dominican Liberation Party, 2016–2024)Area[4] • Total475.3 km2 (183.5 sq mi)Population (2010)[4] • Total43,982 • Density93/km2 (240/sq mi)Time zoneUTC−4 (Atlantic)Area code(s)809, 829, and 849
Punta Cana is a resort town in the easternmost region of the Dominican Republic. It is part of the Veron–Punta Cana municipal district, in the Higüey municipality of 