# Text Clustering using pre-defiend Cluster Words

This script uses Kmeans clustering with predefined clusters to categorize "Instrument", "Goal", and "Affected Group" of environmental protection policies, based on word appearance in provided policy abstracts.

Structure of the Script: 

1. Loading Data, preprocessing and Vectorizing the Texts

2. Instrument Clusters: Defining Words, Vectorizing, Defining Clusters, Saving

3. Goal Clusters: Defining Words, Vectorizing, Defining Clusters, Saving

4. Affected Group Clusters: Defining Words, Vectorizing, Defining Clusters, Saving

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#Load data from Excel
df = pd.read_excel('env_protection.xlsx')

Data Preprocessing

1. Abstracts einlesen
2. Tokenization
3. Stopwords
4. Lemmatization

In [2]:
abstracts_list = df['Abstract_en'].astype(str).tolist()

In [3]:
#import packages for Tokenization, Stopword list and Lemmatization, 
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Tokenization and Normalization
tokenized = [word_tokenize(doc.lower()) for doc in abstracts_list]
#Download Stopword list
nltk.download('stopwords')
nltk.download('wordnet')
#Remove stopwords
stop_words = set(stopwords.words('english'))
#exclude additional words
custom_stop_words = {'shall', 'act', "law", "provides", "provision", "i", "vi", ',', '.', ')', 'art', '(', "'", ":", "1"}  # Add your custom stopwords here
stop_words.update(custom_stop_words)
stop_words=list(stop_words)

filtered = [[token for token in tokens if token not in stop_words] for tokens in tokenized]
#Convert each sublist into a string, we need this for lemmatization
filtered = [' '.join(sublist) for sublist in filtered]
#Lemmanization
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(token) for token in filtered]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
#Import Package for Feature Extraction using the Tf IDF Method
from sklearn.feature_extraction.text import TfidfVectorizer
#Feature Extraction
vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.7, min_df=2)
X = vectorizer.fit_transform(words)

### Instruments

In [7]:
# List of specific words for initial centroids, with multiple words per cluster
instrument_seeds = [
    ['planning', "planners", "planner" ],  
    ["area", "protected", "areas"],
    ["regulating", "regulatory", "norms", "norm", "criteria", "council", "framework"],
    ["education", "educational", "information", "research", "monitoring", "data", "Impact Assessment"],
    ['tax', 'subsidies', 'financing', "finance", "fund", "subsidie"]  
]

# Find the indices of the initial words in the vocabulary and compute the mean of TF-IDF vectors for each group
initial_centroids = []
for group in instrument_seeds:
    indices = [vectorizer.vocabulary_.get(word, -1) for word in group]  # Get index or -1 if not found
    indices = [idx for idx in indices if idx != -1]  # Remove -1 (words not found in vocabulary)
    
    if not indices:
        raise ValueError(f"None of the words in the group {group} were found in the vocabulary.")
    
    # Compute mean of TF-IDF vectors for words in the group
    centroid_vector = np.mean(X[:, indices].toarray(), axis=0)  
    
    # Ensure centroid_vector has right shape by padding with zeros if necessary
    if len(centroid_vector) < X.shape[1]:
        padded_centroid = np.zeros(X.shape[1])
        padded_centroid[indices] = centroid_vector
        centroid_vector = padded_centroid
    
    initial_centroids.append(centroid_vector)

# Convert the list of centroids into a numpy array
initial_centroids = np.array(initial_centroids)

(5, 1479)


In [8]:
# Kmeans clustering with initial centroids
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=len(instrument_seeds), init=initial_centroids, n_init=1)
kmeans.fit(X)

Cluster-Labels: [1 0 3 0 4 4 4 3 0 4 3 1 3 1 1 4 3 4 2 4 1 2 0 1 1 0 3 3 2 0 1 2 0 0 3 1 3
 4 1 2 4 1 3 3 2 3 3 3 3 2 2 3 2 1 4 3 3 2 1 1 1 1 1 1 1 1 1 1 1 1 0 3 1 2
 4 2 1 2 4 4 3 1 3 3 3 3 3 3 1 3 3 3 4 0 3 2 2 2 3 1 2 0 0 3 2 2 2 2 2]


In [13]:
#Naming and saving the Instrument Classification
Cluster_Num = kmeans.labels_
df['Instrument'] = Cluster_Num
Cluster_Name = {0: "Planning", 
                1: "Protected Areas", 
                2: "Regulatory", 
                3: "Education_and_Research", 
                4: "Financial"}
df['Instrument_Name'] = df['Instrument'].map(Cluster_Name)
df.to_excel('env_protection.xlsx', index=False)

### Goal

In [15]:
# List of specific words for initial centroids, with multiple words per cluster
goal_seeds = [
    ['conservation', 'preservation', 'habitat', "restoration", "protected", "repair"],
    ['management', "rational use"],
    ["development", "agriculture", "productive" ], 
    ["education", "educational", "information", "research", "awareness"]
]

# Find the indices of the initial words in the vocabulary and compute the mean of TF-IDF vectors for each group
initial_centroids = []
for group in goal_seeds:
    indices = [vectorizer.vocabulary_.get(word, -1) for word in group]  # Get index or -1 if not found
    indices = [idx for idx in indices if idx != -1]  # Remove -1 (words not found in vocabulary)
    
    if not indices:
        raise ValueError(f"None of the words in the group {group} were found in the vocabulary.")
    
    # Compute mean of TF-IDF vectors for words in the group
    centroid_vector = np.mean(X[:, indices].toarray(), axis=0)  
    
    # Ensure centroid_vector has shape (1479,) by padding with zeros if necessary
    if len(centroid_vector) < X.shape[1]:
        padded_centroid = np.zeros(X.shape[1])
        padded_centroid[indices] = centroid_vector
        centroid_vector = padded_centroid
    
    initial_centroids.append(centroid_vector)

# Convert the list of centroids into a numpy array
initial_centroids = np.array(initial_centroids)

(4, 1479)


In [16]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=len(goal_seeds), init=initial_centroids, n_init=1)
kmeans.fit(X)

Cluster-Labels: [0 3 0 0 3 0 0 1 1 2 1 1 1 1 1 0 3 1 1 1 0 2 1 0 2 2 3 1 2 1 0 1 1 1 1 3 3
 1 0 2 1 3 0 0 1 1 1 0 1 2 2 3 1 1 0 3 1 1 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 1
 1 2 1 2 0 0 0 3 3 3 1 1 0 1 1 3 1 3 0 2 1 2 1 1 1 1 1 2 1 3 2 2 1 1 2]


In [18]:
#Naming and saving
Cluster_Num = kmeans.labels_
df['Goal'] = Cluster_Num
Cluster_Name = {0: "Conservation", 
                1: "Management", 
                2: "Development", 
                3: "Education_and_Research"}
df['Goal_Name'] = df['Goal'].map(Cluster_Name)
df.to_excel('env_protection.xlsx', index=False)

### Affected Groups

In [19]:
# List of specific words for initial centroids, with multiple words per cluster
affected_seeds = [
    ['private', 'individuals', 'citizens', "inhabitants", "community", "participation", "participatory", "individual"],
    ['indigenous'],
    ["authorities", "administration"],
    ["farmers", "agriculture", "livestock", "agricultural", "lease", "pasture", "pastures", "fishing" ],
    ["companies", "economy", "use","trade", "leasing", "lease", "juridical" ]
]

# Find the indices of the initial words in the vocabulary and compute the mean of TF-IDF vectors for each group
initial_centroids = []
for group in affected_seeds:
    indices = [vectorizer.vocabulary_.get(word, -1) for word in group]  # Get index or -1 if not found
    indices = [idx for idx in indices if idx != -1]  # Remove -1 (words not found in vocabulary)
    
    if not indices:
        raise ValueError(f"None of the words in the group {group} were found in the vocabulary.")
    
    # Compute mean of TF-IDF vectors for words in the group
    centroid_vector = np.mean(X[:, indices].toarray(), axis=0)  
    
    # Ensure centroid_vector has shape (1479,) by padding with zeros if necessary
    if len(centroid_vector) < X.shape[1]:
        padded_centroid = np.zeros(X.shape[1])
        padded_centroid[indices] = centroid_vector
        centroid_vector = padded_centroid
    
    initial_centroids.append(centroid_vector)

# Convert the list of centroids into a numpy array
initial_centroids = np.array(initial_centroids)

(5, 1479)


In [20]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=len(affected_seeds), init=initial_centroids, n_init=1)
kmeans.fit(X)

Cluster-Labels: [4 4 0 0 0 4 4 2 1 1 0 4 0 4 1 0 4 0 2 1 4 2 4 4 4 4 1 0 0 0 4 2 4 1 1 4 0
 4 1 1 1 1 0 1 2 0 3 0 4 2 2 0 2 1 4 0 0 0 4 4 4 4 4 4 4 4 4 4 4 4 1 4 2 2
 0 1 2 2 2 4 4 4 1 0 0 0 0 0 4 1 1 1 4 1 0 2 2 2 4 1 2 4 0 0 2 2 2 2 2]


In [22]:
#Naming and saving
Cluster_Num = kmeans.labels_
df['Affected_Group'] = Cluster_Num
Cluster_Name = {0: "Citizens", 
                1: "Indigenous", 
                2: "Authorities and administration", 
                3: "Farmers",
               4: "Companies"}
df['Affected_Group_Name'] = df['Affected_Group'].map(Cluster_Name)
df.to_excel('env_protection.xlsx', index=False)