In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load your data into a pandas DataFrame
data = pd.read_csv('resource1.csv')

# Combine 'NICName' and 'IndiaStates' columns into a single text column
data['Text'] = data['NICName'] + ' ' + data['IndiaStates']

# Convert text data into numerical vectors using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(data['Text'])

# Apply PCA to reduce dimensionality of the TF-IDF vectors
pca = PCA(n_components=1)  # You can adjust the number of components as needed
X_pca = pca.fit_transform(X_tfidf.toarray())

# Apply K-means clustering to group similar businesses based on PCA-transformed data
num_clusters = 5  # You can adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_pca)

# Get cluster assignments for each data point
data['Cluster'] = kmeans.labels_

# Evaluate the clustering performance using silhouette score
silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
print(f'Silhouette Score: {silhouette_avg}')

# Display the top terms (features) for each cluster
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()  # Use get_feature_names_out() instead of get_feature_names()
for i in range(num_clusters):
    print(f"Cluster {i+1} Top Terms:", [terms[ind] for ind in order_centroids[i, :5]])

# You can further analyze the clusters and interpret the results based on the business categories.

# Example: Print samples from each cluster
for i in range(num_clusters):
    print(f"\nCluster {i+1} Samples:")
    print(data[data['Cluster'] == i].head())




Silhouette Score: 0.7139639912138425
Cluster 1 Top Terms: ['accessories']
Cluster 2 Top Terms: ['accessories']
Cluster 3 Top Terms: ['accessories']
Cluster 4 Top Terms: ['accessories']
Cluster 5 Top Terms: ['accessories']

Cluster 1 Samples:
            IndiaStates                                            NICName  \
1  STATE - NCT OF DELHI                     Growing of non-perennial crops   
2  STATE - NCT OF DELHI  Growing of cereals (except rice), leguminous c...   
3  STATE - NCT OF DELHI                                    Growing of rice   
4  STATE - NCT OF DELHI  Growing of vegetables and melons, roots and tu...   
5  STATE - NCT OF DELHI                             Growing of fibre crops   

   MainWorkersTotalPersons  MainWorkersTotalMales  MainWorkersTotalFemales  \
1                      169                    151                       18   
2                       83                     81                        2   
3                        9                      9      

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load your data into a pandas DataFrame
data = pd.read_csv('resource1.csv')

# Combine 'NICName' and 'IndiaStates' columns into a single text column
data['Text'] = data['NICName'] + ' ' + data['IndiaStates']

# Convert text data into numerical vectors using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(data['Text'])

# Apply PCA to reduce dimensionality of the TF-IDF vectors
pca = PCA(n_components=1)  # You can adjust the number of components as needed
X_pca = pca.fit_transform(X_tfidf.toarray())

# Apply K-means clustering to group similar businesses based on PCA-transformed data
num_clusters = 5  # You can adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_pca)

# Get cluster assignments for each data point
data['Cluster'] = kmeans.labels_

# Evaluate the clustering performance using silhouette score
silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
print(f'Silhouette Score: {silhouette_avg}')

# Display the top terms (features) for each cluster
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()  # Use get_feature_names_out() instead of get_feature_names()
for i in range(num_clusters):
    print(f"Cluster {i+1} Top Terms:", [terms[ind] for ind in order_centroids[i, :5]])

# Example: Print samples from each cluster
for i in range(num_clusters):
    print(f"\nCluster {i+1} Samples:")
    print(data[data['Cluster'] == i][['MainWorkersTotalPersons', 'Cluster']].head())




Silhouette Score: 0.7138715406463305
Cluster 1 Top Terms: ['accessories']
Cluster 2 Top Terms: ['accessories']
Cluster 3 Top Terms: ['accessories']
Cluster 4 Top Terms: ['accessories']
Cluster 5 Top Terms: ['accessories']

Cluster 1 Samples:
     MainWorkersTotalPersons  Cluster
317                    18378        0
325                       74        0
326                       74        0
327                    13130        0
328                     1265        0

Cluster 2 Samples:
     MainWorkersTotalPersons  Cluster
355                     1102        1
356                      251        1
357                      851        1
358                    18936        1
359                    18738        1

Cluster 3 Samples:
    MainWorkersTotalPersons  Cluster
0                      1484        2
22                      443        2
23                      391        2
24                        6        2
25                       46        2

Cluster 4 Samples:
   MainWorkersTotalP

# New Section

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

# Load your data into a pandas DataFrame
data = pd.read_csv('resource1.csv')

# Combine 'NICName' and 'IndiaStates' columns into a single text column
data['Text'] = data['NICName'] + ' ' + data['IndiaStates']

# Convert text data into numerical vectors using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(data['Text'])

# Apply PCA to reduce dimensionality of the TF-IDF vectors
pca = PCA(n_components=1)  # You can adjust the number of components as needed
X_pca = pca.fit_transform(X_tfidf.toarray())

# Apply MiniBatchKMeans clustering to group similar businesses based on PCA-transformed data
num_clusters = 5  # You can adjust the number of clusters as needed
kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_pca)

# Get cluster assignments for each data point
data['Cluster'] = kmeans.labels_

# Evaluate the clustering performance using silhouette score
silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
print(f'Silhouette Score: {silhouette_avg}')

# Display the top terms (features) for each cluster
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()  # Use get_feature_names_out() instead of get_feature_names()
for i in range(num_clusters):
    print(f"Cluster {i+1} Top Terms:", [terms[ind] for ind in order_centroids[i, :5]])

# Example: Print samples from each cluster
for i in range(num_clusters):
    print(f"\nCluster {i+1} Test Samples:")
    test_samples = data.iloc[np.where(kmeans.predict(X_pca) == i)]
    print(test_samples.head())




Silhouette Score: 0.7104488689103636
Cluster 1 Top Terms: ['accessories']
Cluster 2 Top Terms: ['accessories']
Cluster 3 Top Terms: ['accessories']
Cluster 4 Top Terms: ['accessories']
Cluster 5 Top Terms: ['accessories']

Cluster 1 Test Samples:
             IndiaStates                                            NICName  \
0   STATE - NCT OF DELHI  Crop and animal production, hunting and relate...   
22  STATE - NCT OF DELHI  Support activities to agriculture and post-har...   
23  STATE - NCT OF DELHI             Support activities for crop production   
24  STATE - NCT OF DELHI           Support activities for animal production   
25  STATE - NCT OF DELHI                       Post-harvest crop activities   

    MainWorkersTotalPersons  MainWorkersTotalMales  MainWorkersTotalFemales  \
0                      1484                   1271                      213   
22                      443                    398                       45   
23                      391              