# **Installing BERTopic**

We start by installing BERTopic from PyPI.

Installation can take a while.

In [1]:
%%capture
!pip install bertopic

### Restart the Notebook

(Mandatory)


Installing BERTopic updates some packages that were already loaded. To use the updated versions correctly, restart the notebook now.

From the Menu:

Runtime → Restart Runtime

In this notebook, we will further apply BERTopic on an individual, company-wise basis.

In [None]:
# Attach Google Drive to the Colab filesystem under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd

# Read the ward spending records into a DataFrame (UTF-8 encoded CSV).
df = pd.read_csv(
    "/content/BBMP_All_Ward_Spends.csv",
    encoding='utf-8',
)

In [3]:
# Print the (rows, columns) tuple, then display the column labels.
dims = df.shape
print(dims)
df.columns

(49520, 7)


Index(['Sl No', 'Name of Work', 'Job Number', 'Contractor', 'Gross', 'Nett',
       'Deduction'],
      dtype='object')

In [4]:
# Keep only the rows that have a 'Name of Work' value. The original row
# labels are preserved (the index is not reset).
df_work = df.loc[df['Name of Work'].notna()]

**BSNL**

In [5]:
# Plain Python list of work descriptions, in row order, used as the
# documents for topic modelling.
docs = df_work.loc[:, "Name of Work"].to_list()

In [6]:
from bertopic import BERTopic

In [7]:
# Fit a default BERTopic model on the work descriptions. verbose=True makes
# the model log each stage (embedding, dimensionality reduction, clustering).
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(documents=docs)

Batches:   0%|          | 0/1548 [00:00<?, ?it/s]

2023-04-12 20:24:35,607 - BERTopic - Transformed documents to Embeddings
2023-04-12 20:25:52,187 - BERTopic - Reduced dimensionality
2023-04-12 20:25:58,857 - BERTopic - Clustered reduced embeddings


In [8]:
# Per-topic summary table (Topic, Count, Name); display its first five rows.
freq = topic_model.get_topic_info()
freq.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,15550,-1_asphalting_improvements_drain_roads
1,0,849,0_concreting_basavanapura_ashalting_53
2,1,679,1_cement_concrete_laying_conservency
3,2,630,2_cc_52_drain_krpura
4,3,538,3_desilting_kammanahalli_tertiary_28


In [9]:
# Pair every document with the topic id assigned to it by fit_transform.
df_results = pd.DataFrame(dict(Document=docs, Topic=topics))

In [10]:
# Display the document/topic table (pandas truncates the rendering of a
# frame this large to its first and last rows).
df_results

Unnamed: 0,Document,Topic
0,IMPROVEMENTS TO DRAINS IN KUMARA SWAMY LAYOUT ...,-1
1,RESERVE FUND FOR EMERGENCY WORKS IN WARD NO 18...,98
2,Construction of Helpline centre at Anganavadi ...,33
3,"Desilting of drain at Illiyas nagar, Kanaka na...",3
4,RESERVE FUND FOR EMERGENCY WORKS IN WARD NO 18...,98
...,...,...
49515,Improvements to drains and Providing Covering ...,-1
49516,Providing 250 watts S V fittings and control s...,117
49517,Improvements to drain and footpath to 80ft roa...,9
49518,Improvements to drain with covering slab and d...,-1


In [11]:
# Number of distinct topic ids assigned to documents; the -1 label seen in
# the table above is counted like any other id.
df_results.loc[:, 'Topic'].nunique()

924

In [12]:
# Bar charts of term scores for six topics of the freshly fitted model.
top_topics_chart = topic_model.visualize_barchart(top_n_topics=6)
top_topics_chart

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# Make sure NLTK's stopword corpus is available locally.
nltk.download('stopwords')

# Recompute the topic representations with English stopwords removed and
# with 1- to 3-word n-grams as candidate terms. The document-to-topic
# assignments passed in (`topics`) are the ones produced by fit_transform.
stop_words = nltk.corpus.stopwords.words('english')
vectorizer_model = CountVectorizer(
    ngram_range=(1, 3),
    stop_words=stop_words,
)
topic_model.update_topics(docs, topics, vectorizer_model=vectorizer_model)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Merge the fitted topics down to 15. reduce_topics() updates the model in
# place; its return value is not the list of per-document topics, so read the
# reduced assignments from the model's `topics_` attribute instead.
topic_model.reduce_topics(docs, nr_topics=15)
new_topics = topic_model.topics_

2023-04-12 20:27:14,920 - BERTopic - Reduced number of topics from 924 to 15


In [15]:
# Per-document topic assignments currently stored on the model; read here,
# after the topic reduction, so they refer to the reduced topic set.
new_topics = topic_model.topics_

In [16]:
# Document/topic table for the reduced topic set, followed by the number of
# documents that fall into each topic.
df_up = pd.DataFrame(dict(Document=docs, Topic=new_topics))
df_up.loc[:, 'Topic'].value_counts()

 0     25812
-1     15550
 1      3467
 2      1510
 3      1473
 4       415
 5       306
 7       237
 6       237
 8       214
 9       120
 10       90
 11       58
 12       18
 13       13
Name: Topic, dtype: int64

In [17]:
# Peek at the first four document/topic pairs after reduction.
df_up.head(n=4)

Unnamed: 0,Document,Topic
0,IMPROVEMENTS TO DRAINS IN KUMARA SWAMY LAYOUT ...,-1
1,RESERVE FUND FOR EMERGENCY WORKS IN WARD NO 18...,5
2,Construction of Helpline centre at Anganavadi ...,0
3,"Desilting of drain at Illiyas nagar, Kanaka na...",3


In [18]:
# Peek at the first four source rows, for comparison with the table above.
df_work.head(n=4)

Unnamed: 0,Sl No,Name of Work,Job Number,Contractor,Gross,Nett,Deduction
0,10957,IMPROVEMENTS TO DRAINS IN KUMARA SWAMY LAYOUT ...,185-14-000009,B N Suresh9900197571,1014029.0,877159.0,136870.0
1,10955,RESERVE FUND FOR EMERGENCY WORKS IN WARD NO 18...,185-12-000028,Suresh B N 9900197571,511898.0,442880.0,69018.0
2,11019,Construction of Helpline centre at Anganavadi ...,185-14-000001,K P Raveen GPA Holder is Shravani Construction...,929250.0,796705.0,132545.0
3,13230,"Desilting of drain at Illiyas nagar, Kanaka na...",185-13-000019,M Manjunatha9845096084,895720.0,772865.0,122855.0


In [22]:
# Attach each row's topic by position. df_work keeps the CSV's row labels
# after rows without a 'Name of Work' are filtered out, whereas df_up was
# built from plain lists and is labelled 0..n-1. pd.concat(axis=1) aligns on
# those labels, so a single dropped row would pair topics with the wrong
# works and introduce NaN rows. assign() with a NumPy array is positional
# and raises if the lengths differ.
df_work.assign(Topic=df_up['Topic'].to_numpy())

Unnamed: 0,Sl No,Name of Work,Job Number,Contractor,Gross,Nett,Deduction,Topic
0,10957,IMPROVEMENTS TO DRAINS IN KUMARA SWAMY LAYOUT ...,185-14-000009,B N Suresh9900197571,1014029.0,877159.0,136870.0,-1
1,10955,RESERVE FUND FOR EMERGENCY WORKS IN WARD NO 18...,185-12-000028,Suresh B N 9900197571,511898.0,442880.0,69018.0,5
2,11019,Construction of Helpline centre at Anganavadi ...,185-14-000001,K P Raveen GPA Holder is Shravani Construction...,929250.0,796705.0,132545.0,0
3,13230,"Desilting of drain at Illiyas nagar, Kanaka na...",185-13-000019,M Manjunatha9845096084,895720.0,772865.0,122855.0,3
4,15340,RESERVE FUND FOR EMERGENCY WORKS IN WARD NO 18...,185-14-000014,B N Suresh9900197571,630138.0,552075.0,78063.0,5
...,...,...,...,...,...,...,...,...
49515,159,Improvements to drains and Providing Covering ...,183-13-000016,Harish Constructions,1494190.0,1268110.0,226080.0,-1
49516,160,Providing 250 watts S V fittings and control s...,183-13-000026,M/s.KRIDL,654427.0,556011.0,98416.0,2
49517,161,Improvements to drain and footpath to 80ft roa...,183-13-000022,TECHNICAL MANAGER KRIDL,625519.0,539170.0,86349.0,0
49518,162,Improvements to drain with covering slab and d...,183-13-000020,TECHNICAL MANAGER (3) KRIDL,9399305.0,7708835.0,1690470.0,-1


In [23]:
# Export the source rows together with their reduced topic id.
# Topics are attached by position rather than with pd.concat(axis=1):
# df_work keeps the CSV's row labels after null rows are filtered out, while
# df_up is labelled 0..n-1, so label-based alignment could pair topics with
# the wrong works. assign() with a NumPy array raises if the lengths differ.
df_work.assign(Topic=df_up['Topic'].to_numpy()).to_excel("BBMP_All_Wards_Work_Categorized.xlsx")

In [24]:
import json

# Mapping of topic id -> list of (term, score) pairs for the current topics.
res = topic_model.get_topics()
# Compact JSON string serialization of that mapping.
result = json.dumps(res)

In [25]:
# One line per topic: its id followed by its (term, score) pairs.
for topic_id, term_scores in res.items():
    print(topic_id, term_scores)

-1 [('ward', 0.03399715014985235), ('improvements', 0.024119780992333528), ('road', 0.024104107782033687), ('roads', 0.022476244424037953), ('cross', 0.020290004673625173), ('providing', 0.018951149158282395), ('drains', 0.01776539756616101), ('layout', 0.017294975526455363), ('main', 0.016836975550981254), ('drain', 0.016492039034356837)]
0 [('ward', 0.03534954709636164), ('road', 0.02046852486062275), ('improvements', 0.020449549027548558), ('roads', 0.019108921383153846), ('cross', 0.016432237928766608), ('drains', 0.015423966338483726), ('maintenance', 0.015014957058918133), ('providing', 0.014501248483786637), ('main', 0.014172483130752278), ('construction', 0.01315255889237663)]
1 [('water', 0.05557355377284024), ('drilling', 0.03855473140259867), ('borewell', 0.03845707447686412), ('borewells', 0.03553994136488137), ('ward', 0.033989811519502096), ('supply', 0.03392333603615068), ('providing', 0.03341191727974658), ('water supply', 0.031170246005610268), ('pipeline', 0.023925646

In [None]:
# Indented JSON rendering of the topic mapping, with keys in sorted order.
pretty_json = json.dumps(res, sort_keys=True, indent=4)
print(pretty_json)

In [None]:
import pprint

# Pretty-print the topic dictionary itself. `result` is the JSON *string*
# built from it; passing that to pprint would print a quoted string rather
# than a structured dictionary.
pprint.pprint(res)

In [21]:
# Bar charts of term scores for six topics of the model in its current
# state (this cell sits after the topic reduction step).
reduced_topics_chart = topic_model.visualize_barchart(top_n_topics=6)
reduced_topics_chart