In [135]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import os 
import re

In [136]:
# Resolve the dataset location: the CSV lives one level above the notebook's working directory.
CWD = os.getcwd()
PROJ_DIR_PATH = os.path.dirname(CWD)
DATASET_PATH = os.path.join(PROJ_DIR_PATH, 'combined_dataset.csv')


In [137]:
# Load the combined dengue dataset from the path resolved in the configuration cell.
dengue_df = pd.read_csv(DATASET_PATH)

# Preview the first rows to confirm the file parsed as expected.
dengue_df.head()

Unnamed: 0.1,Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num
0,0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,170320,3
1,1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,170320,3
2,0,2,bunga rampai place,1.338931,103.883537,1,1,32,150925,9
3,1,9,joo seng road (block 8),1.335219,103.878805,1,1,32,150925,9
4,2,3,mount vernon road,1.340377,103.87949,1,1,32,150925,9


## DATA CLEANING

In [138]:
# Drop the leftover 'Unnamed: 0' index column that came in with the CSV.
# A non-inplace drop with errors='ignore' keeps this cell re-runnable: running it a
# second time no longer raises KeyError once the column is already gone.
dengue_df = dengue_df.drop(columns=['Unnamed: 0'], errors='ignore')
dengue_df.head()

Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,170320,3
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,170320,3
2,2,bunga rampai place,1.338931,103.883537,1,1,32,150925,9
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,150925,9
4,3,mount vernon road,1.340377,103.87949,1,1,32,150925,9


In [139]:
# Inspect column dtypes and non-null counts of the loaded frame.
dengue_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56976 entries, 0 to 56975
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   num_cases                56976 non-null  int64  
 1   street_address           56976 non-null  object 
 2   latitude                 56976 non-null  float64
 3   longitude                56976 non-null  float64
 4   cluster_num              56976 non-null  int64  
 5   recent_cases_in_cluster  56976 non-null  int64  
 6   total_cases_in_cluster   56976 non-null  int64  
 7   date                     56976 non-null  int64  
 8   month_num                56976 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 3.9+ MB


In [140]:
# Re-format the yymmdd dates as 'dd/mm/YYYY' strings.
# Note the column ends up as text, not datetime64: the values are parsed and then
# immediately formatted back with strftime.
# The dates are stored as integers, so a value such as 010120 would arrive as 10120;
# zfill(6) restores the lost leading zero before parsing.
dengue_df['date'] = pd.to_datetime(
    dengue_df['date'].astype(str).str.zfill(6),
    format='%y%m%d',
).dt.strftime('%d/%m/%Y')

dengue_df

Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,20/03/2017,3
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,20/03/2017,3
2,2,bunga rampai place,1.338931,103.883537,1,1,32,25/09/2015,9
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,25/09/2015,9
4,3,mount vernon road,1.340377,103.879490,1,1,32,25/09/2015,9
...,...,...,...,...,...,...,...,...,...
56971,1,chai chee street (block 43),1.328380,103.925500,3,2,3,15/05/2017,5
56972,2,chai chee street (block 45),1.328806,103.924377,3,2,3,15/05/2017,5
56973,2,kang ching road (block 339d),1.338952,103.722240,4,2,2,15/05/2017,5
56974,1,lorong 4 toa payoh (block 60),1.336470,103.850664,5,2,2,15/05/2017,5


In [141]:
# Normalise the address text (lower-case, trimmed) and pin the numeric column dtypes.
dengue_df = dengue_df.assign(
    street_address=dengue_df['street_address'].str.lower().str.strip()
).astype({'num_cases': int, 'latitude': float, 'longitude': float})

dengue_df

Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,20/03/2017,3
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,20/03/2017,3
2,2,bunga rampai place,1.338931,103.883537,1,1,32,25/09/2015,9
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,25/09/2015,9
4,3,mount vernon road,1.340377,103.879490,1,1,32,25/09/2015,9
...,...,...,...,...,...,...,...,...,...
56971,1,chai chee street (block 43),1.328380,103.925500,3,2,3,15/05/2017,5
56972,2,chai chee street (block 45),1.328806,103.924377,3,2,3,15/05/2017,5
56973,2,kang ching road (block 339d),1.338952,103.722240,4,2,2,15/05/2017,5
56974,1,lorong 4 toa payoh (block 60),1.336470,103.850664,5,2,2,15/05/2017,5


In [142]:
# Count missing values per column (a column with no gaps reports 0).
missing_dengue = dengue_df.isnull().sum()
missing_dengue


num_cases                  0
street_address             0
latitude                   0
longitude                  0
cluster_num                0
recent_cases_in_cluster    0
total_cases_in_cluster     0
date                       0
month_num                  0
dtype: int64

### Creating a new Column for the Year

In [143]:
# The date column holds 'dd/mm/YYYY' strings, so the year is the last '/'-separated field.
# Vectorised with the .str accessor instead of a Python loop over itertuples();
# the values stay strings (e.g. '2017'), exactly as the loop produced them.
years = dengue_df['date'].str.split('/').str[-1].tolist()

dengue_df['year'] = years
dengue_df.head()

Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num,year
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,20/03/2017,3,2017
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,20/03/2017,3,2017
2,2,bunga rampai place,1.338931,103.883537,1,1,32,25/09/2015,9,2015
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,25/09/2015,9,2015
4,3,mount vernon road,1.340377,103.87949,1,1,32,25/09/2015,9,2015


### Separating into Regions

In [144]:
# #splitting into regions
# region_mapping = {
#     'East': ['bedok', 'changi', 'pasir ris', 'tampines', 'simei', 'kew', 'geylang east', 'bunga rampai', 'tanjong rhu', 'chai chee', 'siglap', 'eunos', 'tanah merah', 'marine drive', 'paya lebar', 'jalan tenaga', 'changi bay', 'telok kurau', 'joo chiat', 'limau', 'marine terrace', 'meyer', 'duku'],

#     'North-East': ['ang mo kio', 'hougang', 'punggol', 'sengkang', 'serangoon', 'vernon', 'fernvale', 'compassvale', 'seletar', 'sembilan', 'edgefield', 'rivervale', 'anchorvale', 'yio chu kang', 'buangkok'],

#     'Central': ['marina bay', 'raffles', 'outram', 'sentosa', 'orchard', 'river valley', 'newton', 'novena', 'bukit timah', 'rochor', 'tanglin', 'toa payoh', 'aljunied', 'joo seng', 'clover', 'berjaya', 'binchang', 'pemimpin', 'commonwealth', 'upper thomson', 'upper thompson', 'bishan', 'geylang', 'soo chow', 'holland', 'telok blangah', 'whampoa', 'kallang', 'marine parade', 'bendemeer', 'bahagia', 'ubi', 'boon keng', 'veerasamy', 'ghim moh', 'scotts', 'bukit merah', 'tiong bahru', 'redhill', 'farrer', 'claymore', 'waterloo', 'bidadari', 'circuit', 'race course', 'rangoon', 'potong pasir', 'dakota', 'balam', 'pipit', 'stirling', 'genting'],

#     'North': ['woodlands', 'sembawang', 'yishun', 'admiralty', 'belibas', 'canberra', 'marsiling', 'simpang', 'mandai', 'sungei kadut', 'lim chu kang', 'kranji', 'montreal'],

#     'West': ['boon lay', 'bukit batok', 'choa chu kang', 'clementi', 'jurong east', 'jurong west', 'pioneer', 'tuas', 'hillview', 'pending', 'bukit panjang', 'tengah', 'pioneer', 'senja', 'teck whye', 'fajar', 'segar', 'yew tee', 'keat hong', 'brickland', 'petir', 'gangsa', 'kang ching', 'bangkit', 'pandan']
# }


# def assign_region(street_address):
#     for region, locations in region_mapping.items():
#         if any(location in street_address.lower() for location in locations):
#             return region
#     return 'Other'

# # Apply the function to create a new 'region' column
# dengue_df['region'] = dengue_df['street_address'].apply(assign_region)

# dengue_df.head()

In [145]:
#getting only the 'others' region

# others_df = dengue_df[dengue_df['region'] == 'Other']
# unique_add = others_df['street_address'].unique()   

# others_df

In [146]:
# with open(os.path.join(PROJ_DIR_PATH, 'others_ulu_names.txt'), 'w') as f:
#           for add in unique_add:
#                   f.write(f"{add}\n")

# f.close()

#### Nearest MRT Stations

bunga rampai - kaki bukit
joo seng - tai seng
vernon - kovan
clover - marymount
geylang east - aljunied
jalan berjaya - braddell
jalan binchang - bishan
jalan pemimpin - marymount
kew - marymount
jalan belibas - upper thomson
tanjong rhu - tanjong rhu

In [147]:
### PART 2
# Region lookup table used by assign_region(): region name -> lowercase keywords that are
# matched as plain substrings of the street address.
# NOTE(review): key order matters. assign_region() returns the first region with any hit, so
# a keyword repeated in a later region (e.g. 'carmen', 'sims', 'haig') or shadowed by a shorter
# keyword in an earlier region ('geylang' in Central vs 'geylang east' in East) never takes effect.
# NOTE(review): very generic keywords such as 'crescent', 'may' or 'fort' sit in Central and
# match any address containing them; confirm this is intended.
region_mapping = {
    'Central': [
        'marina bay', 'raffles', 'outram', 'sentosa', 'orchard', 'river valley', 'newton', 'novena', 'bukit timah',
        'rochor', 'tanglin', 'toa payoh', 'aljunied', 'joo seng', 'clover', 'berjaya', 'binchang', 'pemimpin',
        'commonwealth', 'upper thomson', 'bishan', 'geylang', 'soo chow', 'holland', 'telok blangah', 'whampoa',
        'kallang', 'marine parade', 'bendemeer', 'bahagia', 'ubi', 'boon keng', 'veerasamy', 'ghim moh', 'scotts',
        'bukit merah', 'tiong bahru', 'redhill', 'farrer', 'claymore', 'waterloo', 'bidadari', 'circuit', 'race course', 'faber',
        'rangoon', 'potong pasir', 'dakota', 'balam', 'pipit', 'stirling', 'genting', 'lorong stangee', 'fidelio',
        'jalan tua kong', 'pulasan', 'jalan khairuddin', 'lorong selangat', 'lorong abu talib', 'desker', 'rowell', 'holt', 'keppel', 'marymount',
        'carmen', 'tyrwhitt', 'beatty', 'cuff', 'chander', 'jalan usaha', 'jalan kurnia', 'conway', 'jalan chermai',
        'jalan kenarah', 'jalan lekub', 'jalan rengas', 'fulton', 'loyang', 'thrift', 'amber', 'jalan besar', 'carlisle', 'boon tiong', 'ho swee', 'tahar',
        'jalan rumah tinggi', 'kaki bukit avenue 3', 'sims avenue 3', 'thomson', 'lorong puntong', 'wan tho','dover', 'rose', 'bouna vista',
        'klang', 'upper weld', 'petain', 'sturdee', 'carisbrooke', 'chiselhurst', 'guillemard', 'sims', 'westerhout', 'le salle', 'maria', 'jalan chempedak', 'jalan chengam', 'jalan lanjut', 'jalan menarong', 'jalan rukam', 'saint nicholos view', 'jalen kechot', 'teo kim', 'kim keat', 'dyson', 'maju', 'syed alwi', 'braddell', 'jalan grisek', 'jalan paras','sam leong', 'cedarwood', 'la salle', 'saint nicholas view', 'jalan kechot', 'macpherson', 'siang kuang', 'alexandra','shunfu', 'jalan ampas', 'beach', 'jalan ma\'mor', 'jalan rajah', 'fraser', 'mattar', 'pheng geck', 'puay hee', 'tai thong crescent', 'shaw', 'havelock', 'jellicoe', 'king george\'s', 'cardiff', 'coniston', 'golden', 'chiap guan', 'dix', 'ee teow leng', 'flower', 'hendry', 'highland', 'hillside', 'jansen', 'jalan sahabat', 'kovan', 'lange', 'palm grove', 'lucky view', 'sennett', 'riviera', 'how sun', 'jalan kesoma', 'jalan rindu', 'lorong how sun', 'lorong ong lye', 'jalan masjid', 'jalan wakaff', 'sims view', 'jalan khamis', 'jalan rabu', 'elite terrace', 'mcnair', 'towner', 'mei hwan', 'jalan berseh', 'kelantan', 'jalan cherpen', 'jalan malu-malu', 'jalan hikayat', 'jalan kemuning', 'jalan shaer', 'langsat', 'rambutan', 'chuan', 'cotswold', 'chiltern', 'ceylon', 'fowlie', 'marshall', 'mugliston', 'seraya', 'gilstead', 'kim keat', 'jalan kelabu asap', 'jalan rumia', 'corporation rise', 'jalan greja', 'jalan serengam', 'jalan telipok', 'jalan taman', 'saint francis', 'sunbird', 'dafne', 'dido', 'jalan hock chye', 'jalan batai', 'sennett', 'sirat', 'braemar', 'corfe', 'goodman', 'delta', 'indus', 'dunman', 'eden grove', 'sunshine terrace', 'jalan tari serimpi', 'saujana', 'brookvale', 'bukit purmei', 'gambas crescent', 'haig', 'hazel park terrace', 'jalan chulek', 'kensington park', 'may', 'almond crescent', 'irrawaddy', 'lim tua tow', 'arthur', 'bournemouth', 'broadrick', 'clacton', 'crescent', 'fort', 'jalan seaview', 'jalan sedap', 'margate', 'mayfield', 'ramsgate', 'ringwood', 'kitchener link',
'tong watt', 'unity', 'lorong bachok', 'sims way', 'dunlop', 'hindoo', 'kelantan', 'lembu', 'norris', 'adis', 'handy', 'mount emily', 'mount sophia', 'niven', 'sophia', 'wilkie', 'ellington square', 'kim tian', "saint george's", 'cambridge', 'cairnhill', 'north bridge', 'jalan membina', 'lengkok', 'saint thomas', 'balmoral', 'henderson', 'cantonment', 'quay', 'dusun', 'killiney', 'shanghai', 'mutiara', 'tenteram', 'sarkies', 'mamor', 'spottiswoode', 'devonshire', 'kim yam', 'buffalo', 'hertford', 'grange', 'jervois', 'oxley', 'derbyshire', 'woodleigh', 'elizabeth', 'nathan', 'mount sinai', 'rama', 'merpati', 'merryn', 'raja udang', 'shan', 'leedon', 'crawford', 'moonstone', 'enggor', 'shelford', 'spooner', 'kuala', 'martin', 'kinta', 'keng lee', 'clarence', 'one tree hill', 'hindhede', 'chin swee', 'mat jambol', 'toh yi', 'buona vista', 'pisang', 'selanting', 'essex', 'richards', 'yung ho', 'ardmore'
    ],

    'East': [
        'bedok', 'changi', 'pasir ris', 'tampines', 'simei', 'kew', 'geylang east', 'bunga rampai', 'tanjong rhu',
        'chai chee', 'siglap', 'eunos', 'tanah merah', 'marine', 'paya lebar', 'jalan tenaga', 'changi bay',
        'telok kurau', 'joo chiat', 'limau', 'marine terrace', 'meyer', 'duku', 'elias', 'tembeling', 'carmen',
        'tua kong', 'pulasan', 'jalan khairuddin', 'lorong abu talib', 'woo mon chew', 'jalan chermai', 'jalan kenarah',
        'jalan lekub', 'jalan rengas', 'kaki bukit industrial terrace', 'fulton', 'loyang rise', 'thrift', 'summer',
        'jalan kayu', 'amber', 'belgravia', 'dairy farm', 'jalan besar', 'jalan rumah tinggi', 'kaki bukit avenue 3',
        'sims avenue 3', 'upper east coast', 'elliot', 'east coast', 'marine crescent', 'sims avenue east', 'toh guan east',
        'eastwood', 'tay lian teck', 'foo kim lin', 'lengkong tujoh', 'happy east', 'still', 'jalan baiduri', 'jalan kemboja', 'kee choe', 'lorong bunga', 'jalan lateh', 'mountbatten', 'kurau', 'jalan ishak', 'jalan ismail', 'lorong marican', 'lorong salleh', 'hecienda grove', 'jalan pelatina', 'taman permata', 'jalan molek', 'jalan suka', 'lorong bandang', 'jalan yasin', 'lorong marzuki', 'lorong sarina', 'jalan punai', 'jalan chempaka puteh', 'frankel', 'hartley grove', 'roseburn', 'sims', 'defu', 'lakme terrace', 'tosca', 'guan soon', 'sea breeze', 'avon', 'branksome', 'kaki bukit', 'senang', 'jalan damai','hacienda grove', 'nallur', 'carpmael', 'onan', 'berwick', 'medway', 'jalan nuri', 'walton', 'riverina', 'blandford', 'burghley', 'chartwell', 'jalan terubok', 'bodmin', 'bloxhome', 'east coast', 'fernwood terrace', 'haig', 'ipoh', 'jalan hussein', 'jalan terang bulan', 'metropole', 'norma', 'colchester', 'jalan jamal', 'lucky heights', 'jalan sempadan', 'tay lian teck', 'jalan bangau', 'jalan jarak', 'jalan kathi', 'saint patrick\'s', 'jalan melor', 'east aida', 'tanjong katong', 'bowmont', 'burnfoot', 'carlton', 'cheviot hill', 'dryburgh', 'ettrick', 'greenfield', 'jedburgh', 'wilton', 'yarrow', 'lorong melayu', 'lorong mydin', 'bayshore', 'wilkinson', 'jalan hajijah', 'jalan hitam manis', 'jalan merah saga', 'joo', 'jalan klinik', 'kent', 'mergui', 'owen', 'oxford', 'perumal', 'sing', 'starlight', 'tessensohn', 'aida', 'elite park', 'sims view', 'jalan khamis', 'jalan rabu', 'elite terrace', 'mcnair', 'towner', 'mei hwan', 'jalan berseh', 'kelantan', 'jalan cherpen', 'jalan malu-malu', 'jalan hikayat', 'jalan kemuning', 'jalan shaer', 'langsat', 'rambutan', 'chuan', 'cotswold', 'chiltern', 'ceylon', 'fowlie', 'marshall', 'mugliston', 'seraya', 'gilstead', 'kim keat', 'jalan kelabu asap', 'jalan rumia', 'corporation rise', 'jalan greja', 'jalan serengam', 'jalan telipok', 'jalan taman', 'saint francis', 'sunbird', 'dafne', 'dido', 'jalan hock chye',
        # 'jalan batai' was split across two physical lines (an unterminated string literal); re-joined here.
'jalan batai', 'sennett', 'sirat', 'braemar', 'corfe', 'goodman', 'delta', 'indus', 'dunman', 'eden grove', 'sunshine terrace', 'jalan tari serimpi', 'saujana', 'brookvale', 'bukit purmei', 'gambas crescent', 'haig lane', 'hazel park terrace', 'jalan chulek', 'kensington park', 'may', 'almond crescent', 'irrawaddy', 'lim tua tow', 'arthur', 'bournemouth', 'broadrick', 'clacton', 'crescent', 'fort', 'jalan seaview', 'jalan sedap', 'margate', 'mayfield', 'ramsgate', 'ringwood', 'lengkong', 'flora', 'daud', 'martia', 'chempaka', 'kembangan', 'dedap', 'happy', 'yun ping', 'dunbar walk'
    ],


    'North-East': [
        'ang mo kio', 'hougang', 'punggol', 'sengkang', 'serangoon', 'vernon', 'fernvale', 'compassvale', 'seletar', 'selatar', 'meragi',
        'sembilan', 'edgefield', 'rivervale', 'anchorvale', 'yio chu kang', 'buangkok', 'sin ming', 'bright hill', 'kio',
        'jalan khairuddin', 'sin ming', 'jalan lembah thomson', 'jalan besar', 'jalan rumah tinggi', 'aroozoo', 'seranggon', 'lorong ah soo', 'surin', 'edgedale plains', 'kampong sireh', 'lorong 6 realty park', 'lorong lew lian', 'lorong how sun', 'jalan selaseh', 'jalan sindor', 'kelulut hill', 'mio', 'jalan pacheli', 'li hwan', 'tai hwan', 'jalan krian', 'corporation', 'golden', 'brighton crescent', 'chepstow close', 'chuan garden', 'lichfield', 'ripley crescent', 'walmer', 'leicester', 'meyappa chettiar', 'woodsville', 'bartley', 'gambir', 'jalan labu ayer', 'jalan labu manis', 'jalan labu merah', 'lorong gambir', 'lorong penchalak', 'jalan gelenggang', 'ang mio kio', 'jalan leban', 'jalan kuras', 'jalan tarum', 'serenade', 'jalan dua', 'old airport', 'clifton vale', 'croucher', 'jalan girang', 'jalan ria', 'jalan riang', 'lorong biawak', 'matlock rise', 'melrose', 'sommerville', 'sunridge park', 'vaughan', 'wolskel', 'still', 'cassia crescent', 'jalan tiga', 'pine close', 'mayflower', 'shangri-la', 'daisy', 'mackerrow', 'chiku', 'everitt', 'koon seng', 'mangis', 'rambai', 'teng tong', 'sumang', 'rajawali', 'mimosa', 'sunrise', 'saraca', 'stratton', 'realty', 'begonia', 'nim', 'angklong', 'pelikat', 'lilac', 'terrase', 'casuarina', 'terrasse'
    ],

    'North': [
        'woodlands', 'sembawang', 'yishun', 'admiralty', 'belibas', 'canberra', 'marsiling', 'simpang', 'mandai',
        'sungei kadut', 'lim chu kang', 'kranji', 'montreal', 'jalan kurnia', 'jalan usaha', 'conway', 'miltonia close', 'lompang', 'jalan kelabu asap', 'jalan rumia', 'corporation rise', 'jalan greja', 'jalan serengam', 'jalan telipok', 'jalan taman', 'saint francis', 'sunbird circle', 'sunbird', 'butterworth lane', 'haig', 'ipoh lane', 'dafne street', 'dido street', 'jalan hock chye', 'jalan batai', 'sennett', 'sirat', 'braemar', 'corfe', 'goodman', 'delta', 'indus', 'dunman', 'east coast terrace', 'eden grove', 'sunshine terrace', 'jalan tari serimpi', 'mei ling', 'saujana', 'tai hwan close', 'ava', 'brookvale', 'bukit purmei', 'gambas crescent', 'haig lane', 'hazel park terrace', 'cashew', 'jalan chulek', 'kensington park', 'may', 'mcnair', 'almond crescent', 'irrawaddy', 'lim tua tow', 'arthur', 'bournemouth', 'broadrick', 'clacton', 'crescent', 'fort', 'jalan seaview', 'jalan sedap', 'margate', 'mayfield', 'ramsgate', 'ringwood', 'florissa', 'lentor', 'eaton', 'wellington', 'woodgrove', 'senoko', 'almond'
    ],

    'West': [
        'boon lay', 'bukit batok', 'choa chu kang', 'clementi', 'jurong east', 'jurong west', 'pioneer', 'tuas',
        'hillview', 'pending', 'bukit panjang', 'tengah', 'senja', 'teck whye', 'fajar', 'segar', 'yew tee', 'keat hong',
        'brickland', 'petir', 'gangsa', 'kang ching', 'bangkit', 'pandan', 'jalan rumah tinggi', 'jalan jurong kechil',
        'west coast place', 'west coast terrace', 'fulton', 'belgravia', 'dairy farm', 'kaki bukit avenue 3', 'toh guan',
        'jurong lake link', 'yunnan', 'tah ching', 'yung an', 'toh tuck', 'toh yi drive', 'jalan kelichap', 'jalan lokam', 'tai keng', 'pillai', 'mei ling', 'bridport', 'cowdray', 'farleigh', 'hemsley', 'huddington', 'kingswear', 'portchester', 'dorset', 'gloucester', 'oxford', 'da silva', 'florence', 'kang choo bin', 'lim ah pin', 'poh huat', 'simon', 'khiang guan', 'lincoln', 'suffolk', 'surrey', 'flanders', 'kitchener', 'marne', 'somme', 'alnwick', 'raglan', 'chuan', 'li hwan', 'hemmant', 'lim ah woo', 'foch', 'lavender', 'havelock', 'jalan kukoh', 'jalan minyak', 'york hill', 'guillemard', 'pine close', 'coldstream', 'jalan keris', 'jalan selendang delima', 'keris', 'chuan hoe', 'limbok', 'parry', 'jalan salang', 'kwong', 'jalan batu', 'kampong arang', 'kampong kayu', 'jalan rajah', 'fishery port', 'charlton', 'kampung sireh', 'surin', 'jalan berseh', 'kelantan', 'jalan sendudok', 'margaret', 'eng kong', 'lorong kismis', 'upper toh tuck', 'tavistock', 'ah hood', 'balestier', 'boon teck', 'jalan kemaman', 'park villas rise', 'poh huat west', 'sims close', 'marine vista', 'jalan bangsawan', 'crane', 'haigsville drive', 'pennefather', 'balmoral park', 'ewe boon', 'keng chin', 'robin drive', 'robin', 'stevens drive', 'mar thoma', 'saint michael\'s', 'saint helier\'s', 'chancery hill', 'chancery hill walk', 'chancery lane', 'tan sim boh', 'cheng soon garden', 'kismis green', 'kismis', 'jalan rengkam', 'jalan gembira', 'pasir panjang hill', 'pasir panjang', 'cashew', 'cavenagh', 'hythe', 'worthing', 'silat avenue', 'akyab', 'bassein', 'mandalay', 'hong lee place', 'leith park', 'leith', 'rosyth', 'recreation', 'dawson', 'strathmore avenue', 'queen', 'bukit tunggal', 'buckley', 'evelyn', 'gentle', 'jalan wakaff', 'court', 'french', 'king george\'s', 'maude', 'penhas', 'selegie', 'indus', 'jalan chegar', 'jalan hari raya', 'jalan pintau', 'saint george\'s lane', 'tagore', 'begonia terrace', 'tamarind', 'lompang', 'ho ching', 'yung kuang', 'yung sheng', 'jalan selamat',
'lorong kembangan', 'leonie hill', 'queensway', 'delta avenue', 'saint patrick\'s', 'gopeng', 'tanjong pagar plaza', 'henderson crescent', 'tah ching', 'jelapang', 'teban', 'west coast', 'jelebu', 'yuan ching', 'hume', 'westwood', 'chestnut', 'belimbing', 'kian teck', 'penjuru', 'lakeside', 'phoenix', 'pavillion', 'hong san', 'pavilion'
    ]

}


def assign_region(street_address, mapping=None):
    """Return the first region whose keyword list has a substring hit in the address.

    Parameters
    ----------
    street_address : str
        Address to classify. It is lower-cased before matching.
    mapping : dict[str, list[str]] | None
        Region name -> keywords. Defaults to the module-level ``region_mapping``.

    Returns
    -------
    str
        The matching region name, or ``'Other'`` when nothing matches or when
        ``street_address`` is not a string (e.g. NaN).

    Notes
    -----
    Matching is plain substring containment, and regions are tried in the mapping's
    insertion order, so the first region with any hit wins.
    """
    if mapping is None:
        mapping = region_mapping
    if not isinstance(street_address, str):
        return 'Other'

    # Lower-case once instead of once per keyword comparison.
    address = street_address.lower()
    for region, locations in mapping.items():
        if any(location in address for location in locations):
            return region
    return 'Other'

# Tag every row with its region by running assign_region over the address column.
dengue_df['region'] = dengue_df['street_address'].map(assign_region)

dengue_df.head()

Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num,year,region
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,20/03/2017,3,2017,East
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,20/03/2017,3,2017,East
2,2,bunga rampai place,1.338931,103.883537,1,1,32,25/09/2015,9,2015,East
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,25/09/2015,9,2015,Central
4,3,mount vernon road,1.340377,103.87949,1,1,32,25/09/2015,9,2015,North-East


In [148]:
# Isolate the rows that matched no region keyword and collect their distinct addresses.
others_df = dengue_df.loc[dengue_df['region'].eq('Other')]
unique_add = others_df['street_address'].unique()

# Keep a copy on disk for manual inspection, then show the subset.
others_df.to_csv('others_df.csv')
others_df

Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num,year,region
2563,2,arnasalam chetty road (the inspira),1.291872,103.838822,4,35,243,07/08/2020,8,2020,Other
2565,1,dublin road,1.298394,103.839734,4,35,243,07/08/2020,8,2020,Other
2568,3,jalan rumbia (the imperial),1.295319,103.842752,4,35,243,07/08/2020,8,2020,Other
2577,6,lloyd road (the botanic on lloyd),1.297039,103.839642,4,35,243,07/08/2020,8,2020,Other
2579,13,mohamed sultan road (the pier @ robertson),1.291818,103.840650,4,35,243,07/08/2020,8,2020,Other
...,...,...,...,...,...,...,...,...,...,...,...
56868,2,jalan layang layang,1.344837,103.766629,23,3,5,01/08/2016,8,2016,Other
56887,1,balmeg hill (the peak@balmeg),1.283135,103.783350,30,1,3,01/08/2016,8,2016,Other
56890,1,ah soo garden,1.350317,103.883956,31,1,2,01/08/2016,8,2016,Other
56905,1,jalan redop,1.385410,103.870633,38,2,2,01/08/2016,8,2016,Other


In [149]:
# Write one unmatched address per line to the project directory.
# The `with` block closes the file on exit, so no explicit f.close() is needed afterwards.
with open(os.path.join(PROJ_DIR_PATH, 'others_ulu_names.txt'), 'w') as f:
    for add in unique_add:
        f.write(f"{add}\n")

In [150]:
#remove common words such as 'road' etc, remove (), and anything within it
def clean_elements(elements):
    """Strip street-type words, bracketed text and digits from each address.

    Parameters
    ----------
    elements : iterable of str
        Raw address strings.

    Returns
    -------
    list of str
        One cleaned string per input element, in the same order. Whitespace left
        behind by the removed tokens is collapsed to single spaces and trimmed, so
        'lorong 4 toa payoh (block 60)' becomes 'lorong toa payoh'.
    """
    # Whole words to drop; bracketed text and runs of digits are removed by the same pattern.
    remove_words = ['block', 'road', 'avenue', 'street', 'drive', 'place', 'walk', 'lane']
    pattern = re.compile(r'\b(?:' + '|'.join(remove_words) + r')\b|\(.*?\)|\d+')

    # split()/join collapses the gaps left where tokens were removed, not just the ends.
    cleaned_elements = [' '.join(pattern.sub('', element).split()) for element in elements]

    return cleaned_elements

# Strip street-type words, bracketed text and digits from the unmatched addresses.
# NOTE(review): this rebinds `unique_add` from raw to cleaned names, so any cell that
# uses `unique_add` sees different data depending on whether this cell has already run.
unique_add = clean_elements(unique_add)

# Count how often each cleaned name occurs among the unique unmatched addresses.
freq_add = Counter(unique_add)


In [151]:
# Save the region-tagged dataframe to the current working directory.
# NOTE(review): to_csv() writes the index by default, so re-reading this file yields an
# extra 'Unnamed: 0' column like the one dropped during cleaning. Consider index=False
# once downstream readers are confirmed not to rely on it.
dengue_df.to_csv('dengue_df.csv')

### Water Bodies

In [152]:
# import pandas as pd

# # Load the CSV file
# # file_path = "/mnt/data/dengue_df.csv"
# # dengue_df = pd.read_csv(file_path)

# # Define a dictionary with water bodies by region
# water_bodies = {
#     'West': ['Bukit Batok Canal', 'Chestnut Drive Outlet Drain', 'Jurong Lake', 'Pandan Reservoir', 'Pang Sua Canal', 'Pang Sua Pond', 'Sungei Pandan', 'Sungei Pandan Kechil', 'Sungei Ulu Pandan', 'Sungei Ulu Pandan (Clementi St 14 to Sungei Pandan)'],
#     'North': ['Kranji Reservoir', 'Lower Seletar Reservoir - Family Bay', 'Lower Seletar Reservoir - Rower\'s Bay', 'Sungei Simpang Kanan', 'Yishun Pond'],
#     'Central': ['Alexandra Canal', 'Alexandra Canal Subsidiary Drain \'A\'', 'Alexandra Sub Drain F', 'Geylang River', 'Geylang River (Mountbatten Rd to Singapore Indoor Stadium)', 'Holland Plain', 'Kallang River (Bishan - Ang Mo Kio Park)', 'Kallang River (Bishan - Braddell)', 'Kallang River (Potong Pasir)', 'Kallang River (Upp Boon Keng - Sims Ave)', 'Kolam Ayer ABC Waterfront', 'MacRitchie Reservoir', 'Rochor Canal', 'Sungei Kallang - River Vista', 'Sungei Whampoa (Kim Keat Rd - CTE)', 'Sungei Whampoa (St George\'s Ln to Serangoon Rd)', 'Sungei Whampoa - St George\'s Lane'],
#     'North-East': ['Hougang Ave 10', 'Sengkang Floating Wetland', 'Serangoon Reservoir', 'Sungei Pinang'],
#     'East': ['Bedok Reservoir', 'Lorong Halus Wetland', 'Siglap Canal', 'Siglap Canal (ECP to the Sea)', 'Sungei Api Api', 'Sungei Tampines', 'Sungei Tampines (Tampines Ave 7 to TPE)']
# }

# # Function to check if an address is near any water body in its respective region
# def is_near_water_body(address, region):
#     if region in water_bodies:
#         for water_body in water_bodies[region]:
#             if water_body.lower() in address.lower():
#                 return 'Yes'
#     return 'No'

# # Apply the function to create a new column
# dengue_df['near_water_body'] = dengue_df.apply(lambda row: is_near_water_body(row['street_address'], row['region']), axis=1)




# # Show the first few rows of the updated dataframe
# dengue_df


In [153]:
# yes_water = dengue_df[dengue_df['near_water_body'] == 'Yes']

# yes_water

### Rainfall Dengue Merged

In [154]:
# Load the climate dataset; the path is relative to the current working directory.
rainfall_df = pd.read_csv('climate_final.csv')

# Preview the first rows to check the columns that were read.
rainfall_df.head()



Unnamed: 0,Region,Area,Date,Daily_Rainfall,Min_Temperature (°C),Max_Temperature (°C),Mean_Temperature (°C)
0,North,Admiralty,01-01-2020,0.0,27.5,31.1,25.4
1,North,Admiralty,02-01-2020,0.0,27.4,30.3,25.0
2,North,Admiralty,03-01-2020,0.2,27.5,30.4,25.7
3,North,Admiralty,04-01-2020,7.0,26.7,30.3,25.2
4,North,Admiralty,05-01-2020,0.0,27.6,31.6,25.1


In [155]:
# Lower-case the 'Region' column name (presumably to line up with dengue_df['region'];
# confirm against the merge step), then save a copy of the frame.
rainfall_df = rainfall_df.rename({'Region': 'region'}, axis='columns')

rainfall_df.to_csv('rainfall_data.csv')

In [156]:
# Re-display the dengue frame for reference next to the rainfall data.
dengue_df.head()

Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num,year,region
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,20/03/2017,3,2017,East
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,20/03/2017,3,2017,East
2,2,bunga rampai place,1.338931,103.883537,1,1,32,25/09/2015,9,2015,East
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,25/09/2015,9,2015,Central
4,3,mount vernon road,1.340377,103.87949,1,1,32,25/09/2015,9,2015,North-East


In [157]:
# Latest year present in the dengue data. 'year' holds strings, so this is a
# lexicographic max; that agrees with the numeric max only while every year has 4 digits.
dengue_df['year'].max()

'2020'

In [158]:
# # Data cleaning

# # Convert 'Date' to datetime
# rainfall_df['Date'] = pd.to_datetime(rainfall_df['Date'], format='%d-%m-%Y', errors='coerce')

# # Convert numerical columns to float
# rainfall_df['Daily_Rainfall'] = pd.to_numeric(rainfall_df['Daily_Rainfall'], errors='coerce')
# rainfall_df[['Mean_Temperature', 'Max_Temperature', 'Min_Temperature']] = rainfall_df['Mean,Max,Min Temperature (°C)'].str.split(',', expand=True).apply(pd.to_numeric, errors='coerce')
# rainfall_df[['Mean_Wind_Speed', 'Max_Wind_Speed']] = rainfall_df['Mean,Max Wind Speed (km/h)'].str.split(',', expand=True).apply(pd.to_numeric, errors='coerce')

# # Drop unnecessary columns
# rainfall_df.drop(columns=['Station', 'Highest Rainfall (mm) (30,60,120) min', 'Mean,Max,Min Temperature (°C)', 'Mean,Max Wind Speed (km/h)'], inplace=True)

# # Drop rows with missing dates
# rainfall_df.dropna(subset=['Date'], inplace=True)

# # Adding year and month columns for aggregation
# rainfall_df['Year'] = rainfall_df['Date'].dt.year
# rainfall_df['Month'] = rainfall_df['Date'].dt.month

# # Aggregating data to monthly averages
# monthly_rainfall = rainfall_df.groupby(['Year', 'Month']).agg({
#     'Daily_Rainfall_Total': 'mean',
#     'Mean_Temperature': 'mean'
# }).reset_index()

# monthly_rainfall.columns = ['year', 'month_num', 'avg_rainfall_mm', 'avg_temperature_celsius']

# # Displaying the cleaned and aggregated dataset
# monthly_rainfall.head()

In [159]:
# #SOCIOECONOMIC

# # Load the uploaded dataframe

# # List of known elderly-dense areas in Singapore
# elderly_dense_areas_keywords = [
#     "bedok", "bukit merah", "queenstown", "toa payoh", 
#     "ang mo kio", "marine parade", "yishun", "tampines", 
#     "geylang", "kampong glam", "jurong east", "bukit batok",
#     "bukit panjang", "serangoon", "hougang", "changi", "pasir ris", 
#     "woodlands", "sengkang", "punggol", "novena", "tanglin", 
#     "bukit timah", "clementi", "jurong west",
#     "bencoolen", "bishan", "braddell", "eunos", "kallang",
#     "potong pasir", "redhill", "tanjong pagar", "telok blangah",
#     "tech whye", "ghim moh", "dover", "redhill", "tiong bahru", "bukit ho swee"
# ]

# # Function to determine if the area is elderly-dense with more contextual knowledge
# def is_elderly_dense_accurate(street_address):
#     address = street_address.lower()
#     if any(area in address for area in elderly_dense_areas_keywords):
#         return 1
#     else:
#         return 0

# dengue_df['elderly_dense'] = dengue_df['street_address'].apply(is_elderly_dense_accurate)


# dengue_df


In [160]:
import pandas as pd
from geopy.distance import geodesic
from sklearn.neighbors import BallTree
import numpy as np

# Load the existing dataframe


# MRT stops used as candidates for the nearest-station lookup.
# Maps station name -> (latitude, longitude) in decimal degrees, the order the
# haversine BallTree is built with.
# The duplicate 'Buona Vista' key that closed the original literal was removed: both
# entries had the same coordinates, and a repeated key silently overrides the first.
# NOTE(review): only the stations listed here can ever be returned; coverage looks
# partial, so confirm the list is complete enough for the analysis.
mrt_stops_detailed = {
    'Pasir Ris': (1.3732, 103.9496),
    'Tampines': (1.3541, 103.9457),
    'Bedok': (1.3244, 103.9297),
    'Kallang': (1.3114, 103.8713),
    'Aljunied': (1.3164, 103.8821),
    'Boon Lay': (1.3406, 103.7064),
    'Pioneer': (1.3374, 103.6974),
    'Jurong East': (1.3331, 103.7422),
    'Bukit Batok': (1.3491, 103.7498),
    'Clementi': (1.3154, 103.7653),
    'Redhill': (1.2894, 103.8161),
    'Tiong Bahru': (1.2862, 103.8261),
    'Queenstown': (1.2946, 103.8054),
    'Commonwealth': (1.3020, 103.7983),
    'Dover': (1.3114, 103.7783),
    'Buona Vista': (1.3077, 103.7890),
    'Paya Lebar': (1.3181, 103.8943),
    'Eunos': (1.3194, 103.9038),
    'Kembangan': (1.3201, 103.9121),
    'Tanah Merah': (1.3273, 103.9460),
    'Expo': (1.3342, 103.9622),
    'Changi Airport': (1.3571, 103.9873),
    'Lavender': (1.3077, 103.8631),
    'Bugis': (1.3005, 103.8564),
    'City Hall': (1.2930, 103.8514),
    'Raffles Place': (1.2839, 103.8511),
    'Tanjong Pagar': (1.2764, 103.8442),
    'Outram Park': (1.2785, 103.8390),
    'Chinatown': (1.2857, 103.8442),
    'Clarke Quay': (1.2883, 103.8465),
    'Dhoby Ghaut': (1.2989, 103.8463),
    'Somerset': (1.3000, 103.8378),
    'Orchard': (1.3035, 103.8325),
    'Newton': (1.3135, 103.8393),
    'Novena': (1.3200, 103.8431),
    'Toa Payoh': (1.3326, 103.8470),
    'Braddell': (1.3411, 103.8475),
    'Bishan': (1.3516, 103.8481),
    'Ang Mo Kio': (1.3692, 103.8490),
    'Yio Chu Kang': (1.3818, 103.8442),
    'Khatib': (1.4174, 103.8321),
    'Yishun': (1.4294, 103.8354),
    'Sembawang': (1.4491, 103.8207),
    'Canberra': (1.4432, 103.8298),
    'Admiralty': (1.4406, 103.8006),
    'Woodlands': (1.4365, 103.7863),
    'Marsiling': (1.4322, 103.7746),
    'Kranji': (1.4250, 103.7622),
    'Yew Tee': (1.3973, 103.7473),
    'Choa Chu Kang': (1.3855, 103.7442),
    'Bukit Gombak': (1.3588, 103.7515),
    'Hillview': (1.3621, 103.7656),
    'Beauty World': (1.3418, 103.7755),
    'King Albert Park': (1.3350, 103.7831),
    'Sixth Avenue': (1.3291, 103.7970),
    'Tan Kah Kee': (1.3253, 103.8077),
    'Botanic Gardens': (1.3225, 103.8151),
    'Farrer Road': (1.3176, 103.8072),
    'Holland Village': (1.3112, 103.7963),
}

# Split the station table into two parallel sequences: mrt_names_detailed[i] is the
# station whose coordinates are row i of mrt_coordinates_detailed.
mrt_names_detailed = list(mrt_stops_detailed)
mrt_coordinates_detailed = np.array([mrt_stops_detailed[name] for name in mrt_names_detailed])

# Index the stations in a BallTree; the haversine metric works on radians, hence the conversion.
tree_detailed = BallTree(np.radians(mrt_coordinates_detailed), metric='haversine')

# Function to find the nearest MRT stop using the BallTree
def find_nearest_mrt_detailed(latitude, longitude):
    """Return the name of the MRT stop closest to the given coordinate.

    `latitude` and `longitude` are in degrees; they are converted to radians to
    match the tree, which is queried for its single nearest neighbour.
    """
    query_point = np.radians([(latitude, longitude)])
    _, neighbour_idx = tree_detailed.query(query_point, k=1)
    return mrt_names_detailed[neighbour_idx[0][0]]

# Label every row with its nearest MRT stop.
# The tree is queried once with all (latitude, longitude) pairs instead of once
# per row through DataFrame.apply(axis=1); the nearest-neighbour result for each
# point is the same, but the Python-level per-row overhead is avoided.
_, nearest_mrt_idx = tree_detailed.query(
    np.radians(dengue_df[['latitude', 'longitude']].to_numpy()),
    k=1,
)
# nearest_mrt_idx has one column (k=1); map each index back to its station name.
dengue_df['mrt_stop'] = [mrt_names_detailed[i] for i in nearest_mrt_idx[:, 0]]

# List of known elderly-dense areas in Singapore.
# Entries are lowercase because they are matched as substrings of the
# lowercased street address.
elderly_dense_areas_keywords = [
    "bedok", "bukit merah", "queenstown", "toa payoh",
    "ang mo kio", "marine parade", "yishun", "tampines",
    "geylang", "kampong glam", "jurong east", "bukit batok",
    "bukit panjang", "serangoon", "hougang", "changi", "pasir ris",
    "woodlands", "sengkang", "punggol", "novena", "tanglin",
    "bukit timah", "clementi", "jurong west",
    "bencoolen", "bishan", "braddell", "eunos", "kallang",
    "potong pasir", "redhill", "tanjong pagar", "telok blangah",
    # Fixed spelling: this was "tech whye", which can never match an address
    # in Teck Whye (e.g. "teck whye lane").
    "teck whye", "ghim moh"
]

# Additional keywords associated with elderly-dense areas
elderly_facilities_keywords = [
    "senior", "elderly", "geriatric", "nursing home", "retirement village",
    "care center", "senior activity center", "active ageing hub"
]

# Including additional elderly-dense areas similar to Ghim Moh and Teck Whye
additional_elderly_dense_areas = [
    "bukit timah", "dover", "redhill", "telok blangah",
    "tiong bahru", "bukit ho swee"
]

# Update the list of keywords for elderly-dense areas
elderly_dense_areas_keywords.extend(additional_elderly_dense_areas)

def is_elderly_dense_accurate(street_address):
    """Flag an address as lying in an elderly-dense area.

    The address is lowercased and checked, by substring match, first against
    ``elderly_dense_areas_keywords`` and then against
    ``elderly_facilities_keywords``.

    Parameters
    ----------
    street_address : str
        Free-text street address.

    Returns
    -------
    int
        1 if any keyword from either list occurs in the address, otherwise 0.
    """
    lowered = street_address.lower()
    keyword_groups = (elderly_dense_areas_keywords, elderly_facilities_keywords)
    has_match = any(
        keyword in lowered
        for group in keyword_groups
        for keyword in group
    )
    return 1 if has_match else 0

# Run each street address through the elderly-dense check and store the
# resulting 0/1 flag as a new column.
dengue_df['elderly_dense'] = dengue_df['street_address'].map(is_elderly_dense_accurate)

dengue_df


Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num,year,region,mrt_stop,elderly_dense
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,20/03/2017,3,2017,East,Pasir Ris,1
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,20/03/2017,3,2017,East,Pasir Ris,1
2,2,bunga rampai place,1.338931,103.883537,1,1,32,25/09/2015,9,2015,East,Aljunied,0
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,25/09/2015,9,2015,Central,Aljunied,0
4,3,mount vernon road,1.340377,103.879490,1,1,32,25/09/2015,9,2015,North-East,Aljunied,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56971,1,chai chee street (block 43),1.328380,103.925500,3,2,3,15/05/2017,5,2017,East,Bedok,0
56972,2,chai chee street (block 45),1.328806,103.924377,3,2,3,15/05/2017,5,2017,East,Bedok,0
56973,2,kang ching road (block 339d),1.338952,103.722240,4,2,2,15/05/2017,5,2017,West,Boon Lay,0
56974,1,lorong 4 toa payoh (block 60),1.336470,103.850664,5,2,2,15/05/2017,5,2017,Central,Toa Payoh,1


In [161]:
# Analyse the effectiveness of two vector-control programmes by tagging each
# row with the policy covering its nearest MRT stop:
#   - 'Enhanced Vector Control Measures - Wolbachia' (started in 2016)
#   - 'Gravitraps Deployment (GD)' (started in 2017)

# Convert the year column to int so it can be compared with the start years.
dengue_df['year'] = dengue_df['year'].astype(int)

# 'Tampines' was misspelt 'Tampinese', so rows near that stop were always
# labelled 'Others'.
# NOTE(review): confirm every name below is spelt exactly like the
# corresponding key in mrt_stops_detailed, otherwise it silently never matches.
gd_stops = ['Serangoon', 'Kovan', 'Newton', 'Novena', 'Bedok', 'Tampines', 'Jurong East', 'Clementi']
evcm_stops = ['Punggol', 'Sengkang', 'Hougang', 'Toa Payoh', 'Ang Mo Kio', 'Bishan']

# Boolean masks: the stop is covered by the policy AND the case is dated on or
# after the policy's start year.
under_gd = dengue_df['mrt_stop'].isin(gd_stops) & (dengue_df['year'] >= 2017)
under_evcm = dengue_df['mrt_stop'].isin(evcm_stops) & (dengue_df['year'] >= 2016)

# np.select uses the first condition that holds, so Gravitraps Deployment takes
# precedence over Wolbachia exactly like the previous if/elif chain; rows that
# match neither get the default label.
policies = np.select(
    [under_gd, under_evcm],
    ['Gravitraps Deployment', 'Wolbachia'],
    default='Others',
).tolist()

dengue_df['policies'] = policies

dengue_df

Unnamed: 0,num_cases,street_address,latitude,longitude,cluster_num,recent_cases_in_cluster,total_cases_in_cluster,date,month_num,year,region,mrt_stop,elderly_dense,policies
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,20/03/2017,3,2017,East,Pasir Ris,1,Others
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,20/03/2017,3,2017,East,Pasir Ris,1,Others
2,2,bunga rampai place,1.338931,103.883537,1,1,32,25/09/2015,9,2015,East,Aljunied,0,Others
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,25/09/2015,9,2015,Central,Aljunied,0,Others
4,3,mount vernon road,1.340377,103.879490,1,1,32,25/09/2015,9,2015,North-East,Aljunied,0,Others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56971,1,chai chee street (block 43),1.328380,103.925500,3,2,3,15/05/2017,5,2017,East,Bedok,0,Gravitraps Deployment
56972,2,chai chee street (block 45),1.328806,103.924377,3,2,3,15/05/2017,5,2017,East,Bedok,0,Gravitraps Deployment
56973,2,kang ching road (block 339d),1.338952,103.722240,4,2,2,15/05/2017,5,2017,West,Boon Lay,0,Others
56974,1,lorong 4 toa payoh (block 60),1.336470,103.850664,5,2,2,15/05/2017,5,2017,Central,Toa Payoh,1,Wolbachia


In [162]:
# Persist the enriched dataset. index=False keeps the positional RangeIndex
# out of the file; writing it would add an unnamed first column that shows up
# as 'Unnamed: 0' when the CSV is read back (the same junk column this
# notebook drops right after loading its input).
# NOTE(review): any consumer that reads this file with index_col=0 must be
# updated accordingly.
dengue_df.to_csv('dengue_df_final.csv', index=False)