# Building and Property Violations 
This notebook is used to clean the dataset of building and property violations: https://data.boston.gov/dataset/building-and-property-violations1/resource/800a2663-1d6a-46e7-9356-bedb70f5332c

Our goal is to gather violations from the years 2018-2024. 

In [None]:
import pandas as pd 

df = pd.read_csv("building_property_violations.csv")

df.head()

Unnamed: 0,_id,case_no,ap_case_defn_key,status_dttm,status,code,value,description,violation_stno,violation_sthigh,...,ward,contact_addr1,contact_addr2,contact_city,contact_state,contact_zip,sam_id,latitude,longitude,location
0,1,V91983,1013,,Closed,121.2,,Unsafe and Dangerous,302,,...,1,302 Sumner St,,East Boston,MA,2128,132380.0,42.367678,-71.03658,"(42.367678491254956, -71.0365803778755)"
1,2,V876069,1013,2025-12-04 15:48:22,Open,102.8,,Maintenance,46,,...,15,46 ADAMS,,DORCHESTER,MA,2122,726.0,42.30717,-71.062061,"(42.307169528727265, -71.06206056969255)"
2,3,V876068,1013,2025-12-04 15:27:46,Open,102.8,,Maintenance,70,,...,18,70 BIRCHCROFT RD,,HYDE PARK,MA,2136,15647.0,42.26694,-71.109811,"(42.26693960061158, -71.1098107673119)"
3,4,V876066,1013,2025-12-04 15:04:58,Open,102.8,,Maintenance,633,,...,7,707 NEPONSET STREET,,NORWOOD,MA,2062,52349.0,42.331319,-71.03592,"(42.33131949281363, -71.0359204565752)"
4,5,V876048,1013,2025-12-04 14:09:47,Open,116.0,,Unsafe Structures,74,,...,18,33 MAPLE STREET #206,,MALDEN,MA,2148,15648.0,42.26704,-71.109911,"(42.26703959975603, -71.10991076751128)"


In [2]:
#filter by years intersted
years_interest = ["2024", "2023", "2022", "2021", "2020", "2019", "2018"]

pattern = '|'.join(years_interest)
df_years = df[df['status_dttm'].str.contains(pattern, case=False, na=False)]
display(df_years)

Unnamed: 0,_id,case_no,ap_case_defn_key,status_dttm,status,code,value,description,violation_stno,violation_sthigh,...,ward,contact_addr1,contact_addr2,contact_city,contact_state,contact_zip,sam_id,latitude,longitude,location
709,710,V762114,1013,2024-12-31 10:59:36,Open,102.8,,Maintenance,14,,...,18,14 MT CALVARY RD,,ROSLINDALE,MA,02131,98107.0,42.280870,-71.113141,"(42.28086960340802, -71.11314074390258)"
710,711,V760019,1013,2024-12-31 10:46:19,Closed,105.1,,Failure to Obtain Permit,614,,...,09,64 Parker St,,Newton,MA,02459,92493.0,42.337200,-71.077451,"(42.33719954706552, -71.07745053724503)"
711,712,V807184,1013,2024-12-30 10:19:11,Closed,102.8,,Maintenance,51,,...,11,51 BEECH GLEN,,ROXBURY,MA,02119,13344.0,42.324460,-71.095251,"(42.32445957209921, -71.09525060592132)"
712,713,V806319,1013,2024-12-26 08:23:23,Closed,105.1,,Failure to Obtain Permit,1162,,...,17,1162 Morton ST,,MATTAPAN,MA,02126,97659.0,42.275810,-71.073231,"(42.27580954746868, -71.0732306647095)"
713,714,V805052,1013,2024-12-20 13:47:15,Closed,102.8,,Maintenance,1,5,...,21,C/O O'BRIENS PUB/GAIL ATKINSON,3 HARVARD AVE,BOSTON,MA,02134,156445.0,42.355261,-71.132901,"(42.3552614972918, -71.13290137350627)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6249,6250,V378763,1013,2018-01-03 09:46:38,Closed,105.1,,Failure to Obtain Permit,11,,...,20,11 WILLERS ST,,WEST ROXBURY,MA,02132,149770.0,42.259200,-71.150941,"(42.259199659774055, -71.15094087690251)"
6250,6251,V378751,1013,2018-01-03 09:04:18,Closed,105.1,,Failure to Obtain Permit,6,,...,14,C/O MONIQUE NJAMO ENOW,6 MALLON ROAD,DORCHESTER,MA,02121,88816.0,42.302650,-71.073971,"(42.302649545798204, -71.07397060631675)"
6251,6252,V378733,1013,2018-01-03 08:49:54,Closed,1003.6,,Means of Egress,58,,...,01,58 TRENTON ST,,EAST BOSTON,MA,02128,138380.0,42.378696,-71.037367,"(42.378696491815774, -71.03736735503051)"
6252,6253,V378660,1013,2018-01-02 15:57:02,Closed,105.1,,Failure to Obtain Permit,8,,...,14,8 STANWOOD ST,,DORCHESTER,MA,02121,130309.0,42.310140,-71.081381,"(42.31013955488775, -71.08138060662927)"


In [3]:
#remove columns with contact address - we are only interested in addresses of where the violations occur 

df_new = df.drop(['contact_addr1', 'contact_addr2', "contact_city", "contact_state", "contact_zip", "sam_id", "latitude", "longitude", "location"], axis=1) 

df_new.head() 

Unnamed: 0,_id,case_no,ap_case_defn_key,status_dttm,status,code,value,description,violation_stno,violation_sthigh,violation_street,violation_suffix,violation_city,violation_state,violation_zip,ward
0,1,V91983,1013,,Closed,121.2,,Unsafe and Dangerous,302,,Sumner,ST,East Boston,MA,2128,1
1,2,V876069,1013,2025-12-04 15:48:22,Open,102.8,,Maintenance,46,,Adams,ST,Dorchester,MA,2122,15
2,3,V876068,1013,2025-12-04 15:27:46,Open,102.8,,Maintenance,70,,Birchcroft,RD,Mattapan,MA,2136,18
3,4,V876066,1013,2025-12-04 15:04:58,Open,102.8,,Maintenance,633,,Eighth,ST,South Boston,MA,2127,7
4,5,V876048,1013,2025-12-04 14:09:47,Open,116.0,,Unsafe Structures,74,,Birchcroft,RD,Mattapan,MA,2136,18


In [4]:
df_new['full_address'] = df_new['violation_stno'].str.cat(df_new['violation_street'], sep=' ').str.cat(df_new['violation_suffix'], sep=' ').str.cat(df_new['violation_city'], sep=' ').str.cat(df_new['violation_zip'], sep=' ')

In [5]:
df_new = df_new.drop(['violation_stno', 'violation_suffix', 'violation_street', 'violation_city', 'violation_state', 'violation_zip'], axis=1)

df_new.head()

Unnamed: 0,_id,case_no,ap_case_defn_key,status_dttm,status,code,value,description,violation_sthigh,ward,full_address
0,1,V91983,1013,,Closed,121.2,,Unsafe and Dangerous,,1,302 Sumner ST East Boston 02128
1,2,V876069,1013,2025-12-04 15:48:22,Open,102.8,,Maintenance,,15,46 Adams ST Dorchester 02122
2,3,V876068,1013,2025-12-04 15:27:46,Open,102.8,,Maintenance,,18,70 Birchcroft RD Mattapan 02136
3,4,V876066,1013,2025-12-04 15:04:58,Open,102.8,,Maintenance,,7,633 Eighth ST South Boston 02127
4,5,V876048,1013,2025-12-04 14:09:47,Open,116.0,,Unsafe Structures,,18,74 Birchcroft RD Mattapan 02136


In [6]:
df_new.to_csv("cleaned_building_and_property_violations.csv", index=False)