In [1]:
# Initial imports
import pandas as pd
from pathlib import Path

#regex library has functionality beyond the built-in Python module re
import regex as re

## Loading and Preprocessing National Bridge Inventory Data



In [2]:
# Loading data
# Read the National Bridge Inventory CSV into a DataFrame. low_memory=False makes
# pandas parse the file in one pass instead of in chunks, which avoids
# mixed-type inference within a column (see the pandas read_csv documentation).
bridge_data = pd.read_csv("Resources/NTAD_National_Bridge_Inventory.csv", low_memory=False)
# Last expression of the cell: display the first rows as a quick sanity check.
bridge_data.head()

Unnamed: 0,OBJECTID,STATE_CODE_001,STRUCTURE_NUMBER_008,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,ROUTE_NUMBER_005D,DIRECTION_005E,HIGHWAY_DISTRICT_002,COUNTY_CODE_003,...,SUBMITTED_BY,BRIDGE_CONDITION,LOWEST_RATING,DECK_AREA,STATUS,DATE,LATDD,LONGDD,x,y
0,1,1,00000000000S702,1,6,1,4007,0,9,53,...,62,F,5,145.5,ST,723,31.103039,-87.570411,-87.570411,31.103039
1,2,1,00000000000S703,1,6,1,4003,0,9,53,...,62,F,6,133.28,ST,723,31.105611,-87.569139,-87.569139,31.105611
2,3,1,0000000000M0022,1,8,1,0,0,0,113,...,74,F,5,387.0,ST,723,32.295361,-84.977219,-84.977219,32.295361
3,4,1,000000883039900,1,4,1,88,0,2,59,...,67,G,7,687.75,AM,1202,34.454228,-87.975517,-87.975517,34.454228
4,5,1,000001014002450,1,3,1,101,0,2,79,...,67,F,5,15453.36,MM,403,34.813317,-87.382372,-87.382372,34.813317


In [3]:
# create copy to clean file for SQL import
# Cleaning is done on this copy so that bridge_data keeps the values exactly as
# they were loaded from the CSV.
bridge_data_clean = bridge_data.copy()

In [4]:
# Single-character codes that are treated as non-values: '/', '*', 'N' and 'n'.
_NON_VALUE_CODES = frozenset({'/', '*', 'N', 'n'})


def replace_with_null(value):
    """Return '' when ``value`` is a non-value placeholder, else ``value`` unchanged.

    A value is a placeholder when its string form is exactly one of
    '/', '*', 'N' or 'n'. Strings that merely contain those characters
    (for example 'NN', 'N/A' or ' N') are not placeholders and are returned
    untouched.

    A set lookup is used rather than regex full-matches because the patterns
    are fixed single characters and this function is applied to every cell of
    the selected columns, so the per-call cost matters.

    Parameters
    ----------
    value : object
        A single cell value of any type; it is tested through ``str(value)``.

    Returns
    -------
    object
        The empty string for a placeholder, otherwise the very same ``value``
        object that was passed in (no type conversion is performed).
    """
    if str(value) in _NON_VALUE_CODES:
        return ''
    return value

# Columns whose coded values can contain the non-value placeholders
# that replace_with_null blanks out.
series_list = [
    'RAILINGS_036A',
    'TRANSITIONS_036B',
    'APPR_RAIL_036C',
    'APPR_RAIL_END_036D',
    'NAVIGATION_038',
    'DECK_COND_058',
    'SUPERSTRUCTURE_COND_059',
    'SUBSTRUCTURE_COND_060',
    'CHANNEL_COND_061',
    'CULVERT_COND_062',
    'STRUCTURAL_EVAL_067',
    'DECK_GEOMETRY_EVAL_068',
    'UNDCLRENCE_EVAL_069',
    'WATERWAY_EVAL_071',
    'DECK_STRUCTURE_TYPE_107',
    'SURFACE_TYPE_108A',
    'MEMBRANE_TYPE_108B',
    'DECK_PROTECTION_108C',
]

# Run the cleaner over every cell of those columns and write the result back
# into the working copy; all other columns are left untouched.
bridge_data_clean[series_list] = (
    bridge_data_clean[series_list]
    .map(replace_with_null)
)

In [5]:
# export clean df (column names still in their original case at this point)
# NOTE(review): a later cell writes the lowercase-column frame to this same
# path, so on a full run the file produced here is overwritten.
clean_csv_path = Path("output_data/bridge_data_clean.csv")
# to_csv does not create missing directories; make sure the target folder
# exists so the export also succeeds on a fresh checkout.
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
bridge_data_clean.to_csv(clean_csv_path, index=False, lineterminator=None, sep=',')

In [6]:
# Lowercase every column label for the SQL import.
# This renames the columns of bridge_data_clean itself; no second DataFrame is created.
bridge_data_clean.columns = list(map(str.lower, bridge_data_clean.columns))
print(bridge_data_clean.columns)

Index(['objectid', 'state_code_001', 'structure_number_008',
       'record_type_005a', 'route_prefix_005b', 'service_level_005c',
       'route_number_005d', 'direction_005e', 'highway_district_002',
       'county_code_003',
       ...
       'submitted_by', 'bridge_condition', 'lowest_rating', 'deck_area',
       'status', 'date', 'latdd', 'longdd', 'x', 'y'],
      dtype='object', length=130)


In [7]:
# export the clean df again, now with lowercase column names for SQL import
# (same path as the earlier export cell, so that file is replaced)
clean_csv_path = Path("output_data/bridge_data_clean.csv")
# to_csv does not create missing directories; make sure the target folder
# exists so the export also succeeds on a fresh checkout.
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
bridge_data_clean.to_csv(clean_csv_path, index=False, lineterminator=None, sep=',')

In [8]:
# Inspect the raw frame as loaded (bridge_data, not the cleaned copy).
# info() prints the index range, column count, dtype counts and memory usage.
bridge_data.info()
# Last expression of the cell: display summary statistics.
bridge_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 621581 entries, 0 to 621580
Columns: 130 entries, OBJECTID to y
dtypes: float64(46), int64(41), object(43)
memory usage: 616.5+ MB


Unnamed: 0,OBJECTID,STATE_CODE_001,RECORD_TYPE_005A,ROUTE_PREFIX_005B,SERVICE_LEVEL_005C,DIRECTION_005E,COUNTY_CODE_003,PLACE_CODE_004,CRITICAL_FACILITY_006B,MIN_VERT_CLR_010,...,YEAR_OF_FUTURE_ADT_115,MIN_NAV_CLR_MT_116,SUBMITTED_BY,LOWEST_RATING,DECK_AREA,DATE,LATDD,LONGDD,x,y
count,621581.0,621581.0,621581.0,621581.0,621581.0,621581.0,621581.0,621581.0,0.0,621581.0,...,621581.0,269718.0,621581.0,621581.0,621581.0,621581.0,621581.0,621581.0,621581.0,621581.0
mean,310791.0,30.017867,1.0,3.564269,1.160912,0.221323,94.867071,21075.474754,,93.612089,...,2018.016585,0.04092,30.690039,6.208558,649.132506,777.806148,37.902957,-91.752495,-91.752495,37.902957
std,179435.123173,15.374181,0.0,1.375336,1.25633,0.675185,90.471975,28851.633603,,21.185916,...,180.48418,3.973447,15.930257,1.161318,2559.737351,294.600972,5.048528,13.059281,13.059281,5.048528
min,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,,0.0,...,0.0,0.0,1.0,0.0,1.92,203.0,0.0,-174.182281,-174.182281,0.0
25%,155396.0,18.0,1.0,3.0,1.0,0.0,33.0,0.0,,99.99,...,2034.0,0.0,18.0,5.0,94.72,603.0,34.409225,-97.15095,-97.15095,34.409225
50%,310791.0,29.0,1.0,4.0,1.0,0.0,77.0,0.0,,99.99,...,2040.0,0.0,30.0,6.0,212.48,723.0,38.487453,-90.234969,-90.234969,38.487453
75%,466186.0,42.0,1.0,4.0,1.0,0.0,127.0,41000.0,,99.99,...,2041.0,0.0,45.0,7.0,589.26,1202.0,41.316875,-83.006944,-83.006944,41.316875
max,621581.0,78.0,1.0,8.0,8.0,4.0,840.0,99999.0,,99.99,...,5038.0,999.9,79.0,9.0,372399.36,1202.0,70.21645,144.820158,144.820158,70.21645


In [9]:
# Print every column name as a plain list; the default Index repr abbreviates
# long column lists with '...'.
print(bridge_data_clean.columns.tolist())

['objectid', 'state_code_001', 'structure_number_008', 'record_type_005a', 'route_prefix_005b', 'service_level_005c', 'route_number_005d', 'direction_005e', 'highway_district_002', 'county_code_003', 'place_code_004', 'features_desc_006a', 'critical_facility_006b', 'facility_carried_007', 'location_009', 'min_vert_clr_010', 'kilopoint_011', 'base_hwy_network_012', 'lrs_inv_route_013a', 'subroute_no_013b', 'lat_016', 'long_017', 'detour_kilos_019', 'toll_020', 'maintenance_021', 'owner_022', 'functional_class_026', 'year_built_027', 'traffic_lanes_on_028a', 'traffic_lanes_und_028b', 'adt_029', 'year_adt_030', 'design_load_031', 'appr_width_mt_032', 'median_code_033', 'degrees_skew_034', 'structure_flared_035', 'railings_036a', 'transitions_036b', 'appr_rail_036c', 'appr_rail_end_036d', 'history_037', 'navigation_038', 'nav_vert_clr_mt_039', 'nav_horr_clr_mt_040', 'open_closed_posted_041', 'service_on_042a', 'service_und_042b', 'structure_kind_043a', 'structure_type_043b', 'appr_kind_044

In [10]:
# Spot-check a single column, addressed by its lowercased name (the columns
# were lowercased in an earlier cell).
print(bridge_data_clean['year_reconstructed_106'])

0            0.0
1            0.0
2            0.0
3            0.0
4         1962.0
           ...  
621576       NaN
621577       NaN
621578       NaN
621579       NaN
621580       NaN
Name: year_reconstructed_106, Length: 621581, dtype: float64
