In [None]:
import zipfile
import os

# Locations of the zipped raw dataset and of the folder it is unpacked into
raw_data_path = "data/raw/LBSM_All_London.zip"
extract_dir = "data/raw/"

# Extract every member of the archive into extract_dir; the context manager
# closes the archive once extraction has finished.
with zipfile.ZipFile(raw_data_path) as archive:
    archive.extractall(path=extract_dir)

print(f"Unzipped to: {extract_dir}")

In [None]:
import pandas as pd

# Define the CSV path
csv_path = os.path.join(extract_dir, "LBSM_All_London.csv")

# Load the whole CSV into memory in one go (no chunksize is passed; add
# chunksize=... to pd.read_csv if memory becomes a problem).
# A failed load is reported and then re-raised: swallowing it would leave `df`
# undefined and make every later cell fail with an unrelated NameError.
try:
    df = pd.read_csv(csv_path)
except Exception as e:
    print(f"Error: {e}")
    raise
else:
    print("Data loaded successfully!")

In [None]:
# Display pandas' descriptive-statistics summary of df (the cell's output).
df.describe()

In [None]:
# Missing values per column, as a count and as a percentage of all rows.
# Both are placed in one frame and left as the cell's final expression: only
# the last expression of a cell is displayed, so a bare `df.isnull().sum()`
# on an earlier line would never be shown.
missing_count = df.isnull().sum()
missing_summary = pd.DataFrame({
    "missing_count": missing_count,
    "missing_percent": (missing_count / len(df)) * 100,
})
missing_summary



In [None]:
# Remove exact duplicate rows before any column-level cleaning.
df = df.drop_duplicates()

# Columns that are almost entirely empty and are therefore removed outright.
high_missing_cols = [
    'BASEMENT_FLOOR',  # 93.1% missing
    'AGG_DEC_CURRENT_OPERATIONAL_RATING_BY_COUNT',  # 99.6% missing
    'AGG_DEC_CURRENT_OPERATIONAL_RATING_BAND_BY_COUNT',
    'AGG_DEC_CURRENT_OPERATIONAL_RATING_BY_FLOORAREA',
    'AGG_DEC_CURRENT_OPERATIONAL_RATING_BAND_BY_FLOORAREA',
    'DOMINANT_ND_ACTIVITY_BY_C2_FS',  # 85.6% missing
    'DOMINANT_ND_ACTIVITY_BY_C2_COUNT'  # 84.7% missing
]

# errors="ignore" keeps the cell re-runnable: on a second execution the columns
# are already gone and a strict drop would raise KeyError.
df = df.drop(columns=high_missing_cols, errors="ignore")



In [None]:
# Sanity check that `df` exists and holds a value.
# The lookup goes through globals() so that a `df` which was never assigned
# (e.g. the load cell failed or was skipped) is reported by the first branch
# instead of raising NameError.
if globals().get("df") is None:
    print("DataFrame is not loaded properly!")
else:
    print("DataFrame loaded successfully!")


In [None]:
# df.info() prints its report directly, so it is called first; df.head(5) is
# kept as the cell's final expression so that its preview is actually
# displayed (a non-final bare expression produces no output).
df.info()  # View info about the DataFrame
df.head(5)  # View the first few rows


In [None]:
# Columns whose gaps are filled with the column median
numeric_cols = [
    'ESTIMATED_FLOOR_COUNT',
    'ESTIMATE_TOTAL_FLOOR_AREA_ALL',
    'MEAN_OBJECT_HEIGHT_M',
]
for col in numeric_cols:
    # Report and skip any expected column that is absent from the frame
    if col not in df.columns:
        print(f"Column {col} not found in the DataFrame")
        continue
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

# EPC data: rows without a current energy-efficiency value are removed rather
# than imputed
df = df.dropna(
    axis=0,
    how='any',
    subset=['AGG_EPC_CURRENT_ENERGY_EFFICIENCY_BY_COUNT'],
)

In [None]:
print(df.columns.tolist())

# Drop the SCU_ID, COUNT_D_UPRNS and COUNT_ND_UPRNS columns.
# (Rows with missing OA/LSOA/MSOA values are removed in the next cell, not here.)
# errors="ignore" keeps the cell re-runnable: on a second execution the columns
# are already gone and a strict drop would raise KeyError.
df = df.drop(columns=['SCU_ID', 'COUNT_D_UPRNS', 'COUNT_ND_UPRNS'], errors="ignore")

In [None]:
# Keep only the rows in which OA, LSOA and MSOA are all present
# (a row is dropped if any one of the three is missing).
df = df.dropna(subset=['OA', 'LSOA', 'MSOA'])

In [None]:
# Percentage of missing values per column; only columns that still contain
# gaps are printed
null_counts = df.isnull().sum()
missing_percent = (null_counts / len(df)) * 100
columns_with_gaps = missing_percent[missing_percent > 0]
print("Remaining Missing Values (%):")
print(columns_with_gaps)

# NOTE: the cleaned data is saved in a later cell (df.to_parquet), after the remaining dropna steps.


In [32]:
# MIX_CLASS and the potential-EPC aggregate columns: a row must have a value
# in every one of these to be kept
cols_to_dropna = [
    'MIX_CLASS',
    'AGG_EPC_POTENTIAL_ENERGY_EFFICIENCY_BY_COUNT',
    'AGG_EPC_POTENTIAL_ENERGY_RATING_BY_COUNT',
    'AGG_EPC_POTENTIAL_ENERGY_EFFICIENCY_BY_FLOORAREA',
    'AGG_EPC_POTENTIAL_ENERGY_RATING_BY_FLOORAREA',
]
# Discard each row that is missing a value in any of the listed columns
df = df.dropna(axis=0, how='any', subset=cols_to_dropna)

In [33]:
# Current EPC / RVEPC floor-area aggregates: a row must have a value in every
# one of these to be kept
cols_to_dropna_tiny = [
    'AGG_EPC_CURRENT_ENERGY_EFFICIENCY_BY_FLOORAREA',
    'AGG_EPC_CURRENT_ENERGY_RATING_BY_FLOORAREA',
    'AGG_RVEPC_CURRENT_ENERGY_EFFICIENCY_BY_FLOORAREA',
    'AGG_RVEPC_CURRENT_ENERGY_RATING_BY_FLOORAREA',
]
# Discard each row that is missing a value in any of the listed columns
df = df.dropna(how='any', subset=cols_to_dropna_tiny, axis=0)

In [None]:
# Recompute the per-column missing percentage and print the columns that
# still contain gaps
missing_percent = df.isnull().sum().div(len(df)).mul(100)
print("Remaining Missing Values (%):")
print(missing_percent[missing_percent.gt(0)])

In [35]:
# Renumber the index from 0; drop=True discards the old index instead of
# keeping it as a new column.
df = df.reset_index(drop=True)

In [36]:
# Write the cleaned frame to Parquet. The output folder is created first so
# the write does not fail when data/processed/ does not exist yet.
clean_parquet_path = "data/processed/lbsm_clean.parquet"
os.makedirs(os.path.dirname(clean_parquet_path), exist_ok=True)
df.to_parquet(clean_parquet_path)

In [None]:
# Print the current column names of df as a plain list.
print(df.columns.tolist())
     

In [None]:
# Columns to remove from the frame. The list is held in a plain variable and
# passed to DataFrame.drop: assigning it to `df.columns_to_drop` would only set
# an attribute on the DataFrame object and remove nothing.
columns_to_drop = [
    'OA', 'LSOA', 'MSOA', 'OAC',  # Census codes (not critical for buildings)
    'AGG_DEC_CURRENT_OPERATIONAL_RATING_BY_COUNT',
    'AGG_DEC_CURRENT_OPERATIONAL_RATING_BAND_BY_COUNT',
    'AGG_DEC_CURRENT_OPERATIONAL_RATING_BY_FLOORAREA',
    'AGG_DEC_CURRENT_OPERATIONAL_RATING_BAND_BY_FLOORAREA',
    'DEC_COUNT'  # DEC data (less critical)
]

# errors="ignore": the AGG_DEC_* columns are already removed by the earlier
# high-missing-columns cell, so a strict drop would raise KeyError; it also
# keeps this cell re-runnable.
df = df.drop(columns=columns_to_drop, errors="ignore")


In [None]:
# Display the dtype of every column (the cell's output).
df.dtypes



In [40]:
# List of columns that should be numeric
# NOTE(review): the *_ENERGY_RATING_* names suggest these may hold rating
# bands rather than numbers — confirm; non-numeric text is coerced to NaN below.
columns_to_convert = [
    'AGG_EPC_CURRENT_ENERGY_RATING_BY_COUNT',
    'AGG_EPC_CURRENT_ENERGY_RATING_BY_FLOORAREA',
    'AGG_EPC_POTENTIAL_ENERGY_RATING_BY_COUNT',
    'AGG_EPC_POTENTIAL_ENERGY_RATING_BY_FLOORAREA',
    'AGG_RVEPC_CURRENT_ENERGY_RATING_BY_COUNT',
    'AGG_RVEPC_CURRENT_ENERGY_RATING_BY_FLOORAREA'
]

# Null counts before conversion, so that values lost to coercion can be reported
nulls_before_conversion = df[columns_to_convert].isnull().sum()

# Convert to numeric, forcing errors to NaN
df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# errors='coerce' discards unparseable values silently; surface how many were
# turned into NaN per column instead of losing them unnoticed.
coerced_to_nan = df[columns_to_convert].isnull().sum() - nulls_before_conversion
if coerced_to_nan.any():
    print("Values coerced to NaN during numeric conversion:")
    print(coerced_to_nan[coerced_to_nan > 0])


In [None]:
# Print the number of missing values in each column of df
print(df.isnull().sum())
