In [None]:
# load dataset
df = pd.read_csv('../data/df_lownull.csv', parse_dates=True)
df = df.drop(columns=['Unnamed: 0'])

# clean column names (lowercase, remove spaces, turn '/' into '_')
df.columns = [col.lower().replace(' ', '').replace('/', '_') for col in df.columns]

# clean and convert columns
# regex=False makes every replacement an explicit literal match: '$' is a regex
# anchor, and the default value of `regex` differs between pandas versions.
df['annual$'] = (
    df['annual$']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
df['date'] = df['date'].astype('datetime64[ns]')
df['sale_ask'] = df['sale_ask'].astype('float')

# clean the 'sf' column before filtering
df['sf'] = (
    df['sf']
    .str.replace(',', '', regex=False)        # remove commas
    .str.replace(' \t-  ', '', regex=False)   # blank out the literal ' <tab>-  ' sequence
    .str.replace(' ', '', regex=False)        # remove spaces
    .str.replace('$', '', regex=False)        # remove $ signs
    .str.replace('sf', '', regex=False)       # remove the 'sf' unit suffix
    .replace('', np.nan)                      # empty strings become nan
)
df['sf'] = df['sf'].astype('float')  # convert 'sf' to float

# drop rows with nan values in specific columns
df = df.dropna(subset=['date', 'sale_ask', 'units', 'floors', 'sf', 'annual$'])

# drop rows where any of 'sf', 'units', or 'floors' equals zero
df = df[~(df[['sf', 'units', 'floors']] == 0).any(axis=1)]

# update and convert remaining columns
# Manual correction for the row labelled 12080.
# NOTE(review): the reason for this override is not recorded here - confirm against the raw data.
# Only patch the row if it survived the filters above: assigning through .at
# with a missing label would append a brand-new row that is NaN everywhere else.
if 12080 in df.index:
    df.at[12080, 'units'] = 1
df['units'] = df['units'].astype('float')
df['floors'] = df['floors'].astype('float')

# check results
print(df.shape)
df[df['sf'] == 0]  # should return an empty dataframe


In [None]:
# count sales per (year, month); reset_index(name='count') already produces
# the columns ['year', 'month', 'count'], so no manual renaming is needed
sales_per_month = df.groupby(['year', 'month']).size().reset_index(name='count')
sales_per_month['year_month'] = (
    sales_per_month['year'].astype(str)
    + '-'
    + sales_per_month['month'].astype(str).str.zfill(2)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    sales_per_month['year_month'],
    sales_per_month['count'],
    color='coral',
)

ax.set_xlabel('Year-Month')
ax.set_ylabel('Count')
ax.set_title('Sales Per Month')

# keep one tick per bar but label only the January bars for ease of viewing
tick_positions = range(len(sales_per_month))
tick_labels = [
    label if month == 1 else ""  # blank label for every month except January
    for label, month in zip(sales_per_month['year_month'], sales_per_month['month'])
]
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation=45, ha='right')

# add year vertical lines
# sales_per_month has a fresh 0..n-1 index from reset_index, so the index
# labels of the January rows coincide with their bar positions on the x axis
start_of_year_indices = sales_per_month[sales_per_month['month'] == 1].index
for idx in start_of_year_indices:
    ax.axvline(x=idx, color='cornflowerblue', linestyle='-', linewidth=0.5)

# run tight_layout only after the rotated tick labels exist, so the layout
# reserves room for them
plt.tight_layout()
plt.show()