In [1]:
# Import libraries here
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted

In [None]:
%load_ext sql
%sql sqlite:////home/jovyan/nepal.sqlite

In [None]:
%%sql
SELECT DISTINCT(district_id)
FROM id_map
LIMIT 5

In [None]:
%%sql
SELECT COUNT(*)
FROM id_map
WHERE district_id = 1

In [None]:
%%sql
SELECT COUNT(*)
FROM id_map
WHERE district_id = 3

In [None]:
%%sql
SELECT DISTINCT(im.building_id) AS b_id,
    bs.*,
    bd.damage_grade
FROM id_map AS im
JOIN building_structure AS bs ON im.building_id = bs.building_id
JOIN building_damage AS bd ON im.building_id = bd.building_id
WHERE district_id = 3
LIMIT 5

In [None]:
# Build your `wrangle` function here
def wrangle(dbpath, district_id=3):
    """Load one district's buildings from SQLite and prepare them for modeling.

    Joins ``id_map``, ``building_structure`` and ``building_damage`` on
    ``building_id``, derives the binary ``severe_damage`` target, and drops
    the columns that are not used as features.

    Parameters
    ----------
    dbpath : str
        Path to the SQLite database file.
    district_id : int, default 3
        Value of ``district_id`` to select. The default keeps the behavior
        of the original single-district version.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``b_id``. Contains the remaining ``building_structure``
        columns plus ``severe_damage`` (1 when the damage grade is above 3,
        otherwise 0).
    """
    # The district is bound as a SQL parameter instead of being baked into
    # the query text.
    query = '''
    SELECT DISTINCT(im.building_id) AS b_id,
        bs.*,
        bd.damage_grade
    FROM id_map AS im
    JOIN building_structure AS bs ON im.building_id = bs.building_id
    JOIN building_damage AS bd ON im.building_id = bd.building_id
    WHERE district_id = ?
    '''

    conn = sqlite3.connect(dbpath)
    try:
        df = pd.read_sql(query, conn, index_col='b_id', params=(district_id,))
    finally:
        # Always release the database handle, even if the query fails.
        conn.close()

    # The grade number is the last character of the ``damage_grade`` string.
    grade = df['damage_grade'].str[-1].astype(int)
    df['severe_damage'] = (grade > 3).astype(int)

    # Drop every post-earthquake column, the redundant id, the raw grade the
    # target was derived from, and ``count_floors_pre_eq``.
    drop_cols = [col for col in df.columns if 'post_eq' in col]
    drop_cols += ['building_id', 'damage_grade', 'count_floors_pre_eq']

    return df.drop(columns=drop_cols)

In [None]:
# Run the wrangle step against the local database and preview the result.
db_path = '/home/jovyan/nepal.sqlite'
df = wrangle(db_path)
df.head()

In [None]:
# Plot value counts of `"severe_damage"`
# Relative frequency of each class, drawn as a bar chart.
class_balance = df['severe_damage'].value_counts(normalize=True)
ax = class_balance.plot(kind='bar')
ax.set_xlabel('Severe Damage')
ax.set_ylabel('Relative Frequency')
ax.set_title('Kavrepalanchok, Class Balance')

# Don't delete the code below 👇
plt.savefig("images/4-5-6.png", dpi=150)


In [None]:
# Distribution of plinth area for each value of the target.
ax = sns.boxplot(x='severe_damage', y='plinth_area_sq_ft', data=df)
ax.set_xlabel('Severe Damage')
ax.set_ylabel('Plinth Area [sq. ft.]')
ax.set_title('Kavrepalanchok, Plinth Area vs Building Damage')
# Don't delete the code below 👇
plt.savefig("images/4-5-7.png", dpi=150)


In [None]:
# Mean of `severe_damage` (i.e. the share of severely damaged buildings) for
# each roof type. The aggregation is passed by name because handing NumPy
# callables such as `np.mean` to pandas aggregations is deprecated.
roof_pivot = pd.pivot_table(
    df, values='severe_damage', index='roof_type', aggfunc='mean'
)
roof_pivot

In [None]:
# Separate the label column from the feature matrix.
target = 'severe_damage'
y = df[target]
X = df.drop(columns=[target])
for label, part in (("X shape:", X), ("y shape:", y)):
    print(label, part.shape)

In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)
for label, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
):
    print(label, part.shape)

In [None]:
# Baseline: the relative frequency of the most common class in the training labels.
train_class_shares = y_train.value_counts(normalize=True)
acc_baseline = train_class_shares.max()
print("Baseline Accuracy:", round(acc_baseline, 2))

In [None]:
# Number of distinct values in every non-integer column.
non_int_columns = df.select_dtypes(exclude='int')
non_int_columns.nunique()

In [None]:
# One-hot encode the features, then fit a logistic regression on the result.
lr_steps = [
    ('encoder', OneHotEncoder(use_cat_names=True)),
    ('regressor', LogisticRegression(solver='newton-cg')),
]
model_lr = Pipeline(steps=lr_steps)
model_lr.fit(X_train, y_train)

In [None]:
# Score the logistic regression on both the training and the validation rows.
lr_train_pred = model_lr.predict(X_train)
lr_val_pred = model_lr.predict(X_val)
lr_train_acc = accuracy_score(y_train, lr_train_pred)
lr_val_acc = accuracy_score(y_val, lr_val_pred)

print("Logistic Regression, Training Accuracy Score:", lr_train_acc)
print("Logistic Regression, Validation Accuracy Score:", lr_val_acc)

In [None]:
# Fit one decision tree per candidate depth and record its accuracy on the
# training and validation sets.
depth_hyperparams = range(1, 16)
training_acc, validation_acc = [], []
for d in depth_hyperparams:
    tree = DecisionTreeClassifier(max_depth=d, random_state=42)
    model_dt = Pipeline(steps=[('encoder', OrdinalEncoder()), ('classifier', tree)])
    model_dt.fit(X_train, y_train)

    train_pred = model_dt.predict(X_train)
    val_pred = model_dt.predict(X_val)
    training_acc.append(accuracy_score(y_train, train_pred))
    validation_acc.append(accuracy_score(y_val, val_pred))
    

In [None]:
# Tabulate the sweep results, one row per max_depth value.
acc_by_depth = {'training_acc': training_acc, 'validation_acc': validation_acc}
validation_df = pd.DataFrame(data=acc_by_depth, index=depth_hyperparams)
validation_df

In [None]:
# Plot each accuracy column against the frame's index (the max_depth values)
# so the x-axis shows real depths rather than 0-based row positions, and
# label the two curves so they can be told apart.
plt.plot(validation_df.index, validation_df['training_acc'], label='training')
plt.plot(validation_df.index, validation_df['validation_acc'], label='validation')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy Score')
plt.title('Validation Curve, Decision Tree Model')
plt.legend()
# Don't delete the code below 👇
plt.savefig("images/4-5-15.png", dpi=150)

In [None]:
# Highest validation accuracy reached across the depth sweep.
best_validation_acc = validation_df['validation_acc'].max()
best_validation_acc

In [None]:
# Take the depth from the sweep results instead of hard-coding it.
# `idxmax` returns the first index label with the highest validation
# accuracy, i.e. the shallowest of any tied depths.
best_depth = int(validation_df['validation_acc'].idxmax())
final_model_dt = Pipeline([
    ('encoder', OrdinalEncoder()),
    ('classifier', DecisionTreeClassifier(max_depth=best_depth, random_state=42)),
])
final_model_dt.fit(X_train, y_train)

In [None]:
# Read the test features and predict with the final tree pipeline.
test_features_path = "data/kavrepalanchok-test-features.csv"
X_test = pd.read_csv(test_features_path, index_col="b_id")
y_test_pred = final_model_dt.predict(X_test)
y_test_pred[:5]

In [None]:
# Pair each feature name seen by the fitted tree with its importance score.
fitted_tree = final_model_dt.named_steps['classifier']
feat_names = fitted_tree.feature_names_in_
feat_imp = pd.Series(data=fitted_tree.feature_importances_, index=feat_names)
feat_imp.head()

In [None]:
# Create horizontal bar chart of feature importances
# Sort ascending so the most important feature ends up at the top of the
# chart, and label the figure so it can be read on its own.
feat_imp_sorted = feat_imp.sort_values()
plt.barh(feat_imp_sorted.index, feat_imp_sorted.values)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Kavrepalanchok Decision Tree, Feature Importance')

# Don't delete the code below 👇
plt.tight_layout()
plt.savefig("images/4-5-19.png", dpi=150)
