Earthquake Damage in Kavrepalanchok NP

In [None]:
# Import libraries here
import sqlite3
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder, OrdinalEncoder
from IPython.display import VimeoVideo
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.validation import check_is_fitted

# Suppress FutureWarning messages for the rest of the session so they do not
# clutter cell output.
warnings.simplefilter(action="ignore", category=FutureWarning)

Prepare Data

Connect

In [None]:
# Load the SQL magic extension, then connect it to the SQLite database file
# so the `%%sql` cells below can query it.
%load_ext sql
%sql sqlite:////home/jovyan/nepal.sqlite

In [None]:
%%sql
/* List every row of sqlite_schema, the schema table of the database. */
SELECT *
FROM sqlite_schema

In [None]:
%%sql
/* List the names of the tables in the database.
   The string literal uses single quotes, because double quotes denote identifiers in SQL. */
SELECT name
FROM sqlite_schema
WHERE type = 'table'

In [None]:
%%sql
/* Which districts are represented in the id_map table.
   Lists the unique values in the district_id column. */
SELECT DISTINCT district_id
FROM id_map

In [None]:
%%sql
/* Count the observations in the id_map table associated with district 1.
   An ungrouped count returns exactly one row, so no LIMIT is needed. */
SELECT count(*)
FROM id_map
WHERE district_id = 1

In [None]:
%%sql
/* Count the observations in the id_map table associated with district 3. */
SELECT count(*)
FROM id_map
WHERE district_id = 3

In [None]:
%%sql
/* Preview the first five rows of the building_damage table. */
SELECT *
FROM building_damage
LIMIT 5

In [None]:
%%sql
/*
Join the unique building IDs from Kavrepalanchok (district 3) in id_map with all
the columns from building_structure and the damage_grade column from building_damage.
The building_id column of id_map is renamed b_id and the result is limited to the
first five rows. DISTINCT applies to the whole selected row, not only to b_id.
*/
SELECT DISTINCT i.building_id AS b_id,
    s.*,
    d.damage_grade
FROM id_map AS i
JOIN building_structure AS s ON i.building_id = s.building_id
JOIN building_damage AS d ON i.building_id = d.building_id
WHERE i.district_id = 3
LIMIT 5

In [None]:
# Wrangle function: runs the join query from the previous task and returns a
# DataFrame with a binary "severe_damage" target, without the columns that could
# cause leakage or multicollinearity in the model.
def wrangle(db_path):
    """Load the district 3 building data from SQLite and prepare it for modelling.

    Parameters
    ----------
    db_path : str
        Path to a SQLite database file containing the ``id_map``,
        ``building_structure`` and ``building_damage`` tables.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``b_id``. Contains the ``building_structure`` columns that
        survive the drops below plus ``severe_damage`` (1 when the damage grade
        is greater than 3, otherwise 0).
    """
    # DISTINCT applies to the whole selected row; district_id is qualified with
    # the id_map alias so the filter cannot become ambiguous across the joins.
    query = """
        SELECT DISTINCT i.building_id AS b_id,
            s.*,
            d.damage_grade
        FROM id_map AS i
        JOIN building_structure AS s ON i.building_id = s.building_id
        JOIN building_damage AS d ON i.building_id = d.building_id
        WHERE i.district_id = 3
    """

    # Read query results into a DataFrame, always releasing the connection
    # (a sqlite3 connection is not closed automatically when it goes out of scope).
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql(query, conn, index_col="b_id")
    finally:
        conn.close()

    # Create binary target column. The grade is taken from the last character of
    # the text value (presumably of the form "Grade N" -- confirm against the data).
    df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
    df["severe_damage"] = (df["damage_grade"] > 3).astype(int)

    # Leaky columns: anything recorded after the earthquake
    drop_cols = [col for col in df.columns if "post_eq" in col]
    # Original multi-class target, replaced by "severe_damage"
    drop_cols.append("damage_grade")
    # Dropped to avoid multicollinearity
    drop_cols.append("count_floors_pre_eq")
    # High-cardinality identifier column
    drop_cols.append("building_id")

    # Return a new frame instead of mutating in place
    return df.drop(columns=drop_cols)

In [None]:
# Build the cleaned DataFrame from the project database and preview its first rows.
df = wrangle("/home/jovyan/nepal.sqlite")
df.head()

In [None]:
# `wrangle` has already created the binary "severe_damage" target and dropped
# "damage_grade". Repeating those steps here would raise a KeyError, and comparing
# the 0/1 target with 3 would set every value to 0. Verify the result instead.
assert "damage_grade" not in df.columns
assert df["severe_damage"].isin([0, 1]).all()

In [None]:
# Pairwise correlation of the numeric features only (the target is excluded).
# The stray bare `correlation` expression was removed: only the last expression
# of a cell is displayed, so it had no effect.
correlation = df.select_dtypes("number").drop(columns="severe_damage").corr()
# Plot heatmap of `correlation`
sns.heatmap(correlation)

Explore

In [None]:
# Are the classes in this dataset balanced? Bar chart of the normalized value
# counts of "severe_damage", with the required axis labels and title.
df["severe_damage"].value_counts(normalize=True).plot.bar()
plt.gca().set(
    xlabel="Severe Damage",
    ylabel="Relative Frequency",
    title="Kavrepalanchok, Class Balance",
)
# Don't delete the code below 👇
plt.savefig("images/4-5-6.png", dpi=150)


In [None]:
# Is there a relationship between the footprint size of a building and the damage
# it sustained? Boxplot of "plinth_area_sq_ft" for both groups of "severe_damage".
# Required labels: x-axis "Severe Damage", y-axis "Plinth Area [sq. ft.]",
# title "Kavrepalanchok, Plinth Area vs Building Damage".
sns.boxplot(x=df["severe_damage"], y=df["plinth_area_sq_ft"])
plt.xlabel("Severe Damage")
plt.ylabel("Plinth Area [sq. ft.]")
plt.title("Kavrepalanchok, Plinth Area vs Building Damage")
# Don't delete the code below 👇
plt.savefig("images/4-5-7.png", dpi=150)


In [None]:
# Are buildings with certain roof types more likely to suffer severe damage?
# Pivot table indexed by "roof_type" holding the mean of "severe_damage".
roof_pivot = df.pivot_table(
    index="roof_type",
    values="severe_damage",
    aggfunc="mean",
)
roof_pivot

Split


In [None]:
# Separate the "severe_damage" target vector y from the feature matrix X.
target = "severe_damage"
y = df[target]
X = df.drop(columns=[target])
print("X shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Randomized split into training and validation sets; the validation set is 20%
# of the data and the seed is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

In [None]:
# Baseline accuracy: the relative frequency of the most common class in y_train.
acc_baseline = y_train.value_counts(normalize=True).max()
print("Baseline Accuracy:", round(acc_baseline, 2))

Iterate


In [None]:
# Logistic regression model for building damage, with the categorical features
# one-hot encoded first. Step names match the ones make_pipeline would generate.
model_lr = Pipeline(
    steps=[
        ("onehotencoder", OneHotEncoder(use_cat_names=True)),
        ("logisticregression", LogisticRegression()),
    ]
)
model_lr.fit(X_train, y_train)

In [None]:
# Training and validation accuracy of model_lr (score() of a classifier pipeline
# is the accuracy of its predictions).
lr_train_acc = model_lr.score(X_train, y_train)
lr_val_acc = model_lr.score(X_val, y_val)

print("Logistic Regression, Training Accuracy Score:", lr_train_acc)
print("Logistic Regression, Validation Accuracy Score:", lr_val_acc)

In [None]:
# Perhaps a decision tree will perform better than logistic regression, but what is
# the best value for max_depth? Train and evaluate model_dt at every depth from
# 1 to 15, recording the training and validation accuracy at each depth.
# The grader evaluates the validation accuracy scores only.
depth_hyperparams = range(1, 16)
training_acc = []
validation_acc = []
for d in depth_hyperparams:
    # Fixed random_state keeps the fitted trees repeatable across runs
    model_dt = make_pipeline(
        OrdinalEncoder(),
        DecisionTreeClassifier(max_depth=d, random_state=42),
    )
    model_dt.fit(X_train, y_train)
    # Calculate training accuracy score and append to `training_acc`
    training_acc.append(model_dt.score(X_train, y_train))
    # Calculate validation accuracy score and append to `validation_acc`
    validation_acc.append(model_dt.score(X_val, y_val))

print("Training Accuracy Scores:", training_acc[:6])
print("Validation Accuracy Scores:", validation_acc[:6])

In [None]:
# Validation curve for model_dt from the values in training_acc and validation_acc.
# Required labels: x-axis "Max Depth", y-axis "Accuracy Score",
# title "Validation Curve, Decision Tree Model", plus a legend.
plt.plot(depth_hyperparams, training_acc, label="training")
plt.plot(depth_hyperparams, validation_acc, label="validation")
plt.xlabel("Max Depth")
plt.ylabel("Accuracy Score")
plt.title("Validation Curve, Decision Tree Model")
plt.legend()
# Don't delete the code below 👇
plt.savefig("images/4-5-15.png", dpi=150)


In [None]:
# Build and train the final decision tree, final_model_dt, using the max_depth that
# gave the best validation accuracy.
# NOTE(review): max_depth=10 is presumably read off the validation curve -- confirm
# it still matches the best entry of validation_acc after any data change.
final_model_dt = make_pipeline(
    OrdinalEncoder(),
    DecisionTreeClassifier(max_depth=10, random_state=42),
)
# Fit model to training data (not the validation set, which is reserved for
# evaluating the model)
final_model_dt.fit(X_train, y_train)


Evaluate

In [None]:
# How does the model perform on the test set? Read the test features into X_test
# (indexed by "b_id"), predict with final_model_dt, and preview the first five
# predictions before submitting them to the grader.
X_test = pd.read_csv(
    "data/kavrepalanchok-test-features.csv",
    index_col="b_id",
)
y_test_pred = final_model_dt.predict(X_test)
y_test_pred[:5]

In [None]:
# What are the most important features for final_model_dt? Create a Series of Gini
# importances named feat_imp, where the index labels are the feature names of the
# dataset and the values are the feature importances of the model, sorted from
# smallest to largest importance.
features = X_train.columns
importances = final_model_dt.named_steps["decisiontreeclassifier"].feature_importances_
feat_imp = pd.Series(importances, index=features).sort_values()
feat_imp.head()

In [None]:
# Horizontal bar chart of the feature importances stored in feat_imp
feat_imp.plot.barh()
plt.gca().set(
    xlabel="Gini Importance",
    ylabel="Label",
    title="Kavrepalanchok Decision Tree, Feature Importance",
)
# Don't delete the code below 👇
plt.tight_layout()
plt.savefig("images/4-5-19.png", dpi=150)