Predicting Damage with Logistic Regression


In [None]:
import sqlite3
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder
from IPython.display import VimeoVideo
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted

# Silence FutureWarning messages for the rest of the notebook.
warnings.simplefilter(action="ignore", category=FutureWarning)

Prepare Data

Import

In [None]:
def wrangle(db_path):
    """Load the district-4 building records from a SQLite database.

    Joins ``id_map``, ``building_structure`` and ``building_damage`` on
    ``building_id`` and keeps the rows where ``district_id = 4``.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        The query result, indexed by ``b_id``.
    """
    # Construct query
    query = """
        SELECT distinct(i.building_id) AS b_id,
           s.*,
           d.damage_grade
        FROM id_map AS i
        JOIN building_structure AS s ON i.building_id = s.building_id
        JOIN building_damage AS d ON i.building_id = d.building_id
        WHERE district_id = 4
    """

    # Connect to database. A sqlite3 connection is not closed automatically,
    # so release it explicitly even when the query raises.
    conn = sqlite3.connect(db_path)
    try:
        # Read query results into DataFrame
        df = pd.read_sql(query, conn, index_col="b_id")
    finally:
        conn.close()

    return df

In [None]:
# Load the query results with wrangle(); the returned DataFrame is indexed by "b_id".
# The SQLite database is located at "/home/jovyan/nepal.sqlite".
df = wrangle("/home/jovyan/nepal.sqlite")
df.head()

In [None]:
# Collect, with an explicit loop, the name of every column containing "post_eq".
# These are the features that wrangle() should drop from the DataFrame.
# NOTE(review): presumably these are post-earthquake measurements that would leak the target — confirm.
drop_cols=[]
for col in df.columns:
    if "post_eq" in col:
        drop_cols.append(col)
drop_cols 

In [None]:
# Same selection as a list comprehension: every column name mentioning "post_eq"
drop_cols = [name for name in df.columns if "post_eq" in name]
drop_cols

In [None]:
def wrangle(db_path):
    """Load the district-4 building records and drop the "post_eq" columns.

    Joins ``id_map``, ``building_structure`` and ``building_damage`` on
    ``building_id``, keeps the rows where ``district_id = 4`` and removes
    every column whose name contains ``"post_eq"``.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        The cleaned query result, indexed by ``b_id``.
    """
    # Construct query
    query = """
        SELECT distinct(i.building_id) AS b_id,
           s.*,
           d.damage_grade
        FROM id_map AS i
        JOIN building_structure AS s ON i.building_id = s.building_id
        JOIN building_damage AS d ON i.building_id = d.building_id
        WHERE district_id = 4
    """

    # Connect to database. A sqlite3 connection is not closed automatically,
    # so release it explicitly even when the query raises.
    conn = sqlite3.connect(db_path)
    try:
        # Read query results into DataFrame
        df = pd.read_sql(query, conn, index_col="b_id")
    finally:
        conn.close()

    # Identify leaky columns
    drop_cols = [col for col in df.columns if "post_eq" in col]

    # Drop columns
    df.drop(columns=drop_cols, inplace=True)

    return df

In [None]:
# Reload the data with the updated wrangle(); the returned DataFrame is indexed by "b_id".
# The SQLite database is located at "/home/jovyan/nepal.sqlite".
df = wrangle("/home/jovyan/nepal.sqlite")
df.head()

In [None]:
# Build the binary target "severe_damage": 1 when "damage_grade" is Grade 4 or above, else 0.
# The grade number is the last character of the "damage_grade" string.
# (wrangle() should do this too and drop "damage_grade" afterwards to avoid leakage.)
grade_numbers = df["damage_grade"].str[-1].astype(int)
df["damage_grade"] = grade_numbers
df["severe_damage"] = (grade_numbers > 3).astype(int)

In [None]:
def wrangle(db_path):
    """Load the district-4 building records and build the binary target.

    Joins ``id_map``, ``building_structure`` and ``building_damage`` on
    ``building_id`` and keeps the rows where ``district_id = 4``. Adds the
    column ``severe_damage`` (1 when the damage grade is above 3, else 0),
    then drops ``damage_grade`` and every column whose name contains
    ``"post_eq"``.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        The cleaned query result, indexed by ``b_id``.
    """
    # Construct query
    query = """
        SELECT distinct(i.building_id) AS b_id,
           s.*,
           d.damage_grade
        FROM id_map AS i
        JOIN building_structure AS s ON i.building_id = s.building_id
        JOIN building_damage AS d ON i.building_id = d.building_id
        WHERE district_id = 4
    """

    # Connect to database. A sqlite3 connection is not closed automatically,
    # so release it explicitly even when the query raises.
    conn = sqlite3.connect(db_path)
    try:
        # Read query results into DataFrame
        df = pd.read_sql(query, conn, index_col="b_id")
    finally:
        conn.close()

    # Identify leaky columns
    drop_cols = [col for col in df.columns if "post_eq" in col]

    # Create binary target: the grade number is the last character of the string
    df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
    df["severe_damage"] = (df["damage_grade"] > 3).astype(int)

    # Drop old target
    drop_cols.append("damage_grade")

    # Drop columns
    df.drop(columns=drop_cols, inplace=True)

    return df

In [None]:
# Reload the data with the updated wrangle(); the returned DataFrame is indexed by "b_id".
# The SQLite database is located at "/home/jovyan/nepal.sqlite".
df = wrangle("/home/jovyan/nepal.sqlite")
df.head()

Explore


In [None]:
# Correlation between the target and the pre-earthquake floor count
df["severe_damage"].corr(df["count_floors_pre_eq"])

In [None]:
# Correlation between the target and the pre-earthquake building height
df["severe_damage"].corr(df["height_ft_pre_eq"])

In [None]:
# Correlation heatmap of the numerical features in df.
# "severe_damage" is the target, so it is left out of the matrix.
numeric_features = df.select_dtypes("number").drop(columns="severe_damage")
# Pairwise correlation matrix of the remaining numerical columns
correlation = numeric_features.corr()
# Draw the matrix as a heatmap
sns.heatmap(correlation)

In [None]:
# wrangle() now also drops the "count_floors_pre_eq" column. Rerun all the cells above after changing it.
def wrangle(db_path):
    """Load the district-4 building records, build the target, drop extra columns.

    Joins ``id_map``, ``building_structure`` and ``building_damage`` on
    ``building_id`` and keeps the rows where ``district_id = 4``. Adds the
    column ``severe_damage`` (1 when the damage grade is above 3, else 0),
    then drops ``damage_grade``, ``count_floors_pre_eq`` and every column
    whose name contains ``"post_eq"``.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        The cleaned query result, indexed by ``b_id``.
    """
    # Construct query
    query = """
        SELECT distinct(i.building_id) AS b_id,
           s.*,
           d.damage_grade
        FROM id_map AS i
        JOIN building_structure AS s ON i.building_id = s.building_id
        JOIN building_damage AS d ON i.building_id = d.building_id
        WHERE district_id = 4
    """

    # Connect to database. A sqlite3 connection is not closed automatically,
    # so release it explicitly even when the query raises.
    conn = sqlite3.connect(db_path)
    try:
        # Read query results into DataFrame
        df = pd.read_sql(query, conn, index_col="b_id")
    finally:
        conn.close()

    # Identify leaky columns
    drop_cols = [col for col in df.columns if "post_eq" in col]

    # Create binary target: the grade number is the last character of the string
    df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
    df["severe_damage"] = (df["damage_grade"] > 3).astype(int)

    # Drop old target
    drop_cols.append("damage_grade")

    # Drop multicollinearity column (method call: the original had a comma
    # instead of a dot here, which raised NameError)
    drop_cols.append("count_floors_pre_eq")

    # Drop columns
    df.drop(columns=drop_cols, inplace=True)

    return df

In [None]:
# Reload the data with the updated wrangle(); the returned DataFrame is indexed by "b_id".
# The SQLite database is located at "/home/jovyan/nepal.sqlite".
df = wrangle("/home/jovyan/nepal.sqlite")
df.head()

In [None]:
# Boxplot of "height_ft_pre_eq" for each of the two "severe_damage" groups
ax = sns.boxplot(data=df, x="severe_damage", y="height_ft_pre_eq")
# Axis labels and title
ax.set_xlabel("severe damage")
ax.set_ylabel("height Pre-earthquake [ft.]")
ax.set_title("Distribution of building height by class")

In [None]:
# Bar chart of the relative frequency (normalize=True) of each "severe_damage" class.
# The y-axis label is set through `ylabel`; the original passed it as `y`,
# which is a data selector rather than a label.
df["severe_damage"].value_counts(normalize=True).plot(
    kind="bar",
    xlabel="class",
    ylabel="Relative frequency",
    title="class balance",
)

In [None]:
# Normalized value counts of df["severe_damage"], unpacked in the order
# value_counts() returns them into majority_class_prop and minority_class_prop
class_proportions = df["severe_damage"].value_counts(normalize=True)
majority_class_prop, minority_class_prop = class_proportions
print(majority_class_prop, minority_class_prop)

In [None]:
# Are buildings with certain foundation types more likely to suffer severe damage?
# Pivot table: one row per "foundation_type", holding the mean of "severe_damage",
# sorted by that mean.
# The aggregation is named by the string "mean": recent pandas releases
# deprecate passing the numpy callable for this purpose.
foundation_pivot = pd.pivot_table(
    df, index="foundation_type", values="severe_damage", aggfunc="mean"
).sort_values(by="severe_damage")
foundation_pivot

In [None]:
# Horizontal bar chart of foundation_pivot, with dashed vertical reference
# lines at the majority- and minority-class proportions for comparison
foundation_pivot.plot(kind="barh", legend=None)
reference_lines = [
    (majority_class_prop, "red", "majority class"),
    (minority_class_prop, "green", "minority class"),
]
for proportion, line_color, line_label in reference_lines:
    plt.axvline(proportion, linestyle="--", color=line_color, label=line_label)
plt.legend(loc="lower right")

In [None]:
# Count the unique values in each object-dtype column to spot
# high- or low-cardinality categorical features in the dataset.
df.select_dtypes("object").nunique()

In [None]:
def wrangle(db_path):
    """Load the district-4 building records and prepare them for modelling.

    Joins ``id_map``, ``building_structure`` and ``building_damage`` on
    ``building_id`` and keeps the rows where ``district_id = 4``. Adds the
    column ``severe_damage`` (1 when the damage grade is above 3, else 0),
    then drops ``damage_grade``, ``count_floors_pre_eq``, ``building_id``
    and every column whose name contains ``"post_eq"``.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        The cleaned query result, indexed by ``b_id``.
    """
    # Construct query
    query = """
        SELECT distinct(i.building_id) AS b_id,
           s.*,
           d.damage_grade
        FROM id_map AS i
        JOIN building_structure AS s ON i.building_id = s.building_id
        JOIN building_damage AS d ON i.building_id = d.building_id
        WHERE district_id = 4
    """

    # Connect to database. A sqlite3 connection is not closed automatically,
    # so release it explicitly even when the query raises.
    conn = sqlite3.connect(db_path)
    try:
        # Read query results into DataFrame
        df = pd.read_sql(query, conn, index_col="b_id")
    finally:
        conn.close()

    # Identify leaky columns
    drop_cols = [col for col in df.columns if "post_eq" in col]

    # Create binary target: the grade number is the last character of the string
    df["damage_grade"] = df["damage_grade"].str[-1].astype(int)
    df["severe_damage"] = (df["damage_grade"] > 3).astype(int)

    # Drop old target
    drop_cols.append("damage_grade")

    # Drop multicollinearity column
    drop_cols.append("count_floors_pre_eq")

    # Drop high-cardinality categorical column
    drop_cols.append("building_id")

    # Drop columns
    df.drop(columns=drop_cols, inplace=True)

    return df

In [None]:
# Reload the data with the final wrangle(); the returned DataFrame is indexed by "b_id".
# The SQLite database is located at "/home/jovyan/nepal.sqlite".
df = wrangle("/home/jovyan/nepal.sqlite")
df.head()

Split

In [None]:
# Separate the target vector y ("severe_damage") from the feature matrix X
target = "severe_damage"
y = df[target]
X = df.drop(columns=[target])

In [None]:
# Randomized train-test split: 20% of the rows are held out for testing,
# with a fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Report the shape of each split
splits = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
)
for split_name, split_data in splits:
    print(f"{split_name} shape:", split_data.shape)

Building Model

In [None]:
# Relative frequency of the most common class in the training target
y_train.value_counts(normalize=True).max()

In [None]:
# Baseline accuracy: the relative frequency of the most common training class
train_class_shares = y_train.value_counts(normalize=True)
acc_baseline = train_class_shares.max()
print("Baseline Accuracy:", round(acc_baseline, 2))

Iterate

In [None]:
# Pipeline named `model`: a OneHotEncoder transformer (use_cat_names=True)
# followed by a LogisticRegression predictor, fitted to the training data.
# Build model
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    # The solver's default cap of 100 iterations may stop before convergence on
    # unscaled features; a higher cap lets the optimizer finish.
    # NOTE(review): confirm a ConvergenceWarning was actually raised before adopting.
    LogisticRegression(max_iter=1000),
)
# Fit model to training data
model.fit(X_train, y_train)

Evaluate

In [None]:
# Accuracy of the model's predictions on the training set
accuracy_score(y_train, model.predict(X_train))

In [None]:
# Score of the model on the held-out test set
model.score(X_test, y_test)

In [None]:
# Training and test accuracy of the fitted model, both obtained from the
# pipeline's own score method
acc_train = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)

print("Training Accuracy:", round(acc_train, 2))
print("Test Accuracy:", round(acc_test, 2))

Communicate 


In [None]:
# First five class predictions for the training data
model.predict(X_train)[:5]

In [None]:
# Instead of predict, use predict_proba on the training data to compare
# its output with the class labels returned by predict.
# Keep the full matrix in the variable and slice only for display; the
# original stored just the first five rows and then sliced them again.
y_train_pred_proba = model.predict_proba(X_train)
print(y_train_pred_proba[:5])

In [None]:
# Extract the feature names (from the encoder step) and the importances
# (first row of the logistic regression coefficients) from the fitted pipeline,
# then pair them in a Series indexed by feature name.
# NOTE(review): newer category_encoders releases may expose get_feature_names_out()
# instead of get_feature_names() — confirm against the installed version.
features = model.named_steps["onehotencoder"].get_feature_names()
importances=model.named_steps["logisticregression"].coef_[0]
feat_imp = pd.Series(importances, index=features)
feat_imp

In [None]:
# odds_ratios: a Series indexed by feature name whose values are the
# exponential of the importances, sorted in ascending order
exponentiated_importances = np.exp(importances)
odds_ratios = pd.Series(data=exponentiated_importances, index=features)
odds_ratios = odds_ratios.sort_values()
odds_ratios.head()

In [None]:
# Horizontal bar chart of the five largest odds ratios
# (odds_ratios is sorted ascending, so these are its last five entries)
odds_ratios.tail().plot(kind="barh")
plt.xlabel("Odds Ratio")

In [None]:
# Horizontal bar chart of the five smallest odds ratios
# (odds_ratios is sorted ascending, so these are its first five entries)
odds_ratios.head().plot(kind="barh")
plt.xlabel("Odds Ratio")