In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

from mlops_ods.config import compose_config

In [2]:
cfg = compose_config(overrides=['settings.path_to_data=dataset'])

# Data
https://www.kaggle.com/datasets/new-york-city/ny-2015-street-tree-census-tree-data

Street tree data 2015. Tree data collected includes tree species, diameter and perception of health.

In [3]:
def download_kaggle_dataset_if_not_exist(path_to_data_folder: str,
                                         file_name_with_data: str) -> None:
    """
    Download dataset from kaggle if it's not already exists in path_to_data folder with file_name name

    :param path_to_data_folder: path to folder with data
    :param file_name_with_data: name of file with data
    :return: None
    """
    # Check if the file exists in the folder
    file_path = os.path.join(path_to_data_folder, file_name_with_data)
    if not os.path.exists(file_path):
        # If not, download it
        try:
            !kaggle datasets download -d new-york-city/ny-2015-street-tree-census-tree-data -p $path_to_data --unzip
            print(f"Downloaded {file_name_with_data} from Kaggle")
        except Exception as e:
            print(f"Error downloading {file_name_with_data}: {str(e)}")
    else:
        print(f"{file_name_with_data} already exists")

In [4]:
path_to_data = os.path.join(os.path.dirname(os.getcwd()), cfg.settings.path_to_data)
file_name = cfg.settings.file_name

In [5]:
download_kaggle_dataset_if_not_exist(path_to_data, file_name)

2015-street-tree-census-tree-data.csv already exists


# Task
Let's predict health of tree if it's alive

In [6]:
df = pd.read_csv(f"{path_to_data}/{file_name}")

In [7]:
df = df[~df['health'].isna()]

In [8]:
df['health'].value_counts()

health
Good    528850
Fair     96504
Poor     26818
Name: count, dtype: int64

Most of trees are good. We need to predict health of trees, because if we predict that health will be worse in future, that we can do something to cure tree and avoiding to trees dead.

Let's drop not informative columns

In [9]:
drop_cols = ['block_id','created_at','status','address','latitude', 'longitude',
             'x_sp','y_sp','bin', 'bbl','census tract','state','council district',
             'boro_ct','nta','st_senate','st_assem','cncldist','postcode','community board',
             'borocode','stump_diam','spc_latin','nta_name']
df.drop(drop_cols, axis=1, inplace=True)

# Preprocessing

In [10]:
def yes_no_to_numeric(column: pd.Series) -> pd.Series:
    """
    Take column with two values and convert it to binary numeric:
    {'Yes':1, 'No':0}

    :param column: column with two values 'Yes' and 'No'
    :return: column with two values 1 and 0
    """
    return (column == 'Yes') * 1

In [11]:
columns_yes_no = ['root_stone','root_grate','root_other','trunk_wire','trnk_light','trnk_other','brch_light','brch_shoe','brch_other',]
for col in columns_yes_no:
    df[col] = yes_no_to_numeric(df[col])

In [12]:
df['curb_loc'] = (df['curb_loc'] == 'OnCurb') * 1
df['sidewalk'] = np.where(df['sidewalk'] == 'Damage', 1, 0)
df['steward'] = df['steward'].map({'1or2': 1,'3or4': 2,'4orMore':3}).fillna(0).astype(int)
df['guards'] = df['guards'].map({'Harmful': 1,'Unsure': 2,'Helpful':3}).fillna(0).astype(int) 
df['spc_common'] = df['spc_common'].fillna('n/d')
df['problems'] = df['problems'].fillna('').apply(lambda x: len(x.split(',')))
df['health'] = df['health'].map({'Poor': 0,'Fair': 1,'Good':2}).astype(int) 

In [13]:
df.shape

(652172, 21)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 652172 entries, 0 to 683787
Data columns (total 21 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   tree_id     652172 non-null  int64 
 1   tree_dbh    652172 non-null  int64 
 2   curb_loc    652172 non-null  int64 
 3   health      652172 non-null  int64 
 4   spc_common  652172 non-null  object
 5   steward     652172 non-null  int64 
 6   guards      652172 non-null  int64 
 7   sidewalk    652172 non-null  int64 
 8   user_type   652172 non-null  object
 9   problems    652172 non-null  int64 
 10  root_stone  652172 non-null  int64 
 11  root_grate  652172 non-null  int64 
 12  root_other  652172 non-null  int64 
 13  trunk_wire  652172 non-null  int64 
 14  trnk_light  652172 non-null  int64 
 15  trnk_other  652172 non-null  int64 
 16  brch_light  652172 non-null  int64 
 17  brch_shoe   652172 non-null  int64 
 18  brch_other  652172 non-null  int64 
 19  zip_city    652172 non-null 

# model

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('health', axis=1), df['health'], 
                                                    test_size=0.3, random_state=42, stratify=df['health'])

In [16]:
num_cols = cfg.features.numerical

In [17]:
clf = LogisticRegression(random_state=42, max_iter=300, C=10)
clf.fit(X_train[num_cols], y_train)

In [18]:
preds = clf.predict_proba(X_test[num_cols])
# preds = np.argmax(preds, axis=1)

In [19]:
roc_auc_score(y_test, preds, multi_class='ovr')

0.6146971836606226

In [20]:
###

In [21]:
clf = DecisionTreeClassifier(random_state=42, max_depth=10)
clf.fit(X_train[num_cols], y_train)

In [22]:
preds = clf.predict_proba(X_test[num_cols])
roc_auc_score(y_test, preds, multi_class='ovr')

0.6163755142226428

In [23]:
###

In [24]:
cat_cols = cfg.features.categorical
total_cols = num_cols + cat_cols

In [25]:
clf = CatBoostClassifier(iterations=cfg.model.iterations, 
                         verbose=cfg.model.verbose,
                         random_seed=cfg.model.random_seed,
                         cat_features=cat_cols)
clf.fit(X_train[total_cols], y_train)

<catboost.core.CatBoostClassifier at 0x31f99e490>

In [26]:
preds = clf.predict_proba(X_test[total_cols])
roc_auc_score(y_test, preds, multi_class='ovr')

0.7083698883944992

In [33]:
# sample data
X_train.loc[503146, total_cols].to_dict()

{'tree_dbh': 22,
 'curb_loc': 1,
 'steward': 0,
 'guards': 0,
 'sidewalk': 1,
 'problems': 1,
 'root_stone': 0,
 'root_grate': 0,
 'root_other': 0,
 'trunk_wire': 0,
 'trnk_light': 0,
 'trnk_other': 0,
 'brch_light': 1,
 'brch_shoe': 0,
 'brch_other': 0,
 'spc_common': 'green ash',
 'zip_city': 'Ozone Park',
 'borough': 'Queens',
 'user_type': 'NYC Parks Staff'}

In [34]:
y_train.loc[503146]

2

In [42]:
health_classes = {0: 'Poor',1: 'Fair', 2: 'Good'}
preds_class = clf.predict(X_train.loc[503146, total_cols])[0]

In [43]:
health_classes[preds_class]

'Good'

In [47]:
{name_health: round(score, 4) for name_health, score in zip(health_classes.values(), clf.predict_proba(X_train.loc[503146, total_cols]))}

{'Poor': 0.0178, 'Fair': 0.1503, 'Good': 0.8319}