<a href="https://colab.research.google.com/github/raidahmorshed/Complete-Python-3-Bootcamp/blob/master/ML_algorithm_with_CT_files_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from google.colab import files

## HOLC Grades by Census Tract

In [3]:
uploaded = files.upload()

Saving HOLC Grades on Census Tracts.csv to HOLC Grades on Census Tracts.csv


In [4]:
# File with tract grades and 1940s characteristics
data = pd.read_csv('HOLC Grades on Census Tracts.csv')
data.head()

Unnamed: 0,GISJOIN,G13,P16UND40,P60UP40,P75UP40,PWHITE40,PBLACK40,PASIAN40,PHISP40,PRUFB40,...,EDU25UP40,LAB14UP40,EMP14LAB40,FBW40,GISJOIN2,SHAPE_area,SHAPE_len,grade,GEOID10,GISJOIN_2
0,55007900090,1940.0,37.46,3.08,0.58,99.19,0.81,0.0,0.0,0.97,...,1641,1975,952,273,55007900090,0.00021,0.109902,,55079000000.0,G5500790004100
1,55007900094,1940.0,25.98,5.42,1.38,99.96,0.0,0.04,0.0,0.39,...,1744,2121,1067,229,55007900094,0.000494,0.144064,C,55079010000.0,G5500790005400
2,55007900106,1940.0,30.97,3.97,0.66,100.0,0.0,0.0,0.0,0.28,...,557,755,328,165,55007900106,0.000212,0.096056,B,55079010000.0,G5500790012800
3,55007900120,1940.0,21.78,10.42,2.26,100.0,0.0,0.0,0.08,0.5,...,2357,3043,1426,582,55007900120,6.5e-05,0.043239,C,55079020000.0,G5500790016100
4,55007900121,1940.0,21.53,8.02,1.43,99.96,0.0,0.04,0.0,0.33,...,1630,2183,1034,349,55007900121,6.6e-05,0.035106,C,55079020000.0,G5500790017100


In [5]:
len(data)

7562

In [9]:
# Create column to indicate treatment/control group
data['TREATED'] = data['grade'].notna().astype(int)

In [11]:
# Clean up grades
valid_grades = {'A', 'B', 'C', 'D'}
data['grade'] = data['grade'].str.strip().str.upper()
data['grade'] = data['grade'].apply(lambda x: x if x in valid_grades else np.nan)
holc_grades = data['grade'].unique()
holc_grades

array([nan, 'C', 'B', 'D', 'A'], dtype=object)

In [12]:
# Function to count unique grades in a dataset
def count_grades(df, column_name):
  print(f"{column_name}:\n")
  for grade in holc_grades:
      count = len(df[df[column_name] == grade])
      total_count = len(df[df[column_name].notnull()])
      percentage = (count / total_count) * 100
      print(f"Grade {grade}: {percentage}")
  print("\n")

In [13]:
# Original HOLC grades distribution for tracts
count_grades(data, 'grade')

grade:

Grade nan: 0.0
Grade C: 41.650453955901426
Grade B: 17.493514915693904
Grade D: 35.76523994811932
Grade A: 5.090791180285343




In [14]:
# Create a df for all tracts which were given grades to train ML algorithm
with_grades = data[data['grade'].notnull()]
print(len(with_grades))

6168


In [15]:
# Create a df for all tracts to predict grades for
without_grades = data[data['grade'].isnull()]
print(len(without_grades))

1394


In [16]:
# Ensure the length of the dfs match
len(with_grades)+len(without_grades) == len(data)

True

In [17]:
# Percentage of data with HOLC grades
print((len(with_grades)/len(data)*100))

81.56572335361015


## Dataset Splitting and Cleaning

In [18]:
# ML algorithm setup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [19]:
# Features to base the ML algorithms on
features = [
    'P16UND40', 'P60UP40', 'P75UP40', 'PWHITE40', 'PBLACK40', 'PASIAN40',
    'PHISP40', 'PRUFB40', 'PITFB40', 'PGEFB40', 'PIRFB40', 'PSCFB40',
    'PMEX40', 'PCUBAN40', 'PWFB40', 'PHS40', 'PCOL40', 'PUNEMP40',
    'PFLABF40', 'PPROF40', 'PSEMP40', 'POWN40', 'PVAC40', 'PMULTI40',
    'MRENT40', 'MHMVAL40', 'P15WHT40', 'P60WHT40', 'P15NWHT40', 'P60NWHT40',
    'POP40', 'OHU40', 'HU40', 'EDU25UP40', 'LAB14UP40', 'EMP14LAB40', 'FBW40'
]
X = with_grades[features]
y = with_grades['grade']

In [20]:
# Encode the grade variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [21]:
# Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [22]:
# Scale X variable data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
# Map alphabetic grades to numeric grades
label_mapping = {i: label for i, label in enumerate(label_encoder.classes_)}
print(label_mapping)

{0: 'A', 1: 'B', 2: 'C', 3: 'D'}


# ML Algorithms

## Random Forest

In [26]:
# Set up RF
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
rf_model = RandomForestClassifier(random_state=42)

In [27]:
# Fit the model on training data
rf_model.fit(X_train, y_train)

In [28]:
# Predict grades for test data
rf_test_pred = rf_model.predict(X_test)

In [29]:
# RF accuracy and classification report
rf_accuracy = accuracy_score(y_test, rf_test_pred)
rf_classification_report = classification_report(y_test, rf_test_pred)
print("Accuracy:", rf_accuracy,
print("Classification Report:\n", rf_classification_report))

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.45      0.55        75
           1       0.59      0.51      0.55       200
           2       0.66      0.76      0.70       495
           3       0.80      0.77      0.79       464

    accuracy                           0.70      1234
   macro avg       0.69      0.62      0.65      1234
weighted avg       0.70      0.70      0.70      1234

Accuracy: 0.7034035656401945 None


## XGBoost

In [31]:
pip install xgboost



In [32]:
import xgboost as xgb

In [33]:
xgboost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [34]:
# Fit the model on training data
xgboost.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [35]:
# Predict grades for test data
xgboost_test_pred = xgboost.predict(X_test)

In [36]:
# XGBoost accuracy and classification report
xgboost_accuracy = accuracy_score(y_test, xgboost_test_pred)
xgboost_classification_report = classification_report(y_test, xgboost_test_pred)
print("Accuracy:", xgboost_accuracy,
print("Classification Report:\n", xgboost_classification_report))

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.44      0.55        75
           1       0.61      0.56      0.58       200
           2       0.68      0.76      0.71       495
           3       0.81      0.78      0.79       464

    accuracy                           0.72      1234
   macro avg       0.71      0.63      0.66      1234
weighted avg       0.72      0.72      0.71      1234

Accuracy: 0.7155591572123177 None


## LightGBM

In [37]:
import lightgbm as lgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [38]:
lgbm = lgb.LGBMClassifier()

In [39]:
# Fit the model on training data
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9052
[LightGBM] [Info] Number of data points in the train set: 4934, number of used features: 37
[LightGBM] [Info] Start training from score -3.027442
[LightGBM] [Info] Start training from score -1.725120
[LightGBM] [Info] Start training from score -0.866671
[LightGBM] [Info] Start training from score -1.041116


In [40]:
# Predict grades for test data
lgbm_test_pred = lgbm.predict(X_test)

In [41]:
# LightGBM accuracy and classification report
lgbm_accuracy = accuracy_score(y_test, lgbm_test_pred)
lgbm_classification_report = classification_report(y_test, lgbm_test_pred)
print("Accuracy:", lgbm_accuracy,
print("Classification Report:\n", lgbm_classification_report))

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.43      0.54        75
           1       0.62      0.58      0.60       200
           2       0.66      0.75      0.70       495
           3       0.80      0.76      0.78       464

    accuracy                           0.71      1234
   macro avg       0.70      0.63      0.66      1234
weighted avg       0.71      0.71      0.70      1234

Accuracy: 0.7058346839546191 None


## CatBoost

In [42]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [43]:
from catboost import CatBoostClassifier
catboost = CatBoostClassifier(verbose=False)

In [44]:
# Fit the model on training data
catboost.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7aa9822d7610>

In [45]:
# Predict grades for test data
catboost_test_pred = catboost.predict(X_test)

In [47]:
# CatBoost accuracy and classification report
catboost_accuracy = accuracy_score(y_test, catboost_test_pred)
catboost_classification_report = classification_report(y_test, catboost_test_pred)
print("Accuracy:", catboost_accuracy,
print("Classification Report:\n", catboost_classification_report))

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.47      0.57        75
           1       0.60      0.56      0.58       200
           2       0.67      0.76      0.71       495
           3       0.81      0.77      0.79       464

    accuracy                           0.71      1234
   macro avg       0.70      0.64      0.66      1234
weighted avg       0.71      0.71      0.71      1234

Accuracy: 0.7115072933549432 None


## Extra Trees

In [48]:
from sklearn.ensemble import ExtraTreesClassifier

In [49]:
extratrees = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [50]:
# Fit the model on training data
extratrees.fit(X_train, y_train)

In [51]:
# Predict grades for test data
extratrees_test_pred = extratrees.predict(X_test)

In [52]:
# ExtraTrees accuracy and classification report
extratrees_accuracy = accuracy_score(y_test, extratrees_test_pred)
extratrees_classification_report = classification_report(y_test, extratrees_test_pred)
print("Accuracy:", extratrees_accuracy,
print("Classification Report:\n", extratrees_classification_report))

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.45      0.57        75
           1       0.61      0.52      0.56       200
           2       0.65      0.78      0.71       495
           3       0.81      0.75      0.78       464

    accuracy                           0.71      1234
   macro avg       0.71      0.62      0.65      1234
weighted avg       0.71      0.71      0.70      1234

Accuracy: 0.7058346839546191 None


## Ensemble Method

In [53]:
from sklearn.ensemble import VotingClassifier

In [54]:
# Ensemble model with soft voting
ensemble_model = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('xgb', xgboost),
    ('lgb', lgbm),
    ('cat', catboost),
    ('extra trees', extratrees)
], voting='soft')

In [55]:
# Fit the model on training data
ensemble_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9052
[LightGBM] [Info] Number of data points in the train set: 4934, number of used features: 37
[LightGBM] [Info] Start training from score -3.027442
[LightGBM] [Info] Start training from score -1.725120
[LightGBM] [Info] Start training from score -0.866671
[LightGBM] [Info] Start training from score -1.041116


In [56]:
# Predict grades for test data
ensemble_test_pred =  ensemble_model.predict(X_test)

In [57]:
# Ensemble accuracy and classification report
ensemble_accuracy = accuracy_score(y_test, ensemble_test_pred)
ensemble_classification_report = classification_report(y_test, ensemble_test_pred)
print("Accuracy:", ensemble_accuracy,
print("Classification Report:\n", ensemble_classification_report))

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.47      0.59        75
           1       0.62      0.56      0.59       200
           2       0.67      0.76      0.71       495
           3       0.81      0.77      0.79       464

    accuracy                           0.72      1234
   macro avg       0.72      0.64      0.67      1234
weighted avg       0.72      0.72      0.71      1234

Accuracy: 0.7163695299837926 None


# Model Validation

In [58]:
# Map grades from alphabetic to numeric
grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4}

In [59]:
# Copy test data and compare grade predictions for each model
comparison_data = y_test.copy()
comparison_data = pd.DataFrame(comparison_data)
comparison_data = comparison_data.reset_index(drop=True)
comparison_data.columns = ['grade']
len(comparison_data)

1234

In [60]:
# Add columns for test data predictions by each model
comparison_data['ensemble_test_pred'] = ensemble_test_pred
comparison_data['extratrees_test_pred'] = extratrees_test_pred
comparison_data['catboost_test_pred'] = catboost_test_pred
comparison_data['lgbm_test_pred'] = lgbm_test_pred
comparison_data['xgboost_test_pred'] = xgboost_test_pred
comparison_data['rf_test_pred'] = rf_test_pred

In [61]:
comparison_data.head()

Unnamed: 0,grade,ensemble_test_pred,extratrees_test_pred,catboost_test_pred,lgbm_test_pred,xgboost_test_pred,rf_test_pred
0,3,3,3,2,3,3,3
1,0,0,0,0,0,0,0
2,3,3,3,3,3,3,2
3,3,3,3,3,3,3,3
4,2,2,2,2,2,2,2


In [62]:
# Calculate difference in grades for each model
comparison_data['ensemble_diff'] = comparison_data['ensemble_test_pred'] - comparison_data['grade']
comparison_data['extratrees_diff'] = comparison_data['extratrees_test_pred'] - comparison_data['grade']
comparison_data['catboost_diff'] = comparison_data['catboost_test_pred'] - comparison_data['grade']
comparison_data['lgbm_diff'] = comparison_data['lgbm_test_pred'] - comparison_data['grade']
comparison_data['xgboost_diff'] = comparison_data['xgboost_test_pred'] - comparison_data['grade']
comparison_data['rf_diff'] = comparison_data['rf_test_pred'] - comparison_data['grade']

In [63]:
# Export comparison data
difference_columns = [col for col in comparison_data.columns if '_diff' in col]
comparison_data = comparison_data[difference_columns]
comparison_data.to_csv('/content/Model Comparison.csv', index=False)

# Datasets for Regression


In [64]:
# Create copy of dataset
data_copy = data.copy()
print(f"Type of data_copy: {type(data_copy)}")
X_predict = data_copy.loc[data_copy['grade'].isnull(), features]
X_predict = scaler.fit_transform(X_predict)

Type of data_copy: <class 'pandas.core.frame.DataFrame'>


In [65]:
# Define each model
rf_model = RandomForestClassifier(random_state=42)
xgboost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
lgbm = lgb.LGBMClassifier()
catboost = CatBoostClassifier(verbose=False)
extratrees = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [66]:
# Train each model
rf_model.fit(X_train, y_train)
xgboost.fit(X_train, y_train)
lgbm.fit(X_train, y_train)
catboost.fit(X_train, y_train)
extratrees.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9052
[LightGBM] [Info] Number of data points in the train set: 4934, number of used features: 37
[LightGBM] [Info] Start training from score -3.027442
[LightGBM] [Info] Start training from score -1.725120
[LightGBM] [Info] Start training from score -0.866671
[LightGBM] [Info] Start training from score -1.041116


In [67]:
# Predict grades for control group
rf_pred = rf_model.predict(X_predict)
xgboost_pred = xgboost.predict(X_predict)
lgbm_pred = lgbm.predict(X_predict)
catboost_pred = catboost.predict(X_predict)
extratrees_pred = extratrees.predict(X_predict)
ensemble_pred = ensemble_model.predict(X_predict)

In [68]:
# Define null grades in the original dataset where the predicted grades would go
null_grade_indices = data_copy['grade'].isnull()

In [69]:
# Create a copy of original HOLC grades + ensemble grades
ensemble_data = data_copy.copy()
ensemble_data.loc[null_grade_indices, 'grade'] = ensemble_pred

grade_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
for column in ensemble_data['grade']:
  ensemble_data['grade'] = ensemble_data['grade'].apply(lambda x: grade_mapping[x] if x in grade_mapping else x)

ensemble_data = ensemble_data[ensemble_data['TREATED'].isin([0, 1])]
len(ensemble_data)

7562

In [70]:
# Count tracts by ensemble predicted grades
ensemble_data[ensemble_data['TREATED'] == 0].value_counts('grade')

Unnamed: 0_level_0,count
grade,Unnamed: 1_level_1
D,843
C,410
B,119
A,22


In [71]:
# Create a copy of original HOLC grades + CatBoost grades
catboost_data = data_copy.copy()
catboost_data.loc[null_grade_indices, 'grade'] = catboost_pred

grade_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
for column in catboost_data['grade']:
  catboost_data['grade'] = catboost_data['grade'].apply(lambda x: grade_mapping[x] if x in grade_mapping else x)

catboost_data = catboost_data[catboost_data['TREATED'].isin([0, 1])]
len(catboost_data)

7562

In [72]:
# Count tracts by CatBoost predicted grades
catboost_data[catboost_data['TREATED'] == 0].value_counts('grade')

Unnamed: 0_level_0,count
grade,Unnamed: 1_level_1
D,717
C,543
B,115
A,19


In [73]:
# Create a copy of original HOLC grades + RF grades
rf_data = data_copy.copy()
rf_data.loc[null_grade_indices, 'grade'] = rf_pred

grade_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
for column in rf_data['grade']:
  rf_data['grade'] = rf_data['grade'].apply(lambda x: grade_mapping[x] if x in grade_mapping else x)

rf_data = rf_data[rf_data['TREATED'].isin([0, 1])]
len(rf_data)

7562

In [74]:
# Count tracts by RF predicted grades
rf_data[rf_data['TREATED'] == 0].value_counts('grade')

Unnamed: 0_level_0,count
grade,Unnamed: 1_level_1
C,745
D,611
B,29
A,9


In [75]:
# Create a copy of original HOLC grades + ExtraTrees grades
extratrees_data = data_copy.copy()
extratrees_data.loc[null_grade_indices, 'grade'] = extratrees_pred

grade_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
for column in extratrees_data['grade']:
  extratrees_data['grade'] = extratrees_data['grade'].apply(lambda x: grade_mapping[x] if x in grade_mapping else x)

extratrees_data = extratrees_data[extratrees_data['TREATED'].isin([0, 1])]
len(extratrees_data)

7562

In [76]:
# Count tracts by ExtraTrees predicted grades
extratrees_data[extratrees_data['TREATED'] == 0].value_counts('grade')

Unnamed: 0_level_0,count
grade,Unnamed: 1_level_1
D,769
C,598
B,20
A,7


In [77]:
# Export ensemble grades data
ensemble_data.to_csv('/content/Ensemble Model predictions.csv', index=False)
print('Done')

Done


In [88]:
# Export CatBoost grades data
catboost_data.to_csv('/content/CatBoost predictions.csv', index=False)
print('Done')

Done
