In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
RANDOM_STATE = 42

In [3]:
df = pd.read_csv('/kaggle/input/nypd-arrests-data-historic-2006-2020/NYPD_Arrests_Data__Historic_.csv')

In [4]:
df.isna().sum()

ARREST_KEY               0
ARREST_DATE              0
PD_CD                  284
PD_DESC               9066
KY_CD                 9066
OFNS_DESC             9066
LAW_CODE               196
LAW_CAT_CD           18861
ARREST_BORO              8
ARREST_PRECINCT          0
JURISDICTION_CODE       10
AGE_GROUP               17
PERP_SEX                 0
PERP_RACE                0
X_COORD_CD               1
Y_COORD_CD               1
Latitude                 1
Longitude                1
Lon_Lat                  1
dtype: int64

In [5]:
df['ARREST_DATE'] = pd.to_datetime(df['ARREST_DATE'], format='%m/%d/%Y')

df['day'] = df['ARREST_DATE'].dt.day
df['month'] = df['ARREST_DATE'].dt.strftime('%B')
df['year'] = df['ARREST_DATE'].dt.year

In [6]:
df.nunique()

ARREST_KEY           5153369
ARREST_DATE             5479
PD_CD                    334
PD_DESC                  422
KY_CD                     75
OFNS_DESC                 87
LAW_CODE                2440
LAW_CAT_CD                 4
ARREST_BORO                5
ARREST_PRECINCT           78
JURISDICTION_CODE         27
AGE_GROUP                 91
PERP_SEX                   2
PERP_RACE                  8
X_COORD_CD             64198
Y_COORD_CD             67039
Latitude              106106
Longitude             107313
Lon_Lat               119133
day                       31
month                     12
year                      15
dtype: int64

In [7]:
### Converting data into numerics

numeric_cols = []
categorical_cols = []

for col in df.columns:
    try:
        df[col] = pd.to_numeric(df[col])
        numeric_cols.append(col)
    except ValueError:
        categorical_cols.append(col)

for col in categorical_cols:
    le = LabelEncoder()
    
    non_null_mask = df[col].notnull()
    
    df.loc[non_null_mask, col] = le.fit_transform(df.loc[non_null_mask, col])



In [8]:
imputer = IterativeImputer(max_iter=10, random_state=RANDOM_STATE)
completed_matrix = imputer.fit_transform(df)

In [9]:
data = pd.DataFrame(completed_matrix,columns= df.columns)

# data = data.sample(frac=0.01, random_state=42)

In [10]:
data

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,...,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat,day,month,year
0,32311380.0,1.182125e+18,511.0000,61.0000,235.000000,15.000000,1362.0,2.0,3.0,27.0,...,1.0,2.0,1.005390e+06,214570.349132,40.755993,-73.923764,69485.072131,18.0,6.0,2007.0
1,192799737.0,1.548461e+18,177.0000,344.0000,116.000000,78.000000,728.0,0.0,2.0,25.0,...,1.0,2.0,1.000555e+06,230994.000000,40.800694,-73.941109,74049.000000,26.0,4.0,2019.0
2,193260691.0,1.549411e+18,589.2818,215.3823,121.116964,44.303792,1413.0,0.0,2.0,14.0,...,1.0,5.0,9.866850e+05,215375.000000,40.757839,-73.991212,96230.000000,6.0,3.0,2019.0
3,149117452.0,1.452038e+18,153.0000,316.0000,104.000000,76.000000,684.0,0.0,1.0,67.0,...,1.0,2.0,9.980320e+05,175598.000000,40.648650,-73.950336,79010.000000,6.0,4.0,2016.0
4,190049060.0,1.542240e+18,157.0000,314.0000,104.000000,76.000000,689.0,0.0,1.0,77.0,...,1.0,2.0,1.003606e+06,185050.000000,40.674583,-73.930222,68582.000000,15.0,9.0,2018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5153364,207601040.0,1.578442e+18,273.0000,368.0000,121.000000,12.000000,909.0,0.0,2.0,1.0,...,1.0,3.0,9.807720e+05,194845.000000,40.701489,-74.012545,103780.000000,8.0,4.0,2020.0
5153365,206891807.0,1.577837e+18,113.0000,230.0000,344.000000,7.000000,535.0,2.0,1.0,90.0,...,1.0,2.0,9.935140e+05,197321.000000,40.708281,-73.966587,86675.000000,1.0,4.0,2020.0
5153366,207760542.0,1.578701e+18,339.0000,195.0000,341.000000,72.000000,939.0,2.0,2.0,13.0,...,1.0,6.0,9.864640e+05,208227.000000,40.738220,-73.992012,96542.000000,11.0,4.0,2020.0
5153367,206896678.0,1.577837e+18,105.0000,365.0000,106.000000,23.000000,598.0,0.0,3.0,111.0,...,1.0,2.0,1.053534e+06,211808.000000,40.747777,-73.749952,5349.000000,1.0,4.0,2020.0


In [11]:
### Labels

perp_race_label = data['PERP_RACE']
perp_sex_label = data['PERP_SEX']
age_group_label = data['AGE_GROUP']
data.drop(['PERP_RACE','PERP_SEX','AGE_GROUP'],axis=1,inplace=True)

### Prelim Tests to understand which model performs better

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Train-test split for each label
X_train, X_test, y_train_race, y_test_race = train_test_split(data, perp_race_label, test_size=0.2, random_state=42)

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=20,random_state=42))
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")


### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.5439022023867275
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        25
         1.0       0.38      0.22      0.28       437
         2.0       0.62      0.80      0.70      5044
         3.0       0.22      0.07      0.11       793
         4.0       0.00      0.00      0.00         1
         5.0       0.00      0.00      0.00        96
         6.0       0.44      0.34      0.38      1223
         7.0       0.43      0.38      0.40      2688

    accuracy                           0.54     10307
   macro avg       0.26      0.23      0.23     10307
weighted avg       0.50      0.54      0.51     10307

Confusion Matrix:
 [[   0    2   10    0    0    0    5    8]
 [   0   96  161    5    0    0   57  118]
 [   1   58 4021   86    0    4  199  675]
 [   0    9  413   59    0    1   53  258]
 [   0    0    1    0    0    0    0    0]
 [   0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
from sklearn.ensemble import GradientBoostingClassifier

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, random_state=42))
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")


### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.5337149510041719
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        25
         1.0       0.40      0.10      0.16       437
         2.0       0.54      0.95      0.69      5044
         3.0       0.00      0.00      0.00       793
         4.0       0.00      0.00      0.00         1
         5.0       0.20      0.01      0.02        96
         6.0       0.51      0.17      0.26      1223
         7.0       0.48      0.17      0.25      2688

    accuracy                           0.53     10307
   macro avg       0.27      0.18      0.17     10307
weighted avg       0.47      0.53      0.44     10307

Confusion Matrix:
 [[   0    0   13    0    0    0    3    9]
 [   1   43  277    0    0    0   23   93]
 [   3   22 4788    0    3    1   89  138]
 [   1    5  706    0    1    0   17   63]
 [   0    0    1    0    0    0    0    0]
 [   0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
from sklearn.svm import SVC

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', SVC(kernel='rbf', random_state=42, probability=True))
    ])
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")

### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.5237217425050936
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        25
         1.0       0.00      0.00      0.00       437
         2.0       0.53      0.96      0.68      5044
         3.0       0.00      0.00      0.00       793
         4.0       0.00      0.00      0.00         1
         5.0       0.00      0.00      0.00        96
         6.0       0.50      0.09      0.16      1223
         7.0       0.47      0.16      0.24      2688

    accuracy                           0.52     10307
   macro avg       0.19      0.15      0.14     10307
weighted avg       0.44      0.52      0.42     10307

Confusion Matrix:
 [[   0    0   19    0    0    0    0    6]
 [   0    0  321    0    0    0    8  108]
 [   0    1 4841    0    0    0   51  151]
 [   0    0  723    0    0    0    8   62]
 [   0    0    1    0    0    0    0    0]
 [   0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
from sklearn.linear_model import LogisticRegression

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000))
    ])
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")

### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.49044338798874554
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        25
         1.0       0.00      0.00      0.00       437
         2.0       0.50      0.97      0.66      5044
         3.0       0.00      0.00      0.00       793
         4.0       0.00      0.00      0.00         1
         5.0       0.00      0.00      0.00        96
         6.0       0.38      0.11      0.17      1223
         7.0       0.23      0.00      0.00      2688

    accuracy                           0.49     10307
   macro avg       0.14      0.14      0.10     10307
weighted avg       0.35      0.49      0.34     10307

Confusion Matrix:
 [[   0    0   25    0    0    0    0    0]
 [   0    0  426    0    0    0    9    2]
 [   0    0 4915    0    0    0  123    6]
 [   0    0  776    0    0    0   15    2]
 [   0    0    1    0    0    0    0    0]
 [   0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

X_train, X_test, y_train_sex, y_test_sex = train_test_split(data, perp_sex_label, test_size=0.2, random_state=42)


# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=20,random_state=42))
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)


train_and_evaluate(X_train, X_test, y_train_sex, y_test_sex, "PERP_SEX")


### Training for PERP_SEX ###
Metrics for PERP_SEX:
Accuracy: 0.8236150189191811
Classification Report:
               precision    recall  f1-score   support

         0.0       0.41      0.14      0.20      1722
         1.0       0.85      0.96      0.90      8585

    accuracy                           0.82     10307
   macro avg       0.63      0.55      0.55     10307
weighted avg       0.77      0.82      0.78     10307

Confusion Matrix:
 [[ 233 1489]
 [ 329 8256]]
--------------------------------------------------


In [30]:
# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, random_state=42))
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)

train_and_evaluate(X_train, X_test, y_train_sex, y_test_sex, "PERP_SEX")

### Training for PERP_SEX ###
Metrics for PERP_SEX:
Accuracy: 0.8366158921121568
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.03      0.05      1722
         1.0       0.84      1.00      0.91      8585

    accuracy                           0.84     10307
   macro avg       0.85      0.51      0.48     10307
weighted avg       0.84      0.84      0.77     10307

Confusion Matrix:
 [[  45 1677]
 [   7 8578]]
--------------------------------------------------


In [16]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


X_train, X_test, y_train_sex, y_test_sex = train_test_split(data, perp_sex_label, test_size=0.2, random_state=42)

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', SVC(kernel='rbf', random_state=42, probability=True))
    ])
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_sex, y_test_sex, "PERP_SEX")

### Training for PERP_SEX ###
Metrics for PERP_SEX:
Accuracy: 0.8329290773260891
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1722
         1.0       0.83      1.00      0.91      8585

    accuracy                           0.83     10307
   macro avg       0.42      0.50      0.45     10307
weighted avg       0.69      0.83      0.76     10307

Confusion Matrix:
 [[   0 1722]
 [   0 8585]]
--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

# Define a function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000))
    ])
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_sex, y_test_sex, "PERP_SEX")

### Training for PERP_SEX ###
Metrics for PERP_SEX:
Accuracy: 0.8329290773260891
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1722
         1.0       0.83      1.00      0.91      8585

    accuracy                           0.83     10307
   macro avg       0.42      0.50      0.45     10307
weighted avg       0.69      0.83      0.76     10307

Confusion Matrix:
 [[   0 1722]
 [   0 8585]]
--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Train-test split for regression
X_train, X_test, y_train, y_test = train_test_split(data, age_group_label, test_size=0.2, random_state=42)

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())  # Standardize features
])

# Define a function to train and evaluate regressors
def train_and_evaluate_regressor(X_train, X_test, y_train, y_test):
    print("### Training for AGE_GROUP (Regression) ###")
    
    # Regression pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=20, random_state=42))
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 50)

# Train and evaluate
train_and_evaluate_regressor(X_train, X_test, y_train, y_test)


### Training for AGE_GROUP (Regression) ###
Mean Squared Error: 488.6632251164257
R^2 Score: -0.04336735799658942
--------------------------------------------------


In [20]:
from sklearn.linear_model import LinearRegression

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())  # Standardize features
])

# Define a function to train and evaluate regressors
def train_and_evaluate_regressor(X_train, X_test, y_train, y_test):
    print("### Training for AGE_GROUP (Regression) ###")
    
    # Regression pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
        
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 50)

# Train and evaluate
train_and_evaluate_regressor(X_train, X_test, y_train, y_test)


### Training for AGE_GROUP (Regression) ###
Mean Squared Error: 462.2426702498882
R^2 Score: 0.0130443855336001
--------------------------------------------------


In [21]:
from sklearn.ensemble import GradientBoostingRegressor


# Define a function to train and evaluate regressors
def train_and_evaluate_regressor(X_train, X_test, y_train, y_test):
    print("### Training for AGE_GROUP (Regression) ###")
    
    # Regression pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
    ])

        
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipeline.predict(X_test)
    
    # Metrics
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 50)

# Train and evaluate
train_and_evaluate_regressor(X_train, X_test, y_train, y_test)


### Training for AGE_GROUP (Regression) ###
Mean Squared Error: 449.2655206468518
R^2 Score: 0.04075249533134617
--------------------------------------------------


### Final tests with finalized Model

In [28]:
data = pd.DataFrame(completed_matrix,columns= df.columns)


In [29]:
### Labels

perp_race_label = data['PERP_RACE']
perp_sex_label = data['PERP_SEX']
age_group_label = data['AGE_GROUP']
data.drop(['PERP_RACE','PERP_SEX','AGE_GROUP'],axis=1,inplace=True)

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Train-test split for each label
X_train, X_test, y_train_race, y_test_race = train_test_split(data, perp_race_label, test_size=0.2, random_state=42)

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Start timing the total process
    start_time = time.time()
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=20, random_state=42))
    ])
    
    # Measure training time
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Measure prediction time
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing the total process
    end_time = time.time()
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)
    
    # Print timing information
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)


# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")


### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.5852849688650339
Classification Report:
               precision    recall  f1-score   support

         0.0       0.22      0.08      0.12      2288
         1.0       0.47      0.37      0.42     42460
         2.0       0.68      0.78      0.73    499637
         3.0       0.30      0.18      0.22     82981
         4.0       0.37      0.17      0.23       292
         5.0       0.30      0.13      0.18     10369
         6.0       0.49      0.45      0.47    124183
         7.0       0.49      0.46      0.47    268464

    accuracy                           0.59   1030674
   macro avg       0.42      0.33      0.36   1030674
weighted avg       0.56      0.59      0.57   1030674

Confusion Matrix:
 [[   186    236    951     63      0     14    315    523]
 [   100  15878  10872    826      6    335   5839   8604]
 [   229   5636 392082  13792     23   1009  22221  64645]
 [    29   1041  34961  14562      5    237   

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

X_train, X_test, y_train_sex, y_test_sex = train_test_split(data, perp_sex_label, test_size=0.2, random_state=42)


# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())                 # Standardize features
])

def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Start timing the total process
    start_time = time.time()
    
    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=20, random_state=42))
    ])
    
    # Measure training time
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Measure prediction time
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing the total process
    end_time = time.time()
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)
    
    # Print timing information
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)



train_and_evaluate(X_train, X_test, y_train_sex, y_test_sex, "PERP_SEX")


### Training for PERP_SEX ###
Metrics for PERP_SEX:
Accuracy: 0.824535207058682
Classification Report:
               precision    recall  f1-score   support

         0.0       0.46      0.28      0.35    172824
         1.0       0.87      0.93      0.90    857850

    accuracy                           0.82   1030674
   macro avg       0.66      0.61      0.62   1030674
weighted avg       0.80      0.82      0.81   1030674

Confusion Matrix:
 [[ 47946 124878]
 [ 55969 801881]]
--------------------------------------------------
Training Time: 461.9811 seconds
Prediction Time: 25.2283 seconds
Total Time: 487.2094 seconds
--------------------------------------------------


In [32]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


X_train, X_test, y_train, y_test = train_test_split(data, age_group_label, test_size=0.2, random_state=42)

# Define preprocessing pipeline
preprocessor = Pipeline([
    ('scaler', StandardScaler())  # Standardize features
])

# Define a function to train and evaluate regressors with timing
def train_and_evaluate_regressor(X_train, X_test, y_train, y_test):
    print("### Training for AGE_GROUP (Regression) ###")
    
    # Start timing the total execution
    start_time = time.time()
    
    # Regression pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
    
    # Measure training time
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Measure prediction time
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing the total execution
    end_time = time.time()
    
    # Metrics
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 50)
    
    # Print running times
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)

# Train and evaluate
train_and_evaluate_regressor(X_train, X_test, y_train, y_test)


### Training for AGE_GROUP (Regression) ###
Mean Squared Error: 459.6930960103748
R^2 Score: 0.015514784379912494
--------------------------------------------------
Training Time: 6.6240 seconds
Prediction Time: 0.1402 seconds
Total Time: 6.7642 seconds
--------------------------------------------------


### Randomizing dimension minimizing algorithm

### PCA

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
data_scaled = scaler.transform(data)

pca = PCA(n_components=10)  # Reduce to 10 dimensions
data_pca = pca.fit_transform(data_scaled)
data_pca = pca.transform(data_scaled)

In [17]:
data_pca.shape

(5153369, 10)

In [20]:
import time

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Assuming `data` and `perp_race_label` are defined and contain the dataset and target labels respectively

# Train-test split for each label
X_train, X_test, y_train_race, y_test_race = train_test_split(data_pca, perp_race_label, test_size=0.2, random_state=42)

# Define a function to train and evaluate models with timing
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Start timing
    start_time = time.time()
    
    # Model pipeline
    pipeline = Pipeline([
        ('classifier', RandomForestClassifier(n_estimators=20, random_state=42))
    ])
    
    # Train the model
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Predictions
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing
    end_time = time.time()
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)
    
    # Print running times
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")


### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.5767284320745454
Classification Report:
               precision    recall  f1-score   support

         0.0       0.26      0.08      0.12      2288
         1.0       0.46      0.33      0.39     42460
         2.0       0.66      0.80      0.72    499637
         3.0       0.31      0.16      0.21     82981
         4.0       0.45      0.19      0.27       292
         5.0       0.35      0.12      0.18     10369
         6.0       0.49      0.40      0.44    124183
         7.0       0.48      0.44      0.46    268464

    accuracy                           0.58   1030674
   macro avg       0.43      0.31      0.35   1030674
weighted avg       0.55      0.58      0.56   1030674

Confusion Matrix:
 [[   181    202   1041     54      0     14    274    522]
 [    79  14078  12965    643      5    240   5381   9069]
 [   175   5116 397251  11820     23    760  20350  64142]
 [    29    957  37824  13409      4    179   

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.random_projection import GaussianRandomProjection

# Assuming `data` and `perp_race_label` are defined and contain the dataset and target labels respectively

# Train-test split for each label
X_train, X_test, y_train_race, y_test_race = train_test_split(data_pca, perp_sex_label, test_size=0.2, random_state=42)



# Define a function to train and evaluate models with timing
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Start timing the total execution
    start_time = time.time()
    
    # Model pipeline
    pipeline = Pipeline([
        ('classifier', RandomForestClassifier(n_estimators=20, random_state=42))
    ])
    
    # Measure training time
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Measure prediction time
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing the total execution
    end_time = time.time()
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)
    
    # Print running times
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")


### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.8248379215930547
Classification Report:
               precision    recall  f1-score   support

         0.0       0.46      0.24      0.32    172824
         1.0       0.86      0.94      0.90    857850

    accuracy                           0.82   1030674
   macro avg       0.66      0.59      0.61   1030674
weighted avg       0.79      0.82      0.80   1030674

Confusion Matrix:
 [[ 42274 130550]
 [ 49985 807865]]
--------------------------------------------------
Training Time: 1035.3231 seconds
Prediction Time: 24.1396 seconds
Total Time: 1059.4628 seconds
--------------------------------------------------


In [23]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


X_train, X_test, y_train, y_test = train_test_split(data_pca, age_group_label, test_size=0.2, random_state=42)

# Define a function to train and evaluate regressors with timing
def train_and_evaluate_regressor(X_train, X_test, y_train, y_test):
    print("### Training for AGE_GROUP (Regression) ###")
    
    # Start timing the total execution
    start_time = time.time()
    
    # Regression pipeline
    pipeline = Pipeline([
        ('regressor', LinearRegression())
    ])
    
    # Measure training time
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Measure prediction time
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing the total execution
    end_time = time.time()
    
    # Metrics
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 50)
    
    # Print running times
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)

# Train and evaluate
train_and_evaluate_regressor(X_train, X_test, y_train, y_test)


### Training for AGE_GROUP (Regression) ###
Mean Squared Error: 460.9257913213122
R^2 Score: 0.012874826722261146
--------------------------------------------------
Training Time: 2.5265 seconds
Prediction Time: 0.0184 seconds
Total Time: 2.5450 seconds
--------------------------------------------------


### Gaussian Random Projection

In [24]:
from sklearn.random_projection import GaussianRandomProjection
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
data_scaled = scaler.transform(data)

# Dimensionality reduction using Gaussian Random Projection
grp = GaussianRandomProjection(n_components=10, random_state=42)  # Reduce to 10 dimensions
data_grp = grp.fit_transform(data_scaled)


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Assuming `data` and `perp_race_label` are defined and contain the dataset and target labels respectively

# Train-test split for each label
X_train, X_test, y_train_race, y_test_race = train_test_split(data_grp, perp_race_label, test_size=0.2, random_state=42)

# Define a function to train and evaluate models with timing
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Start timing
    start_time = time.time()
    
    # Model pipeline
    pipeline = Pipeline([
        ('classifier', RandomForestClassifier(n_estimators=20, random_state=42))
    ])
    
    # Train the model
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Predictions
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing
    end_time = time.time()
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)
    
    # Print running times
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")


### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.558701393457097
Classification Report:
               precision    recall  f1-score   support

         0.0       0.29      0.08      0.12      2288
         1.0       0.45      0.28      0.35     42460
         2.0       0.63      0.80      0.70    499637
         3.0       0.32      0.15      0.20     82981
         4.0       0.46      0.17      0.25       292
         5.0       0.36      0.11      0.17     10369
         6.0       0.46      0.34      0.39    124183
         7.0       0.47      0.41      0.43    268464

    accuracy                           0.56   1030674
   macro avg       0.43      0.29      0.33   1030674
weighted avg       0.53      0.56      0.53   1030674

Confusion Matrix:
 [[   180    168   1175     37      0     12    239    477]
 [    68  12089  15899    554      3    171   4537   9139]
 [   157   4855 399248  10887     18    698  19699  64075]
 [    31    824  42279  12164      3    160   3

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.random_projection import GaussianRandomProjection

# Assuming `data` and `perp_race_label` are defined and contain the dataset and target labels respectively

# Train-test split for each label
X_train, X_test, y_train_race, y_test_race = train_test_split(data_grp, perp_sex_label, test_size=0.2, random_state=42)



# Define a function to train and evaluate models with timing
def train_and_evaluate(X_train, X_test, y_train, y_test, label_name):
    print(f"### Training for {label_name} ###")
    
    # Start timing the total execution
    start_time = time.time()
    
    # Model pipeline
    pipeline = Pipeline([
        ('classifier', RandomForestClassifier(n_estimators=20, random_state=42))
    ])
    
    # Measure training time
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Measure prediction time
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing the total execution
    end_time = time.time()
    
    # Metrics
    print(f"Metrics for {label_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)
    
    # Print running times
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)

# Train and evaluate for each label
train_and_evaluate(X_train, X_test, y_train_race, y_test_race, "PERP_RACE")


### Training for PERP_RACE ###
Metrics for PERP_RACE:
Accuracy: 0.8249446478711988
Classification Report:
               precision    recall  f1-score   support

         0.0       0.45      0.22      0.30    172824
         1.0       0.86      0.95      0.90    857850

    accuracy                           0.82   1030674
   macro avg       0.66      0.58      0.60   1030674
weighted avg       0.79      0.82      0.80   1030674

Confusion Matrix:
 [[ 38002 134822]
 [ 45603 812247]]
--------------------------------------------------
Training Time: 1127.5796 seconds
Prediction Time: 24.9914 seconds
Total Time: 1152.5711 seconds
--------------------------------------------------


In [27]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


X_train, X_test, y_train, y_test = train_test_split(data_grp, age_group_label, test_size=0.2, random_state=42)

# Define a function to train and evaluate regressors with timing
def train_and_evaluate_regressor(X_train, X_test, y_train, y_test):
    print("### Training for AGE_GROUP (Regression) ###")
    
    # Start timing the total execution
    start_time = time.time()
    
    # Regression pipeline
    pipeline = Pipeline([
        ('regressor', LinearRegression())
    ])
    
    # Measure training time
    training_start = time.time()
    pipeline.fit(X_train, y_train)
    training_end = time.time()
    
    # Measure prediction time
    prediction_start = time.time()
    y_pred = pipeline.predict(X_test)
    prediction_end = time.time()
    
    # End timing the total execution
    end_time = time.time()
    
    # Metrics
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))
    print("-" * 50)
    
    # Print running times
    print(f"Training Time: {training_end - training_start:.4f} seconds")
    print(f"Prediction Time: {prediction_end - prediction_start:.4f} seconds")
    print(f"Total Time: {end_time - start_time:.4f} seconds")
    print("-" * 50)

# Train and evaluate
train_and_evaluate_regressor(X_train, X_test, y_train, y_test)


### Training for AGE_GROUP (Regression) ###
Mean Squared Error: 461.1913128281955
R^2 Score: 0.012306182119537046
--------------------------------------------------
Training Time: 2.6157 seconds
Prediction Time: 0.0200 seconds
Total Time: 2.6358 seconds
--------------------------------------------------
