In [1]:
import pandas as pd
from sklearn.datasets import load_boston

# Load the Boston house price dataset
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df['PRICE'] = boston.target

# Display the first few rows of the dataset
boston_df.head()


ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Load the California housing dataset
housing = fetch_california_housing()
housing_df = pd.DataFrame(housing.data, columns=housing.feature_names)
housing_df['PRICE'] = housing.target

# Split the data into training and testing sets
train_df, test_df = train_test_split(housing_df, test_size=0.2, random_state=42)

# Separating the features and target variable
X_train = train_df.drop(columns=['PRICE'])
y_train = train_df['PRICE']
X_test = test_df.drop(columns=['PRICE'])
y_test = test_df['PRICE']

# Adding a constant to the model (intercept)
X_train_sm = sm.add_constant(X_train)

# Fit the model
model = sm.OLS(y_train, X_train_sm).fit()

# Print model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.612
Method:                 Least Squares   F-statistic:                     3261.
Date:                Wed, 04 Sep 2024   Prob (F-statistic):               0.00
Time:                        14:34:00   Log-Likelihood:                -17998.
No. Observations:               16512   AIC:                         3.601e+04
Df Residuals:                   16503   BIC:                         3.608e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -37.0233      0.728    -50.835      0.0

In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import time

# Distinguish column types
numeric_cols = housing_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = housing_df.select_dtypes(exclude=[np.number]).columns.tolist()

# Summary statistics for numeric columns
numeric_summary = housing_df[numeric_cols].describe()

# Correlation matrix for numeric columns
correlation_matrix = housing_df[numeric_cols].corr()

# Save correlation matrix heatmap
timestamp = int(time.time())
save_dir = Path("/Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image")
save_dir.mkdir(parents=True, exist_ok=True)
file_name = f'correlation_matrix_{timestamp}.png'
file_path = save_dir / file_name
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig(file_path)
plt.clf()
print(f'Image saved to: {file_path}')

# Display summary statistics
numeric_summary


Image saved to: /Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image/correlation_matrix_1725431645.png


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


<Figure size 1200x800 with 0 Axes>

In [4]:
from metagpt.tools.libs.data_preprocess import get_column_info

# Assuming 'housing_df' is the DataFrame variable from 'Finished Tasks'
column_info = get_column_info(housing_df)
print("column_info")
print(column_info)


2024-09-04 14:34:10.489 | INFO     | metagpt.const:get_metagpt_package_root:21 - Package root set to /Users/tuozhou/Desktop/RA/SZRI/ChatPilot


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'PRICE'], 'Datetime': [], 'Others': []}


In [5]:
# Preprocess the dataset for linear regression

# Copy the DataFrame before processing
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

# Standardize numeric features
from metagpt.tools.libs.data_preprocess import StandardScale

numeric_features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
scaler = StandardScale(features=numeric_features)

# Fit and transform the training data
train_df_scaled = scaler.fit_transform(train_df_copy)

# Transform the test data
test_df_scaled = scaler.transform(test_df_copy)

# Prepare the data for linear regression
X_train_scaled = train_df_scaled.drop(columns=['PRICE'])
y_train_scaled = train_df_scaled['PRICE']
X_test_scaled = test_df_scaled.drop(columns=['PRICE'])
y_test_scaled = test_df_scaled['PRICE']

# Add a constant to the model (intercept)
X_train_scaled_sm = sm.add_constant(X_train_scaled)

# Fit the OLS model
model_scaled = sm.OLS(y_train_scaled, X_train_scaled_sm).fit()

# Print the regression summary statistics table
print(model_scaled.summary())


                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.612
Method:                 Least Squares   F-statistic:                     3261.
Date:                Wed, 04 Sep 2024   Prob (F-statistic):               0.00
Time:                        14:34:19   Log-Likelihood:                -17998.
No. Observations:               16512   AIC:                         3.601e+04
Df Residuals:                   16503   BIC:                         3.608e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0719      0.006    369.848      0.0

In [6]:
from metagpt.tools.libs.data_preprocess import get_column_info

# Assuming 'train_df_scaled' is the latest DataFrame after scaling
column_info = get_column_info(train_df_scaled)
print("column_info")
print(column_info)


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'PRICE'], 'Datetime': [], 'Others': []}


In [7]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data for XGBoost
dtrain = xgb.DMatrix(X_train_scaled, label=y_train_scaled)
dtest = xgb.DMatrix(X_test_scaled, label=y_test_scaled)

# Set parameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'rmse'
}

# Train the model
num_rounds = 100
model_xgb = xgb.train(params, dtrain, num_rounds)

# Make predictions
y_pred_train = model_xgb.predict(dtrain)
y_pred_test = model_xgb.predict(dtest)

# Evaluate the model
train_rmse = mean_squared_error(y_train_scaled, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_test_scaled, y_pred_test, squared=False)
train_r2 = r2_score(y_train_scaled, y_pred_train)
test_r2 = r2_score(y_test_scaled, y_pred_test)

print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R2: {train_r2}")
print(f"Test R2: {test_r2}")


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <04125532-9495-3051-97BC-F23BE76BA2F9> /Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/lib-dynload/../../libomp.dylib' (no such file), '/Users/tuozhou/opt/anaconda3/envs/chatpilot/bin/../lib/libomp.dylib' (no such file)"]


In [8]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data for CatBoost
train_pool = Pool(data=X_train_scaled, label=y_train_scaled)
test_pool = Pool(data=X_test_scaled, label=y_test_scaled)

# Set parameters for CatBoost
params = {
    'iterations': 1000,
    'depth': 6,
    'learning_rate': 0.1,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'logging_level': 'Verbose',
    'use_best_model': True
}

# Train the model
model_catboost = CatBoostRegressor(**params)
model_catboost.fit(train_pool, eval_set=test_pool, verbose=100)

# Make predictions
y_pred_train = model_catboost.predict(train_pool)
y_pred_test = model_catboost.predict(test_pool)

# Evaluate the model
train_rmse = mean_squared_error(y_train_scaled, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_test_scaled, y_pred_test, squared=False)
train_r2 = r2_score(y_train_scaled, y_pred_train)
test_r2 = r2_score(y_test_scaled, y_pred_test)

print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R2: {train_r2}")
print(f"Test R2: {test_r2}")

ModuleNotFoundError: No module named 'catboost'

In [9]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data for LightGBM
dtrain = lgb.Dataset(X_train_scaled, label=y_train_scaled)
dtest = lgb.Dataset(X_test_scaled, label=y_test_scaled, reference=dtrain)

# Set parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model
num_round = 100
model_lgb = lgb.train(params, dtrain, num_round, valid_sets=[dtrain, dtest], early_stopping_rounds=10)

# Make predictions
y_pred_train = model_lgb.predict(X_train_scaled, num_iteration=model_lgb.best_iteration)
y_pred_test = model_lgb.predict(X_test_scaled, num_iteration=model_lgb.best_iteration)

# Evaluate the model
train_rmse = mean_squared_error(y_train_scaled, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_test_scaled, y_pred_test, squared=False)
train_r2 = r2_score(y_train_scaled, y_pred_train)
test_r2 = r2_score(y_test_scaled, y_pred_test)

print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R2: {train_r2}")
print(f"Test R2: {test_r2}")

ModuleNotFoundError: No module named 'lightgbm'

In [10]:
from metagpt.tools.libs.data_preprocess import get_column_info

# Assuming 'train_df_scaled' is the latest DataFrame after scaling
column_info = get_column_info(train_df_scaled)
print("column_info")
print(column_info)


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'PRICE'], 'Datetime': [], 'Others': []}


In [11]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data for XGBoost
dtrain = xgb.DMatrix(X_train_scaled, label=y_train_scaled)
dtest = xgb.DMatrix(X_test_scaled, label=y_test_scaled)

# Set parameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'rmse'
}

# Train the model
num_rounds = 100
model_xgb = xgb.train(params, dtrain, num_rounds)

# Make predictions
y_pred_train = model_xgb.predict(dtrain)
y_pred_test = model_xgb.predict(dtest)

# Evaluate the model
train_rmse = mean_squared_error(y_train_scaled, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_test_scaled, y_pred_test, squared=False)
train_r2 = r2_score(y_train_scaled, y_pred_train)
test_r2 = r2_score(y_test_scaled, y_pred_test)

print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R2: {train_r2}")
print(f"Test R2: {test_r2}")


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <04125532-9495-3051-97BC-F23BE76BA2F9> /Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/lib-dynload/../../libomp.dylib' (no such file), '/Users/tuozhou/opt/anaconda3/envs/chatpilot/bin/../lib/libomp.dylib' (no such file)"]


In [12]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data for CatBoost (no need for DMatrix as in XGBoost)
X_train, X_test = X_train_scaled, X_test_scaled

y_train, y_test = y_train_scaled, y_test_scaled

# Initialize CatBoostRegressor
model_cb = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    eval_metric='RMSE',
    random_seed=42,
    od_type='Iter',
    od_wait=25,
    verbose=False
)

# Train the model
model_cb.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# Make predictions
y_pred_train = model_cb.predict(X_train)
y_pred_test = model_cb.predict(X_test)

# Evaluate the model
train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_test, y_pred_test, squared=False)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Train R2: {train_r2}")
print(f"Test R2: {test_r2}")

ModuleNotFoundError: No module named 'catboost'

In [13]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data for LightGBM
dtrain = lgb.Dataset(X_train_scaled, label=y_train_scaled)
dtest = lgb.Dataset(X_test_scaled, label=y_test_scaled, reference=dtrain)

# Set parameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9
}

# Train the model
num_round = 100
model_lgb = lgb.train(params, dtrain, num_round, valid_sets=[dtest], early_stopping_rounds=10)

# Make predictions
y_pred_train = model_lgb.predict(X_train_scaled, num_iteration=model_lgb.best_iteration)
y_pred_test = model_lgb.predict(X_test_scaled, num_iteration=model_lgb.best_iteration)

# Evaluate the model
train_rmse = mean_squared_error(y_train_scaled, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_test_scaled, y_pred_test, squared=False)
train_r2 = r2_score(y_train_scaled, y_pred_train)
test_r2 = r2_score(y_test_scaled, y_pred_test)

print(f'Train RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}')
print(f'Train R2: {train_r2}')
print(f'Test R2: {test_r2}')

ModuleNotFoundError: No module named 'lightgbm'

In [14]:
from metagpt.tools.libs.data_preprocess import get_column_info

# Assuming 'train_df_scaled' is the latest DataFrame after scaling
column_info = get_column_info(train_df_scaled)
print("column_info")
print(column_info)


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'PRICE'], 'Datetime': [], 'Others': []}


In [15]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Prepare the data, split is already done previously
# Training using XGBoost
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

# Set up the parameters for XGBoost
params = {
    'max_depth': 5,
    'eta': 0.1,
    'objective': 'reg:squarederror',
    'eval_metric': ['rmse', 'mae'],
}

# Train the model
model_xgb = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtest, 'test')])

# Predictions
y_pred_train = model_xgb.predict(dtrain)
y_pred_test = model_xgb.predict(dtest)

# Evaluate the model
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f"Training R^2: {r2_train}")
print(f"Test R^2: {r2_test}")
print(f"Training MSE: {mse_train}")
print(f"Test MSE: {mse_test}")


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <04125532-9495-3051-97BC-F23BE76BA2F9> /Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/Users/tuozhou/opt/anaconda3/envs/chatpilot/lib/python3.10/lib-dynload/../../libomp.dylib' (no such file), '/Users/tuozhou/opt/anaconda3/envs/chatpilot/bin/../lib/libomp.dylib' (no such file)"]


In [16]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Prepare the data, split is already done previously
# Training using LightGBM
dtrain = lgb.Dataset(X_train_scaled, label=y_train)
dtest = lgb.Dataset(X_test_scaled, label=y_test, reference=dtrain)

# Set up the parameters for LightGBM
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['rmse', 'mae'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model
model_lgb = lgb.train(params, dtrain, num_boost_round=100, valid_sets=[dtest], early_stopping_rounds=10)

# Predictions
y_pred_train = model_lgb.predict(X_train_scaled, num_iteration=model_lgb.best_iteration)
y_pred_test = model_lgb.predict(X_test_scaled, num_iteration=model_lgb.best_iteration)

# Evaluate the model
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f"Training R^2: {r2_train}")
print(f"Test R^2: {r2_test}")
print(f"Training MSE: {mse_train}")
print(f"Test MSE: {mse_test}")

ModuleNotFoundError: No module named 'lightgbm'

In [17]:
import catboost as cb
from sklearn.metrics import mean_squared_error, r2_score

# Prepare the data, split is already done previously
# Training using CatBoost
train_pool = cb.Pool(X_train_scaled, y_train)
test_pool = cb.Pool(X_test_scaled, y_test)

# Set up the parameters for CatBoost
params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True
}

# Train the model
model_cb = cb.CatBoostRegressor(**params)
model_cb.fit(train_pool, eval_set=test_pool, early_stopping_rounds=10)

# Predictions
y_pred_train = model_cb.predict(X_train_scaled)
y_pred_test = model_cb.predict(X_test_scaled)

# Evaluate the model
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f"Training R^2: {r2_train}")
print(f"Test R^2: {r2_test}")
print(f"Training MSE: {mse_train}")
print(f"Test MSE: {mse_test}")

ModuleNotFoundError: No module named 'catboost'

In [18]:
from metagpt.tools.libs.data_preprocess import get_column_info

# Assuming 'train_df_scaled' is the latest DataFrame from the finished tasks
column_info = get_column_info(train_df_scaled)
print("column_info")
print(column_info)


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'PRICE'], 'Datetime': [], 'Others': []}


In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from metagpt.tools.libs.data_preprocess import StandardScale
import pandas as pd

# Load latest train and test data
X_train = train_df.drop(columns=['PRICE'])
y_train = train_df['PRICE']
X_test = test_df.drop(columns=['PRICE'])
y_test = test_df['PRICE']

# Initialize scaler
scaler = StandardScale(features=numeric_features)
train_df_scaled = scaler.fit_transform(train_df)
test_df_scaled = scaler.transform(test_df)

X_train_scaled = train_df_scaled.drop(columns=['PRICE'])
y_train_scaled = train_df_scaled['PRICE']
X_test_scaled = test_df_scaled.drop(columns=['PRICE'])
y_test_scaled = test_df_scaled['PRICE']

# Train a RandomForest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train_scaled)

# Predict and evaluate
y_pred_train = rf_model.predict(X_train_scaled)
y_pred_test = rf_model.predict(X_test_scaled)

print("Train R2 Score:", r2_score(y_train_scaled, y_pred_train))
print("Test R2 Score:", r2_score(y_test_scaled, y_pred_test))
print("Train RMSE:", mean_squared_error(y_train_scaled, y_pred_train, squared=False))
print("Test RMSE:", mean_squared_error(y_test_scaled, y_pred_test, squared=False))


Train R2 Score: 0.9735650982131013
Test R2 Score: 0.8052747336256919
Train RMSE: 0.18798297991977747
Test RMSE: 0.5051432839771741


In [20]:
# Evaluate the model on the test set and print the regression summary statistics table for the estimated coefficients

# Add constant to the test set
X_test_sm = sm.add_constant(X_test)

# Predict using the trained model
y_pred = model.predict(X_test_sm)

# Print the regression summary statistics table for the estimated coefficients
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.612
Method:                 Least Squares   F-statistic:                     3261.
Date:                Wed, 04 Sep 2024   Prob (F-statistic):               0.00
Time:                        14:38:26   Log-Likelihood:                -17998.
No. Observations:               16512   AIC:                         3.601e+04
Df Residuals:                   16503   BIC:                         3.608e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -37.0233      0.728    -50.835      0.0

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import time
from sklearn.datasets import load_boston

# Load the Boston house price dataset
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['PRICE'] = boston.target

# Distinguish column types
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Summary statistics for numeric columns
numeric_summary = df[numeric_cols].describe()

# Correlation matrix for numeric columns
correlation_matrix = df[numeric_cols].corr()

# Save correlation matrix heatmap
timestamp = int(time.time())
save_dir = Path("/Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image")
save_dir.mkdir(parents=True, exist_ok=True)
file_name = f'correlation_matrix_{timestamp}.png'
file_path = save_dir / file_name
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig(file_path)
plt.clf()
print(f'Image saved to: {file_path}')

# Display summary statistics
numeric_summary


ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import time
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['PRICE'] = housing.target

# Distinguish column types
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Summary statistics for numeric columns
numeric_summary = df[numeric_cols].describe()

# Correlation matrix for numeric columns
correlation_matrix = df[numeric_cols].corr()

# Save correlation matrix heatmap
timestamp = int(time.time())
save_dir = Path("/Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image")
save_dir.mkdir(parents=True, exist_ok=True)
file_name = f'correlation_matrix_{timestamp}.png'
file_path = save_dir / file_name
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig(file_path)
plt.clf()
print(f'Image saved to: {file_path}')

# Display summary statistics
numeric_summary

Image saved to: /Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image/correlation_matrix_1725432145.png


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


<Figure size 1200x800 with 0 Axes>

In [3]:
from metagpt.tools.libs.data_preprocess import get_column_info

column_info = get_column_info(df)
print("column_info")
print(column_info)


2024-09-04 14:42:41.800 | INFO     | metagpt.const:get_metagpt_package_root:21 - Package root set to /Users/tuozhou/Desktop/RA/SZRI/ChatPilot


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'PRICE'], 'Datetime': [], 'Others': []}


In [4]:
# Preprocess the Boston house price dataset

# Copy the DataFrame before processing it
df_copy = df.copy()

# Check for missing values
missing_values = df_copy.isnull().sum()
print("Missing values per column:\n", missing_values)

# Since there are no categorical columns, we don't need to apply one-hot encoding
# We will standardize the numeric columns except for the label column 'PRICE'

from metagpt.tools.libs.data_preprocess import StandardScale

# Initialize the StandardScale tool for numeric columns except 'PRICE'
numeric_features = [col for col in numeric_cols if col != 'PRICE']
scaler = StandardScale(features=numeric_features)

# Fit and transform the data
df_scaled = scaler.fit_transform(df_copy)

# Display the first few rows of the scaled DataFrame
df_scaled.head()


Missing values per column:
 MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
PRICE         0
dtype: int64


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835,4.526
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844,3.585
2,1.782699,1.856182,1.15562,-0.049016,-0.820777,-0.025843,1.038503,-1.332827,3.521
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818,3.413
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818,3.422


In [5]:
from metagpt.tools.libs.data_preprocess import get_column_info

column_info = get_column_info(df_scaled)
print("column_info")
print(column_info)


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'PRICE'], 'Datetime': [], 'Others': []}


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Split the data into training and testing sets
X = df_scaled.drop(columns=['PRICE'])
y = df_scaled['PRICE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)

# Add a constant to the model for the intercept
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

# Fit the model using statsmodels for detailed summary
sm_model = sm.OLS(y_train, X_train_sm).fit()

# Print the summary statistics table
summary = sm_model.summary()
print(summary)

# Save the summary to a text file
timestamp = int(time.time())
save_dir = Path("/Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image")
save_dir.mkdir(parents=True, exist_ok=True)
file_name = f'regression_summary_{timestamp}.txt'
file_path = save_dir / file_name
with open(file_path, 'w') as f:
    f.write(summary.as_text())
print(f'Summary saved to: {file_path}')


                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.612
Method:                 Least Squares   F-statistic:                     3261.
Date:                Wed, 04 Sep 2024   Prob (F-statistic):               0.00
Time:                        14:43:20   Log-Likelihood:                -17998.
No. Observations:               16512   AIC:                         3.601e+04
Df Residuals:                   16503   BIC:                         3.608e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0679      0.006    369.104      0.0

In [7]:
# The regression summary statistics table for the estimated coefficients has already been printed in the previous task.
# Let's print it again for clarity and discuss the economic results based on the regression table.

# Re-print the regression summary statistics table
print(summary)

# Discussing the economic results based on the regression table:
# 1. MedInc (Median Income): The coefficient is 0.8524, which is highly significant (p-value < 0.0001). This indicates that an increase in median income is strongly associated with an increase in house prices. Economically, this makes sense as higher income levels generally lead to higher purchasing power and demand for housing.

# 2. HouseAge: The coefficient is 0.1224, which is also highly significant (p-value < 0.0001). This suggests that older houses tend to have higher prices, possibly due to their established locations or historical value.

# 3. AveRooms (Average Rooms per Dwelling): The coefficient is -0.3051, which is significant (p-value < 0.0001). This negative relationship might seem counterintuitive, but it could indicate that houses with more rooms might be older or in less desirable locations.

# 4. AveBedrms (Average Bedrooms per Dwelling): The coefficient is 0.3711, which is significant (p-value < 0.0001). This positive relationship suggests that houses with more bedrooms tend to be more expensive, which aligns with economic expectations.

# 5. Population: The coefficient is -0.0023, which is not significant (p-value = 0.699). This indicates that population density does not have a significant impact on house prices in this dataset.

# 6. AveOccup (Average Occupancy per Dwelling): The coefficient is -0.0366, which is significant (p-value < 0.0001). This negative relationship suggests that higher occupancy rates might be associated with lower house prices, possibly due to overcrowding or lower quality of living conditions.

# 7. Latitude: The coefficient is -0.8966, which is highly significant (p-value < 0.0001). This indicates that houses located further north tend to be less expensive, which could be due to various regional economic factors.

# 8. Longitude: The coefficient is -0.8689, which is highly significant (p-value < 0.0001). This suggests that houses located further west tend to be less expensive, which could be due to regional economic factors or proximity to less desirable areas.

# Overall, the model explains about 61.3% of the variance in house prices (R-squared = 0.613), which is a reasonable fit for this type of data.


                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.612
Method:                 Least Squares   F-statistic:                     3261.
Date:                Wed, 04 Sep 2024   Prob (F-statistic):               0.00
Time:                        14:43:20   Log-Likelihood:                -17998.
No. Observations:               16512   AIC:                         3.601e+04
Df Residuals:                   16503   BIC:                         3.608e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0679      0.006    369.104      0.0

In [8]:
# Discussing the economic results based on the regression tables

# The regression summary table provides several key insights into the relationship between the features and the target variable (house prices).

# 1. **R-squared and Adjusted R-squared**:
#    - R-squared: 0.613
#    - Adjusted R-squared: 0.612
#    These values indicate that approximately 61.3% of the variance in house prices can be explained by the model. This is a moderate level of explanatory power, suggesting that while the model captures a significant portion of the variance, there are other factors not included in the model that also influence house prices.

# 2. **Coefficients and their significance**:
#    - **MedInc (Median Income)**: The coefficient is 0.8524, which is highly significant (p-value < 0.000). This suggests that an increase in median income is strongly associated with an increase in house prices. Economically, this makes sense as higher income levels typically lead to higher purchasing power and demand for housing.
#    - **HouseAge**: The coefficient is 0.1224, which is also highly significant (p-value < 0.000). This indicates that older houses tend to have higher prices, possibly due to their location in more established neighborhoods.
#    - **AveRooms (Average Rooms per Dwelling)**: The coefficient is -0.3051, which is significant (p-value < 0.000). This negative relationship might seem counterintuitive, but it could be due to multicollinearity with other variables or the fact that larger houses might be located in less desirable areas.
#    - **AveBedrms (Average Bedrooms per Dwelling)**: The coefficient is 0.3711, which is significant (p-value < 0.000). This positive relationship suggests that houses with more bedrooms tend to be more expensive.
#    - **Population**: The coefficient is -0.0023, which is not significant (p-value = 0.699). This indicates that population density does not have a significant impact on house prices in this model.
#    - **AveOccup (Average Occupancy per Household)**: The coefficient is -0.0366, which is significant (p-value < 0.000). This negative relationship suggests that higher occupancy rates are associated with lower house prices, possibly due to overcrowding.
#    - **Latitude**: The coefficient is -0.8966, which is highly significant (p-value < 0.000). This indicates that houses located further north tend to be less expensive.
#    - **Longitude**: The coefficient is -0.8689, which is highly significant (p-value < 0.000). This indicates that houses located further west tend to be less expensive.

# 3. **Economic Interpretation**:
#    - The most economically significant variable is Median Income, which has a strong positive relationship with house prices. This aligns with economic theory, as higher income levels increase the ability to purchase more expensive homes.
#    - The negative coefficients for Latitude and Longitude suggest that geographic location plays a crucial role in determining house prices, with certain areas being more desirable than others.
#    - The significance of HouseAge and AveBedrms indicates that both the age and the number of bedrooms in a house are important factors in determining its price.

# Overall, the regression model provides valuable insights into the factors that influence house prices in California. However, it is important to consider that the model explains only 61.3% of the variance, indicating that other factors not included in the model also play a significant role in determining house prices.
