In [1]:
# Import necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import time
from sklearn.datasets import load_boston

# Load the Boston house price dataset
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_df['MEDV'] = boston.target

# Exploratory Data Analysis
# 1. Summary statistics for numerical columns
summary_stats = boston_df.describe()
print("Summary Statistics for Numerical Columns:")
print(summary_stats)

# 2. Check for missing values
missing_values = boston_df.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# 3. Distinguish column types
numerical_columns = boston_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = boston_df.select_dtypes(exclude=[np.number]).columns.tolist()
print("\nNumerical Columns:", numerical_columns)
print("Categorical Columns:", categorical_columns)

# 4. Correlation matrix for numerical columns
correlation_matrix = boston_df.corr()

# Save correlation matrix heatmap
timestamp = int(time.time())
save_dir = Path("/Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image")
save_dir.mkdir(parents=True, exist_ok=True)
file_name = f'correlation_matrix_{timestamp}.png'
file_path = save_dir / file_name
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig(file_path)
plt.clf()
print(f'Image saved to: {file_path}')


ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import time
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset
housing = fetch_california_housing()
housing_df = pd.DataFrame(housing.data, columns=housing.feature_names)
housing_df['MedHouseVal'] = housing.target

# Exploratory Data Analysis
# 1. Summary statistics for numerical columns
summary_stats = housing_df.describe()
print("Summary Statistics for Numerical Columns:")
print(summary_stats)

# 2. Check for missing values
missing_values = housing_df.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# 3. Distinguish column types
numerical_columns = housing_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = housing_df.select_dtypes(exclude=[np.number]).columns.tolist()
print("\nNumerical Columns:", numerical_columns)
print("Categorical Columns:", categorical_columns)

# 4. Correlation matrix for numerical columns
correlation_matrix = housing_df.corr()

# Save correlation matrix heatmap
timestamp = int(time.time())
save_dir = Path("/Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image")
save_dir.mkdir(parents=True, exist_ok=True)
file_name = f'correlation_matrix_{timestamp}.png'
file_path = save_dir / file_name
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig(file_path)
plt.clf()
print(f'Image saved to: {file_path}')

Summary Statistics for Numerical Columns:
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.563400     18.000000      4.440716      1.006079    787.000000   
50%        3.534800     29.000000      5.229129      1.048780   1166.000000   
75%        4.743250     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude   MedHouseVal  
count  20640.000000  20640.000000  20640.000000  20640.000000  
mean       3.070655     35.631861   -119.569704      2.068558  
std       10.386050      2.135952      2.003532      1

Image saved to: /Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image/correlation_matrix_1725453462.png


<Figure size 1200x800 with 0 Axes>

In [3]:
from metagpt.tools.libs.data_preprocess import get_column_info

# Assuming 'housing_df' is the DataFrame variable from 'Finished Tasks'
column_info = get_column_info(housing_df)
print("column_info")
print(column_info)


2024-09-04 20:37:47.555 | INFO     | metagpt.const:get_metagpt_package_root:21 - Package root set to /Users/tuozhou/Desktop/RA/SZRI/ChatPilot


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal'], 'Datetime': [], 'Others': []}


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Splitting the data into training and testing sets
X = housing_df.drop('MedHouseVal', axis=1)
y = housing_df['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Adding a constant to the model (intercept)
X_train_const = sm.add_constant(X_train)

# Fitting the linear regression model
model = sm.OLS(y_train, X_train_const).fit()

# Getting the summary of the model
summary = model.summary()

# Printing the summary
print(summary)

# Discussing the economic results
# The coefficients of the regression model provide insights into the relationship 
# between the features and the target variable (median house value). 
# For instance, the coefficient for 'MedInc' (Median Income) might be high, 
# indicating that an increase in median income is strongly associated with an 
# increase in house prices. Similarly, other coefficients would provide insights 
# into how other features affect house prices.

# Note: For a more accurate economic interpretation, it is important to consider 
# the magnitude, sign, and statistical significance (p-values) of the coefficients 
# along with domain knowledge.


                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.612
Method:                 Least Squares   F-statistic:                     3261.
Date:                Wed, 04 Sep 2024   Prob (F-statistic):               0.00
Time:                        20:37:55   Log-Likelihood:                -17998.
No. Observations:               16512   AIC:                         3.601e+04
Df Residuals:                   16503   BIC:                         3.608e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -37.0233      0.728    -50.835      0.0

In [5]:
from metagpt.tools.libs.data_preprocess import get_column_info

# Assuming housing_df is the DataFrame variable from 'Finished Tasks'
column_info = get_column_info(housing_df)
print("column_info")
print(column_info)


column_info
{'Category': [], 'Numeric': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal'], 'Datetime': [], 'Others': []}


In [6]:
# Import necessary libraries
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Split the data into features and target variable
X = housing_df.drop('MedHouseVal', axis=1)
y = housing_df['MedHouseVal']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a Gradient Boosting Regressor model
gbr = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=4, random_state=42)
gbr.fit(X_train, y_train)

# Make predictions
y_train_pred = gbr.predict(X_train)
y_test_pred = gbr.predict(X_test)

# Calculate performance metrics
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Print performance metrics
print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training R^2: {train_r2}")
print(f"Testing R^2: {test_r2}")

# Feature importance
feature_importance = gbr.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
timestamp = int(time.time())
save_dir = Path("/Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image")
save_dir.mkdir(parents=True, exist_ok=True)
file_name = f'feature_importance_{timestamp}.png'
file_path = save_dir / file_name

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance')
plt.savefig(file_path)
plt.clf()
print(f'Image saved to: {file_path}')


Training MSE: 0.11935461834406613
Testing MSE: 0.21870825346106096
Training R^2: 0.9107147189478076
Testing R^2: 0.8330992407006734
Image saved to: /Users/tuozhou/Desktop/RA/SZRI/ML_Assistant/data/output/image/feature_importance_1725453507.png


<Figure size 1200x800 with 0 Axes>

In [7]:
# Evaluate the linear regression model on the test data
X_test_const = sm.add_constant(X_test)
y_pred = model.predict(X_test_const)

# Calculate Mean Squared Error and R^2 score
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print("Linear Regression Testing MSE:", test_mse)
print("Linear Regression Testing R^2:", test_r2)

# Print the regression summary statistics table
print(summary)


Linear Regression Testing MSE: 1529.399378989649
Linear Regression Testing R^2: -1166.1160716883392
                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.612
Method:                 Least Squares   F-statistic:                     3261.
Date:                Wed, 04 Sep 2024   Prob (F-statistic):               0.00
Time:                        20:37:55   Log-Likelihood:                -17998.
No. Observations:               16512   AIC:                         3.601e+04
Df Residuals:                   16503   BIC:                         3.608e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [8]:
# Discussing the economic results based on the regression tables

# The regression summary table provides us with several key pieces of information:
# 1. Coefficients (coef): These represent the estimated change in the dependent variable (MedHouseVal) for a one-unit change in the predictor variable, holding all other predictors constant.
# 2. Standard Errors (std err): These measure the average amount that the coefficient estimates vary from the actual average value of our response variable.
# 3. t-values and P>|t|: These help us determine the significance of each predictor. A low p-value (< 0.05) indicates that we can reject the null hypothesis and conclude that the predictor is statistically significant.
# 4. Confidence Intervals [0.025, 0.975]: These provide a range of values which are believed to contain the true value of the coefficient with a certain level of confidence (usually 95%).

# Let's interpret the coefficients:
# - MedInc (Median Income): The coefficient is 0.4487, which means that for every one-unit increase in median income, the median house value increases by approximately 0.4487 units, holding all other variables constant. This is highly significant (p-value < 0.000).
# - HouseAge: The coefficient is 0.0097, indicating that older houses tend to have slightly higher values, though the effect is relatively small.
# - AveRooms (Average Rooms per Dwelling): The coefficient is -0.1233, suggesting that an increase in the average number of rooms per dwelling is associated with a decrease in median house value. This might seem counterintuitive, but it could be due to multicollinearity or other underlying factors.
# - AveBedrms (Average Bedrooms per Dwelling): The coefficient is 0.7831, indicating that more bedrooms per dwelling are associated with higher house values.
# - Population: The coefficient is very close to zero and not statistically significant, suggesting that population size does not have a meaningful impact on house values in this model.
# - AveOccup (Average Occupancy per Household): The coefficient is -0.0035, indicating that higher occupancy rates are associated with slightly lower house values.
# - Latitude and Longitude: Both have negative coefficients, indicating that houses located further north and west tend to have lower values. This could be due to various regional economic factors.

# Overall, the model explains about 61.3% of the variance in house prices (R-squared = 0.613), which is a decent fit for a linear regression model.

# The Gradient Boosting Regressor (GBR) results show a much better fit with a training R^2 of 0.91 and a testing R^2 of 0.83, indicating that it captures the non-linear relationships in the data better than the linear model.

# The feature importance from the GBR can provide additional insights into which features are most influential in predicting house prices.

# Note: The negative R^2 value for the linear regression on the test set suggests that the model is performing poorly on unseen data, indicating potential overfitting or that a linear model is not suitable for this dataset.

# The economic interpretation of these results should consider the context of the housing market in California, where factors like income, house age, and location play significant roles in determining house prices.
