In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

# Load the Boston house price dataset
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['PRICE'] = boston.target

# Distinguish column types
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Display basic statistics
print(df.describe())

# Correlation matrix
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Pairplot for selected features
selected_features = ['RM', 'LSTAT', 'PTRATIO', 'PRICE']
sns.pairplot(df[selected_features])
plt.show()

# Distribution of the target variable
sns.histplot(df['PRICE'], kde=True)
plt.title('Distribution of House Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Boxplot for numeric features
plt.figure(figsize=(15, 10))
df[numeric_cols].boxplot()
plt.xticks(rotation=90)
plt.title('Boxplot of Numeric Features')
plt.show()


ModuleNotFoundError: No module named 'seaborn'

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

# Load the Boston house price dataset
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['PRICE'] = boston.target

# Distinguish column types
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Summary statistics for numeric columns
numeric_summary = df[numeric_cols].describe()

# Correlation matrix
correlation_matrix = df.corr()

# Visualizations
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Pairplot for selected features
selected_features = ['RM', 'LSTAT', 'PTRATIO', 'PRICE']
sns.pairplot(df[selected_features])
plt.show()

# Distribution of the target variable
plt.figure(figsize=(8, 6))
sns.histplot(df['PRICE'], kde=True)
plt.title('Distribution of House Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Boxplot for numeric features
plt.figure(figsize=(15, 10))
df[numeric_cols].boxplot()
plt.xticks(rotation=90)
plt.title('Boxplot for Numeric Features')
plt.show()

# Output the summary statistics and correlation matrix
numeric_summary, correlation_matrix


ModuleNotFoundError: No module named 'seaborn'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

# Load the Boston house price dataset
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame

# Distinguish column types
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Summary statistics for numeric columns
numeric_summary = df[numeric_cols].describe()

# Correlation matrix
correlation_matrix = df.corr()

# Visualizations
plt.figure(figsize=(12, 8))
plt.matshow(correlation_matrix, fignum=1)
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=90)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
plt.colorbar()
plt.title('Correlation Matrix', pad=20)
plt.show()

# Pairplot for selected features
selected_features = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
pd.plotting.scatter_matrix(df[selected_features], figsize=(12, 12))
plt.show()

# Distribution of the target variable
plt.figure(figsize=(8, 6))
df['MEDV'].hist(bins=30, edgecolor='black')
plt.title('Distribution of House Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Boxplot for numeric features
plt.figure(figsize=(15, 10))
df[numeric_cols].boxplot()
plt.xticks(rotation=90)
plt.title('Boxplot for Numeric Features')
plt.show()

# Output the summary statistics and correlation matrix
numeric_summary, correlation_matrix

ModuleNotFoundError: No module named 'matplotlib'

In [3]:
import pandas as pd
import numpy as np

try:
    import seaborn as sns
    import matplotlib.pyplot as plt
    seaborn_available = True
except ImportError:
    seaborn_available = False
    print('Seaborn is not available. Some visualizations will be skipped.')

try:
    import matplotlib.pyplot as plt
    matplotlib_available = True
except ImportError:
    matplotlib_available = False
    print('Matplotlib is not available. Some visualizations will be skipped.')

from sklearn.datasets import fetch_openml

# Load the Boston house price dataset
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame

# Distinguish column types
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Summary statistics for numeric columns
numeric_summary = df[numeric_cols].describe()

# Correlation matrix
correlation_matrix = df.corr()

# Visualizations
if matplotlib_available:
    plt.figure(figsize=(12, 8))
    if seaborn_available:
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    else:
        plt.matshow(correlation_matrix, fignum=1)
        plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=90)
        plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
        plt.colorbar()
    plt.title('Correlation Matrix')
    plt.show()

    # Pairplot for selected features
    selected_features = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
    if seaborn_available:
        sns.pairplot(df[selected_features])
    else:
        pd.plotting.scatter_matrix(df[selected_features], figsize=(12, 12))
    plt.show()

    # Distribution of the target variable
    plt.figure(figsize=(8, 6))
    if seaborn_available:
        sns.histplot(df['MEDV'], kde=True)
    else:
        df['MEDV'].hist(bins=30, edgecolor='black')
    plt.title('Distribution of House Prices')
    plt.xlabel('Price')
    plt.ylabel('Frequency')
    plt.show()

    # Boxplot for numeric features
    plt.figure(figsize=(15, 10))
    df[numeric_cols].boxplot()
    plt.xticks(rotation=90)
    plt.title('Boxplot for Numeric Features')
    plt.show()

# Output the summary statistics and correlation matrix
numeric_summary, correlation_matrix

Seaborn is not available. Some visualizations will be skipped.
Matplotlib is not available. Some visualizations will be skipped.


  warn(


(             CRIM          ZN       INDUS         NOX          RM         AGE  \
 count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
 mean     3.613524   11.363636   11.136779    0.554695    6.284634   68.574901   
 std      8.601545   23.322453    6.860353    0.115878    0.702617   28.148861   
 min      0.006320    0.000000    0.460000    0.385000    3.561000    2.900000   
 25%      0.082045    0.000000    5.190000    0.449000    5.885500   45.025000   
 50%      0.256510    0.000000    9.690000    0.538000    6.208500   77.500000   
 75%      3.677083   12.500000   18.100000    0.624000    6.623500   94.075000   
 max     88.976200  100.000000   27.740000    0.871000    8.780000  100.000000   
 
               DIS         TAX     PTRATIO           B       LSTAT        MEDV  
 count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000  
 mean     3.795043  408.237154   18.455534  356.674032   12.653063   22.532806  
 std      2.10571

In [4]:
from metagpt.tools.libs.data_preprocess import get_column_info

column_info = get_column_info(df)
print("column_info")
print(column_info)


2024-08-29 21:42:23.646 | INFO     | metagpt.const:get_metagpt_package_root:21 - Package root set to /Users/tuozhou/Desktop/RA/SZRI/ChatPilot


column_info
{'Category': [], 'Numeric': ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'], 'Datetime': [], 'Others': ['CHAS', 'RAD']}


In [5]:
# Import necessary tools
from metagpt.tools.libs.data_preprocess import StandardScale
from metagpt.tools.libs.feature_engineering import GeneralSelection

# Copy the DataFrame before processing
df_copy = df.copy()

# Initialize the GeneralSelection tool to drop NaN features and features with only one unique value
general_selection = GeneralSelection(label_col='MEDV')
df_processed = general_selection.fit_transform(df_copy)

# Initialize the StandardScale tool to standardize numeric features
numeric_features = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']
standard_scaler = StandardScale(features=numeric_features)
df_processed = standard_scaler.fit_transform(df_processed)

# Display the processed DataFrame
df_processed.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,-0.419782,0.28483,-1.287909,0,-0.144217,0.413672,-0.120013,0.140214,1,-0.666608,-1.459,0.441052,-1.075562,24.0
1,-0.417339,-0.487722,-0.593381,0,-0.740262,0.194274,0.367166,0.55716,2,-0.987329,-0.303094,0.441052,-0.492439,21.6
2,-0.417342,-0.487722,-0.593381,0,-0.740262,1.282714,-0.265812,0.55716,2,-0.987329,-0.303094,0.396427,-1.208727,34.7
3,-0.41675,-0.487722,-1.306878,0,-0.835284,1.016303,-0.809889,1.077737,3,-1.106115,0.113032,0.416163,-1.361517,33.4
4,-0.412482,-0.487722,-1.306878,0,-0.835284,1.228577,-0.51118,1.077737,3,-1.106115,0.113032,0.441052,-1.026501,36.2


In [6]:
from metagpt.tools.libs.data_preprocess import get_column_info

column_info = get_column_info(df_processed)
print("column_info")
print(column_info)


column_info
{'Category': [], 'Numeric': ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'], 'Datetime': [], 'Others': ['CHAS', 'RAD']}


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets
X = df_processed.drop(columns=['MEDV'])
y = df_processed['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training R2: {train_r2}")
print(f"Testing R2: {test_r2}")


TypeError: can't multiply sequence by non-int of type 'float'

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# One-hot encode categorical features
categorical_cols = ['CHAS', 'RAD']
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_features = encoder.fit_transform(df_processed[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate encoded features
df_processed = df_processed.drop(columns=categorical_cols)
df_processed = pd.concat([df_processed, encoded_df], axis=1)

# Split the data into training and testing sets
X = df_processed.drop(columns=['MEDV'])
y = df_processed['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training R2: {train_r2}")
print(f"Testing R2: {test_r2}")

Training MSE: 20.604025365288944
Testing MSE: 24.818442738481195
Training R2: 0.7628270050876917
Testing R2: 0.6615687658684825




In [9]:
# Evaluate the trained linear regression model

# Plotting the residuals to check for patterns
if matplotlib_available:
    residuals = y_test - y_pred_test
    plt.figure(figsize=(8, 6))
    plt.scatter(y_pred_test, residuals)
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Predicted Values')
    plt.show()

    # Plotting histogram of residuals
    plt.figure(figsize=(8, 6))
    if seaborn_available:
        sns.histplot(residuals, kde=True)
    else:
        plt.hist(residuals, bins=30, edgecolor='black')
    plt.title('Distribution of Residuals')
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.show()

# Evaluate the performance on the test set
evaluation_summary = {
    "Training MSE": train_mse,
    "Testing MSE": test_mse,
    "Training R2": train_r2,
    "Testing R2": test_r2
}
evaluation_summary


{'Training MSE': 20.604025365288944,
 'Testing MSE': 24.818442738481195,
 'Training R2': 0.7628270050876917,
 'Testing R2': 0.6615687658684825}