In [7]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#创建模拟数据
#设置随机种子以获得可复现的结果
np.random.seed(0)
X=np.random.rand(100,1)
y=3*X.squeeze()+2+np.random.randn(100)*0.5  # 生成目标变量，3*X + 2 + 噪声 

#划分训练集和测试集
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#创建线性回归模型
model=LinearRegression()

#拟合模拟
model.fit(X_train,y_train)

#进行预测
y_pred=model.predict(X_test)

#评估模型
mse=mean_squared_error(y_test,y_pred)
print(f"Mean Squared Error:{mse}")

#使用模型进行新数据点的预测
new_data=np.array([[0.5]]).reshape(-1,1)
new_prediction=model.predict(new_data)

print(f"Prediction for new data point {new_data}:{new_prediction}")

Mean Squared Error:0.22943831174285728
Prediction for new data point [[0.5]]:[3.59829964]


In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV,cross_val_score,KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

#创建虚构数据集
np.random.seed(42)
X=np.random.rand(100,3)
y=2*X[:,0]+3*X[:,1]+4+np.random.randn(100)*0.5 #添加一些噪声
data=pd.DataFrame(X,columns=['feature1','feature2','feature3'])
data['target']=y

#特征工程,处理缺失值和特征缩放
numeric_transformer=SimpleImputer(strategy='mean')  #使用均值填充缺失值
preprocessor=ColumnTransformer(transformers=[('num',numeric_transformer,['feature1','feature2','feature3'])],remainder='passthrough')

#创建带有预处理步骤的管道
pipeline=Pipeline(steps=[('preprocessor',preprocessor),
                         ('scaler',StandardScaler()),
                         ('model',LinearRegression())])

#模型调优
param_grid={'model__fit__intercept':[True,False],'model__normalize':[True,False]}
grid_search=GridSearchCV(pipeline,param_grid,cv=5)

grid_search.fit(data.drop('target', axis=1), data['target'])  
best_pipeline=grid_search.best_estimator_

# 交叉验证  
kfold = KFold(n_splits=5, shuffle=True, random_state=42)  
scores = cross_val_score(best_pipeline, data.drop('target', axis=1), data['target'], cv=kfold)  
print("Cross-validation scores:", scores)  
print("Mean cross-validation score:", np.mean(scores))  

# 评估指标  
y_true = data['target']  
y_pred = best_pipeline.predict(data.drop('target', axis=1))  
mse = mean_squared_error(y_true, y_pred)  
mae = mean_absolute_error(y_true, y_pred)  
r2 = r2_score(y_true, y_pred)  
print("MSE:", mse)  
print("MAE:", mae)  
print("R-squared:", r2)

ValueError: Invalid parameter 'fit' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].