In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Column roles used throughout this example.
TEXT_COLUMN = 'description'
NUMERIC_COLUMN = 'other_feature'
TARGET_COLUMN = 'price'

# Toy dataset: one free-text column, one numeric column, and a numeric target.
data = {
    TEXT_COLUMN: [
        'Great product with fast shipping!',
        'Not worth the money',
        'Excellent condition, highly recommended',
    ],
    NUMERIC_COLUMN: [10, 15, 12],
    TARGET_COLUMN: [100, 50, 200],
}
df = pd.DataFrame(data)

# Separate the model inputs from the value being predicted.
X = df[[TEXT_COLUMN, NUMERIC_COLUMN]]
y = df[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Per-column preprocessing. The text column is given as a bare string (not a
# list) so the vectorizer receives a 1-D sequence of documents, while the
# numeric column is given as a list so the scaler receives a 2-D input.
text_step = ('tfidf', TfidfVectorizer(), TEXT_COLUMN)
numeric_step = ('scaler', StandardScaler(), [NUMERIC_COLUMN])
preprocessor = ColumnTransformer(transformers=[text_step, numeric_step])

# Chain preprocessing and a Ridge regressor into a single estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge()),
    ]
)

# Train on the training rows, then predict on the held-out rows
# (fit() returns the fitted pipeline, so the calls can be chained).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the error on the held-out rows.
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')


Mean Squared Error: 12656.25
