In [3]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('processes2.csv')
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

In [6]:
X=df.drop(columns=['selling_price'])
Y=df['selling_price']


num_cols=X.select_dtypes(include=['float64','int64']).columns
cat_cols=X.select_dtypes(include=['object']).columns
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


In [7]:


preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ]
)
X_transformed = preprocessor.fit_transform(X)

onehot_columns = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(cat_cols)
all_columns = list(num_cols) + list(onehot_columns)

X = pd.DataFrame(X_transformed, columns=all_columns)


In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=75)

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
model = RandomForestRegressor(n_estimators=100,random_state=42)
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test,Y_pred)
r2 = r2_score(Y_test,Y_pred)

In [10]:
print(f"mean_squared_error:{mse}")
print(f"r2_score: {r2}")

mean_squared_error:5676928162.631129
r2_score: 0.8902467093900217
