# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train a random-forest regressor that predicts droplet size from formulation
# factors, report its hold-out error, and plot the learned feature importances.

# 1. Load your dataset
# The CSV must provide every column named in `features` and `target` below,
# e.g. 'Oil_Content', 'Surfactant_Conc', 'HLB', 'Sonication_Time', 'Droplet_Size'.
# NOTE(review): this path looks like Colab's bundled California-housing sample,
# which presumably lacks the formulation columns -- point it at your own CSV.
data_path = '/content/sample_data/california_housing_test.csv'
df = pd.read_csv(data_path)

# 2. Define Features (X) and Target (y)
# Adjust these strings to match your exact CSV header names
features = ['Oil_Content', 'Surfactant_Conc', 'HLB', 'Sonication_Time']
target = 'Droplet_Size'

# Fail early with an actionable message instead of an opaque pandas KeyError
# when the loaded file does not contain the expected headers.
missing_columns = [col for col in [*features, target] if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{data_path} is missing required column(s) {missing_columns}; "
        f"available columns: {list(df.columns)}"
    )

X = df[features]
y = df[target]

# 3. Split the data into Training and Testing sets (80/20 split)
# A fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Initialize and Train the Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5. Make Predictions on the held-out test rows
y_pred = model.predict(X_test)

# 6. Evaluate the Model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

# 7. Visualize Feature Importance
importances = model.feature_importances_
# argsort is ascending, so the most important feature is drawn last and
# therefore ends up as the top bar of the horizontal chart.
indices = np.argsort(importances)
positions = range(len(indices))

plt.figure(figsize=(10, 6))
plt.title('Impact of Formulation Factors on Droplet Size')
plt.barh(positions, importances[indices], color='b', align='center')
plt.yticks(positions, [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()