In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [50]:
df = pd.read_csv('acne_risk_dataset.csv')

In [51]:
df.head()

Unnamed: 0,age,gender,diet,sleep_hours,water_intake_liters,smoking_or_vaping,acne_risk,weight_kg
0,33,male,unhealthy,7.4,2.4,no,0,55.0
1,23,female,healthy,8.0,0.8,no,0,49.2
2,18,male,unhealthy,4.6,3.2,no,1,70.0
3,34,male,healthy,7.6,2.0,yes,0,73.9
4,34,male,unhealthy,6.5,1.4,yes,1,68.1


In [52]:
X = df.drop('acne_risk', axis=1)
y = df['acne_risk']

In [53]:
categorical_data = ["gender", "diet", "smoking_or_vaping"] 
numerical_data = ["age", "sleep_hours", "water_intake_liters", "weight_kg"]

In [54]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_data),
        ("num", "passthrough", numerical_data)
    ]
)

In [55]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

In [57]:
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.6

In [59]:
import pickle

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)