In [563]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [564]:
# Read the labelled training data and the unlabelled test data from the
# Kaggle input directory.
train = pd.read_csv("/kaggle/input/mse2-makeup/train.csv")
test = pd.read_csv("/kaggle/input/mse2-makeup/test.csv")

In [565]:
# Summarise the training frame: column dtypes, non-null counts and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7716 entries, 0 to 7715
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   size (cm)      7181 non-null   float64
 1   shape          6943 non-null   object 
 2   weight (g)     7032 non-null   float64
 3   avg_price (₹)  7104 non-null   float64
 4   color          7283 non-null   object 
 5   taste          7257 non-null   object 
 6   fruit_name     7316 non-null   object 
dtypes: float64(3), object(4)
memory usage: 422.1+ KB


In [566]:
# Count missing values per column in the training frame.
train.isna().sum()

size (cm)        535
shape            773
weight (g)       684
avg_price (₹)    612
color            433
taste            459
fruit_name       400
dtype: int64

In [567]:
# Count missing values per column in the test frame.
test.isna().sum()

id               0
size (cm)        0
shape            0
weight (g)       0
avg_price (₹)    0
color            0
taste            0
dtype: int64

In [568]:
# Keep the row identifiers for the submission file and remove them from the
# test frame, since the training frame has no "id" column.
# The guard makes this cell safe to re-run: once "id" has been dropped, a
# second execution is a no-op instead of raising KeyError on test["id"].
if "id" in test.columns:
    test_id = test["id"]
    test = test.drop(columns=["id"])

In [569]:
# Rows without a target label cannot be used for supervised training;
# keep only the rows where 'fruit_name' is present.
train = train[train['fruit_name'].notna()]

In [570]:
# Number of labelled rows remaining after the drop.
len(train['fruit_name'])


7316

In [571]:
# Re-check missing values per column now that unlabelled rows are gone.
train.isna().sum()

size (cm)        510
shape            738
weight (g)       651
avg_price (₹)    584
color            413
taste            441
fruit_name         0
dtype: int64

In [572]:
# Separate the target column from the predictor columns.
y = train["fruit_name"]
X = train.drop(columns="fruit_name")

In [573]:
# Hold out 20% of the labelled rows for validation (fixed seed for
# reproducibility). Note: X_test / y_test are this validation split, not the
# competition `test` frame loaded from test.csv.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [574]:
# Route every feature column to exactly one preprocessing branch.
# 'number' matches any numeric dtype (not only int64/float64), and the
# categorical list is its complement, so no column can fall through both
# filters and be silently dropped by the ColumnTransformer's default
# remainder handling.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

In [575]:
# Numeric branch: fill missing values with the column median, then standardise.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored rather than
# raising an error.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

In [576]:
# Apply each branch to its own subset of columns and concatenate the outputs.
preprocessing = ColumnTransformer([
    ("num", numerical_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
])

In [577]:
# Random forest classifier: 300 bootstrapped trees grown without a depth
# limit, considering sqrt(n_features) candidates per split. The seed is fixed
# for reproducibility and n_jobs=-1 uses all available cores.
model = RandomForestClassifier(
    n_estimators=300,
    max_features="sqrt",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

In [578]:
# Chain preprocessing and the classifier so both are fitted on the training
# split only and applied consistently at prediction time.
pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", model),
])

In [579]:

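# Disabled experiment: integer-encode the target (used by the commented-out
# log-loss / predict_proba path further down).
# NOTE: LabelEncoder is not imported in the imports cell; re-enabling this
# requires `from sklearn.preprocessing import LabelEncoder`.
# le = LabelEncoder()
# y_train_enc = le.fit_transform(y_train)  # fit on train
# y_test_enc = le.transform(y_test)        # transform test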

In [580]:
# Fit preprocessing and classifier together on the training split, passing
# the raw string labels directly (no label-encoding step).
pipeline.fit(X_train, y_train)

In [581]:
# Hard class predictions for the held-out validation split.
y_pred = pipeline.predict(X_test)

In [582]:
# Fraction of held-out rows whose predicted label matches the true label.
accu = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accu)

0.9938524590163934


In [583]:
# Predict a fruit label for every row of the competition test frame.
y_final = pipeline.predict(test)

In [584]:
# Display the predicted labels for the test frame.
y_final

array(['pomegranate', 'apple', 'pomegranate', ..., 'apple', 'plum',
       'custard apple'], dtype=object)

In [585]:
# Assemble the submission: one row per test id with its predicted label,
# written without the DataFrame index.
submission = pd.DataFrame({'id': test_id})
submission['fruit_name'] = y_final
submission.to_csv("submission_final.csv", index=False)