In [20]:
#Step 1 
# Basic data handling
import pandas as pd
import numpy as np

# Modeling and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [21]:
#Step 2 
# Load your dataset (replace 'your_file.csv' with your file path)
df = pd.read_csv('combined_output.csv')

In [22]:
# Step 3 - Clean and impute missing values
columns_to_keep = ['Name', 'Age', 'Gender', 'Address', 'Brand', 'Frame Shape']
df = df[columns_to_keep]

# Fill missing Age with random value from min to max
if df['Age'].isna().sum() > 0:
    min_age = int(df['Age'].min())
    max_age = int(df['Age'].max())
    df['Age'] = df['Age'].apply(lambda x: np.random.randint(min_age, max_age + 1) if pd.isna(x) else x)

# Function to randomly impute missing categorical values with existing non-null values
def random_fillna(series):
    non_null_values = series.dropna().values
    return series.apply(lambda x: np.random.choice(non_null_values) if pd.isna(x) else x)

# Apply random imputation for each categorical/text column
df['Gender'] = random_fillna(df['Gender'])
df['Address'] = random_fillna(df['Address'])
df['Brand'] = random_fillna(df['Brand'])
df['Frame Shape'] = random_fillna(df['Frame Shape'])

# Convert Age to integer (ensures dtype consistency)
df['Age'] = df['Age'].astype(int)

# Strip leading/trailing whitespaces
df['Gender'] = df['Gender'].str.strip()
df['Address'] = df['Address'].str.strip()
df['Brand'] = df['Brand'].str.strip()
df['Frame Shape'] = df['Frame Shape'].str.strip()


In [23]:
#Step 4 
# Features and targets
X = df[['Age', 'Gender', 'Address']]
y_brand = df['Brand']
y_frame = df['Frame Shape']

# Label encode targets
brand_le = LabelEncoder()
frame_le = LabelEncoder()

y_brand_enc = brand_le.fit_transform(y_brand)
y_frame_enc = frame_le.fit_transform(y_frame)

# Split the data
X_train, X_test, yb_train, yb_test = train_test_split(X, y_brand_enc, test_size=0.2, random_state=42)
_, _, yf_train, yf_test = train_test_split(X, y_frame_enc, test_size=0.2, random_state=42)



In [24]:
#Step 5
# Columns for transformation
categorical_cols = ['Gender', 'Address']
numeric_cols = ['Age']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

# Pipelines
brand_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=50,max_depth=10,min_samples_leaf=5,random_state=42))
])

frame_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [25]:
#Step 6 
# Train both models
brand_model.fit(X_train, yb_train)
frame_model.fit(X_train, yf_train)

In [26]:
#Step 7
# Predict
yb_pred = brand_model.predict(X_test)
yf_pred = frame_model.predict(X_test)

# Accuracy
print("Brand Accuracy:", accuracy_score(yb_test, yb_pred))
print("Frame Shape Accuracy:", accuracy_score(yf_test, yf_pred))

# Reports
print("\nBrand Report:\n", classification_report(yb_test, yb_pred))
print("\nFrame Shape Report:\n", classification_report(yf_test, yf_pred))

Brand Accuracy: 0.11555555555555555
Frame Shape Accuracy: 0.17333333333333334

Brand Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          26       0.00      0.00      0.00         2
          31       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         5
          36       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         1
          40       0.00      0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
#Step 8
joblib.dump(brand_model, 'brand_model.pkl')
joblib.dump(frame_model, 'frame_model.pkl')
joblib.dump(brand_le, 'brand_encoder.pkl')
joblib.dump(frame_le, 'frame_encoder.pkl')

['frame_encoder.pkl']

In [30]:
#Step 9
def suggest_brand_frame(age, gender, address):
    # Load models if not in memory
    brand_model = joblib.load('brand_model.pkl')
    frame_model = joblib.load('frame_model.pkl')
    brand_le = joblib.load('brand_encoder.pkl')
    frame_le = joblib.load('frame_encoder.pkl')

    # Input
    df_input = pd.DataFrame([[age, gender.strip(), address.strip()]], columns=['Age', 'Gender', 'Address'])

    # Predict
    pred_brand = brand_le.inverse_transform(brand_model.predict(df_input))[0]
    pred_frame = frame_le.inverse_transform(frame_model.predict(df_input))[0]

    return pred_brand, pred_frame

# Example
suggest_brand_frame(20, 'M', 'Sonepur')

('Fastrack', 'Wafare')