In [22]:
import warnings
warnings.filterwarnings('ignore')

In [23]:
import numpy as np
import pandas as pd
# Parameters
num_samples = 1000
appliance_types = ['Washing Machine', 'Refrigerator', 'Microwave', 'Dishwasher']
brands = ['BrandA', 'BrandB', 'BrandC']
problem_types = ['Leakage', 'Not Cooling', 'Not Heating', 'Door Issue']  
# The datasets are not availble so it has been created to show the prototype working model.

# Generate synthetic data
np.random.seed(42)
data = {
    'appliance_type': np.random.choice(appliance_types, num_samples),
    'brand': np.random.choice(brands, num_samples),
    'age': np.random.randint(1, 15, num_samples),
    'problem_type': np.random.choice(problem_types, num_samples),
    'repair_cost': np.random.randint(50, 500, num_samples)
}

# Convert to DataFrame
df = pd.DataFrame(data)
print(df.head())


    appliance_type   brand  age problem_type  repair_cost
0        Microwave  BrandB   12   Door Issue           79
1       Dishwasher  BrandC    5   Door Issue          383
2  Washing Machine  BrandA   13   Door Issue          133
3        Microwave  BrandA    1   Door Issue          418
4        Microwave  BrandA    2      Leakage          330


In [24]:
# Handle missing values
df = df.dropna()

# Convert categorical columns to numeric using one-hot encoding
df = pd.get_dummies(df, drop_first=True)


In [25]:
df

Unnamed: 0,age,repair_cost,appliance_type_Microwave,appliance_type_Refrigerator,appliance_type_Washing Machine,brand_BrandB,brand_BrandC,problem_type_Leakage,problem_type_Not Cooling,problem_type_Not Heating
0,12,79,True,False,False,True,False,False,False,False
1,5,383,False,False,False,False,True,False,False,False
2,13,133,False,False,True,False,False,False,False,False
3,1,418,True,False,False,False,False,False,False,False
4,2,330,True,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...
995,9,483,False,False,True,True,False,True,False,False
996,4,471,False,False,True,True,False,False,False,False
997,13,460,False,False,False,True,False,False,True,False
998,12,275,False,False,False,False,False,False,False,True


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Split the data into features and target
X = df.drop('repair_cost', axis=1)
y = df['repair_cost']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
# Initialize the models
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)

# Train the models
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)


In [28]:
# Predict on the test set
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
gb_pred = gb.predict(X_test)

# Evaluate the models
lr_mae = mean_absolute_error(y_test, lr_pred)
rf_mae = mean_absolute_error(y_test, rf_pred)
gb_mae = mean_absolute_error(y_test, gb_pred)

print(f'Linear Regression MAE: {lr_mae}')
print(f'Random Forest MAE: {rf_mae}')
print(f'Gradient Boosting MAE: {gb_mae}')


Linear Regression MAE: 115.94969603522627
Random Forest MAE: 124.79346401390278
Gradient Boosting MAE: 118.93785363985546


In [29]:
import pandas as pd

# Use the best model to make predictions on new data
best_model = gb  # Assuming Gradient Boosting performed the best

# Example new data
new_data = pd.DataFrame({
    'appliance_type': ['Washing Machine'],
    'brand': ['BrandX'],
    'age': [2],
    'problem_type': ['Leakage'],
    # Add other features as required
})

# Preprocess new data similarly to training data
new_data = pd.get_dummies(new_data, drop_first=True)

# Ensure the new data has the same columns as training data
missing_cols = set(X.columns) - set(new_data.columns)
for col in missing_cols:
    new_data[col] = 0

# Reorder the columns to match the training data
new_data = new_data[X.columns]

# Predict the repair cost
predicted_cost = best_model.predict(new_data)
print(f'Predicted Repair Cost: {predicted_cost[0]}')


Predicted Repair Cost: 246.70212914239886
