In [53]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

regressors = [
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
]

df = pd.read_csv('/Users/rohith/Developer/rguhackcode/rguhack/project/mylastdata.csv')

df.dropna(subset=['price'] + ['mileage', 'mpg', 'tax', 'Fuel_Cost_5_years', 'Taxes_Cost_5_years'], inplace=True)

features = ['mileage', 'mpg', 'tax', 'Fuel_Cost_5_years', 'Taxes_Cost_5_years']
target = 'price'

X = df[features]
y = df[target]


In [54]:
# One-hot encoding the 'fuelType', 'make', 'model', and 'transmission' columns
df_encoded = pd.get_dummies(df, columns=['make', 'model', 'transmission', 'fuelType'])

# Now df_encoded has the original columns, plus the one-hot encoded columns
# There is no need to concatenate df with dff or apply LabelEncoder on the one-hot encoded dataframe

# You can check the new dataframe to ensure the encoding is correct
print(df_encoded.head())



   X  year  mileage  engineSize   mpg  tax  price  Taxes_Cost_5_years  \
0  0  2017    15735         1.4  55.4  150  12500                 750   
1  1  2016    36203         2.0  64.2   20  16500                 100   
2  2  2016    29946         1.4  55.4   30  11000                 150   
3  3  2017    25952         2.0  67.3  145  16800                 725   
4  4  2019     1998         1.0  49.6  145  17300                 725   

   Fuel_Cost_5_years  Total_Cost_5_years  ...  model_230  \
0          3343944.0           3357194.0  ...      False   
1          3875112.0           3891712.0  ...      False   
2          3343944.0           3355094.0  ...      False   
3          4062228.0           4079753.0  ...      False   
4          2993856.0           3011881.0  ...      False   

   transmission_Automatic  transmission_Manual  transmission_Other  \
0                   False                 True               False   
1                    True                False              

In [55]:


# Create a label_encoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['make'] = label_encoder.fit_transform(df['make'])
df['model'] = label_encoder.fit_transform(df['model'])
df['transmission'] = label_encoder.fit_transform(df['transmission'])
df



Unnamed: 0,X,make,model,year,fuelType,mileage,engineSize,transmission,mpg,tax,price,Taxes_Cost_5_years,Fuel_Cost_5_years,Total_Cost_5_years
0,0,0,9,2017,Petrol,15735,1.4,1,55.4,150,12500,750,3343944.0,3357194.0
1,1,0,14,2016,Diesel,36203,2.0,0,64.2,20,16500,100,3875112.0,3891712.0
2,2,0,9,2016,Petrol,29946,1.4,1,55.4,30,11000,150,3343944.0,3355094.0
3,3,0,12,2017,Diesel,25952,2.0,0,67.3,145,16800,725,4062228.0,4079753.0
4,4,0,11,2019,Petrol,1998,1.0,1,49.6,145,17300,725,2993856.0,3011881.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92846,99182,8,56,2012,Diesel,74000,2.0,1,58.9,125,5990,625,3555204.0,3561819.0
92847,99183,8,61,2008,Petrol,88102,1.2,1,46.3,145,1799,725,2794668.0,2797192.0
92848,99184,8,61,2009,Petrol,70000,1.4,1,42.0,200,1590,1000,2535120.0,2537710.0
92849,99185,8,61,2006,Petrol,82704,1.2,1,46.3,150,1250,750,2794668.0,2796668.0


In [56]:
from sklearn.model_selection import train_test_split


# Assuming 'target' is the name of your target variable
X = df_encoded.drop(columns=[target])
y = df_encoded[target]

# Splitting the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.6, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, train_size=0.5, random_state=0)

print(y_train.shape)
print(y_valid.shape)
print(X_test.shape)


(55710,)
(18570,)
(18571, 221)


In [57]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score

head = 7  # Number of models to evaluate

for model in regressors[:head]:
    model.fit(X_train, y_train)  # Train the model on the training data
    y_pred_valid = model.predict(X_valid)  # Predict on the validation data
    y_pred_train = model.predict(X_train)  # Predict on the training data for comparison

    # Print the model and various evaluation metrics
    print(model)
    print("\tExplained variance train:", explained_variance_score(y_train, y_pred_train))
    print("\tExplained variance valid:", explained_variance_score(y_valid, y_pred_valid))
    print()
    print("\tMean absolute error:", mean_absolute_error(y_valid, y_pred_valid))
    print('\tMean Squared Error:', mean_squared_error(y_valid, y_pred_valid))
    print("\tR2 score:", r2_score(y_valid, y_pred_valid))
    print()

LinearRegression()
	Explained variance train: 1.0
	Explained variance valid: 1.0

	Mean absolute error: 6.928328181255147e-10
	Mean Squared Error: 8.769877840726894e-19
	R2 score: 1.0

KNeighborsRegressor()
	Explained variance train: 0.9814641097131427
	Explained variance valid: 0.9700736869070823

	Mean absolute error: 905.0090576198169
	Mean Squared Error: 2880323.012555735
	R2 score: 0.9700562765790602

RandomForestRegressor()
	Explained variance train: 0.996278702986255
	Explained variance valid: 0.9728076760813765

	Mean absolute error: 746.617042541734
	Mean Squared Error: 2615794.6582318903
	R2 score: 0.9728063028241518

DecisionTreeRegressor()
	Explained variance train: 1.0
	Explained variance valid: 0.950152565328033

	Mean absolute error: 915.5221324717286
	Mean Squared Error: 4794886.706300485
	R2 score: 0.9501525486057185



In [59]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Assuming 'regressors' is the list of models you want to cross-validate
for model in regressors:
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-scores)
    print(f"Model: {model.__class__.__name__}")
    print(f"RMSE scores: {rmse_scores}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print()


Model: LinearRegression
RMSE scores: [9.34067897e-10 1.56183727e-09 1.72534122e-09 1.09595615e-09
 6.63000610e-10]
Average RMSE: 1.1960406288825543e-09

Model: KNeighborsRegressor
RMSE scores: [6868.2382033  4050.52548061 7206.76560998 3710.99985573 6265.62754653]
Average RMSE: 5620.431339228201

Model: RandomForestRegressor
RMSE scores: [4999.77609013 3321.54467691 6485.68923218 3238.71496648 3593.81433222]
Average RMSE: 4327.907859585115

Model: DecisionTreeRegressor
RMSE scores: [5806.60290169 4269.08551177 7259.20034746 3482.65556954 4515.87598535]
Average RMSE: 5066.684063160976



In [64]:
## car logic
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

In [None]:
#car_type = input car brand name 
#runs ml model to get recomendations    
def predict_for_car_type(car_type):
    selected_cars = df[df['make'] == car_type].copy()
    if selected_cars.empty:
        return pd.DataFrame()
    
    usable_cars = selected_cars[selected_cars['mileage'] <= 8000 - (5 * selected_cars['mpg'])].copy()
    if usable_cars.empty:
        return pd.DataFrame()
    
    predicted_prices = xg_reg.predict(usable_cars[features])
    usable_cars.loc[:, 'predicted_price'] = predicted_prices
    return usable_cars.nsmallest(3, 'predicted_price')


def predict_and_compare(car_type):
    specified_cars = predict_for_car_type(car_type)
    if specified_cars.empty:
        print("No usable cars found for the specified type that are estimated to last for the next 5 years.")
        return
    
    print("Cheapest usable cars for", car_type + ":")
    print(specified_cars[['make', 'model', 'predicted_price']])
    
    best_alternatives = pd.DataFrame(columns=specified_cars.columns)
    for other_car_type in df['make'].unique():
        if other_car_type != car_type:
            alternatives = predict_for_car_type(other_car_type)
            if not alternatives.empty:
                cheapest_alternative = alternatives.nsmallest(1, 'predicted_price')
                best_alternatives = pd.concat([best_alternatives, cheapest_alternative])
    
    if not best_alternatives.empty:
        print("\nBest alternatives from other car types:")
        print(best_alternatives[['make', 'model', 'predicted_price']])
    else:
        print("\nNo better alternatives found from other types.")

car_type = input("Enter the type of car (e.g., audi, bmw, mercedes): ")

predict_and_compare(car_type)