In [6]:
import pandas as pd
import sqlite3
import pickle
import category_encoders as ce 
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [7]:
df = pd.read_csv("CARS_1.csv")
df.head(2)

Unnamed: 0,car_name,reviews_count,fuel_type,engine_displacement,no_cylinder,seating_capacity,transmission_type,fuel_tank_capacity,body_type,rating,starting_price,ending_price,max_torque_nm,max_torque_rpm,max_power_bhp,max_power_rp
0,Maruti Alto K10,51,Petrol,998,3,5.0,Automatic,27.0,Hatchback,4.5,399000,583000,89.0,3500,65.71,5500
1,Maruti Brezza,86,Petrol,1462,4,5.0,Automatic,48.0,SUV,4.5,799000,1396000,136.8,4400,101.65,6000


In [8]:
df.dtypes

car_name                object
reviews_count            int64
fuel_type               object
engine_displacement      int64
no_cylinder              int64
seating_capacity       float64
transmission_type       object
fuel_tank_capacity     float64
body_type               object
rating                 float64
starting_price           int64
ending_price             int64
max_torque_nm          float64
max_torque_rpm           int64
max_power_bhp          float64
max_power_rp             int64
dtype: object

In [9]:
print(df.isnull().sum())

car_name               0
reviews_count          0
fuel_type              0
engine_displacement    0
no_cylinder            0
seating_capacity       1
transmission_type      0
fuel_tank_capacity     0
body_type              0
rating                 0
starting_price         0
ending_price           0
max_torque_nm          0
max_torque_rpm         0
max_power_bhp          0
max_power_rp           0
dtype: int64


In [10]:
df.dropna(inplace=True)
df.isnull().sum()

car_name               0
reviews_count          0
fuel_type              0
engine_displacement    0
no_cylinder            0
seating_capacity       0
transmission_type      0
fuel_tank_capacity     0
body_type              0
rating                 0
starting_price         0
ending_price           0
max_torque_nm          0
max_torque_rpm         0
max_power_bhp          0
max_power_rp           0
dtype: int64

In [11]:
df.seating_capacity = df.seating_capacity.astype(int)

In [15]:
for c in df.columns:
    print(c)
    print(df[c].unique())

car_name
['Maruti Alto K10' 'Maruti Brezza' 'Mahindra Thar' 'Mahindra XUV700'
 'Mahindra Scorpio-N' 'Toyota Fortuner' 'Hyundai Creta' 'Tata Punch'
 'Mahindra Bolero' 'Maruti Swift' 'Tata Nexon' 'Hyundai Venue'
 'Kia Seltos' 'Hyundai Tucson' 'Tata Harrier' 'Maruti Baleno'
 'Toyota Innova Crysta' 'Maruti Ertiga' 'Kia Sonet' 'Maruti Dzire'
 'Honda City 4th Generation' 'Land Rover Range Rover' 'Tata Tiago'
 'Hyundai i20' 'Mahindra XUV300' 'MG Hector' 'Tata Altroz'
 'Mahindra Scorpio Classic' 'Hyundai Verna' 'Citroen C3' 'Honda City'
 'Maruti Celerio' 'Nissan Magnite' 'Renault KWID' 'Land Rover Defender'
 'Maruti Alto 800' 'Hyundai Kona Electric' 'Tata Tigor' 'Renault Kiger'
 'Maruti S-Presso' 'MG Astor' 'Lamborghini Urus' 'Honda Amaze'
 'Renault Triber' 'Hyundai Alcazar' 'Hyundai Aura' 'Volkswagen Virtus'
 'Skoda Slavia' 'Maruti Ignis' 'Mercedes-Benz GLA' 'Volvo XC90'
 'Skoda Kushaq' 'Maruti XL6' 'Jeep Wrangler' 'Maruti Eeco' 'Force Gurkha'
 'Maruti S-Cross' 'Maruti Ciaz' 'Hyundai Santro' 

In [14]:
df['brand'] = df['car_name'].apply(lambda x: x.split()[0])
df.brand.unique()

array(['Maruti', 'Mahindra', 'Toyota', 'Hyundai', 'Tata', 'Kia', 'Honda',
       'Land', 'MG', 'Citroen', 'Nissan', 'Renault', 'Lamborghini',
       'Volkswagen', 'Skoda', 'Mercedes-Benz', 'Volvo', 'Jeep', 'Force',
       'BMW', 'Audi', 'Rolls-Royce', 'Bajaj', 'Ferrari', 'Jaguar',
       'Datsun', 'Isuzu', 'Porsche', 'BYD', 'Mini', 'Lexus', 'Bentley',
       'Maserati', 'Aston', 'Rolls', 'Strom', 'Compass'], dtype=object)

In [16]:
df['average_price'] = round((df['starting_price']+df['ending_price'])/2,2)

In [20]:
mean_engine_displacement = df.engine_displacement.mean()
mean_tank_capacity = df.fuel_tank_capacity.mean()
df.fuel_tank_capacity = df.fuel_tank_capacity.replace(0, mean_tank_capacity)
df.engine_displacement = df.engine_displacement.replace(0, mean_engine_displacement)

In [21]:
df['fuel_efficiency'] = df['fuel_tank_capacity'] / df['engine_displacement']
# mean_fuel_efficiency = df['fuel_efficiency'].mean()
# df['fuel_efficiency'].fillna(mean_fuel_efficiency, inplace=True)
df.fuel_efficiency.unique()


array([0.02705411, 0.03283174, 0.0260989 , 0.02729754, 0.02593267,
       0.02903811, 0.03348962, 0.03085905, 0.0400534 , 0.03091061,
       0.02939212, 0.04509018, 0.02704056, 0.02556237, 0.0229837 ,
       0.03077975, 0.03014066, 0.02672011, 0.02060911, 0.0500417 ,
       0.03707415, 0.02805611, 0.03067485, 0.0247161 , 0.02747253,
       0.02502085, 0.02670227, 0.03206413, 0.04004004, 0.02802803,
       0.01781069, 0.07537688, 0.0200281 , 0.03335804, 0.01876877,
       0.02336449, 0.03004005, 0.0267335 , 0.02359787, 0.0345353 ,
       0.03337784, 0.02303095, 0.05434783, 0.0242681 , 0.02941176,
       0.05524862, 0.02672903, 0.01536422, 0.02325581, 0.03757515,
       0.01385042, 0.02556391, 0.01534885, 0.01842107, 0.03024194,
       0.00681716, 0.02520161, 0.03326613, 0.03679435, 0.02742509,
       0.03047232, 0.03336113, 0.04106159, 0.03503504, 0.02923387,
       0.02702703, 0.0173449 , 0.02952953, 0.03759398, 0.03006012,
       0.02308873, 0.02768512, 0.03245248, 0.16203704, 0.05505

In [22]:
conn = sqlite3.connect('instance/majordb.db')
df.to_sql('cars', conn, if_exists='replace', index=False)
conn.commit()
conn.close()

fetch data


In [23]:
conn = sqlite3.connect('instance/majordb.db')

fetched_data = pd.read_sql("SELECT * FROM cars", conn)
conn.close()
fetched_data.head(2)

Unnamed: 0,car_name,reviews_count,fuel_type,engine_displacement,no_cylinder,seating_capacity,transmission_type,fuel_tank_capacity,body_type,rating,starting_price,ending_price,max_torque_nm,max_torque_rpm,max_power_bhp,max_power_rp,brand,average_price,fuel_efficiency
0,Maruti Alto K10,51,Petrol,998.0,3,5,Automatic,27.0,Hatchback,4.5,399000,583000,89.0,3500,65.71,5500,Maruti,491000.0,0.027054
1,Maruti Brezza,86,Petrol,1462.0,4,5,Automatic,48.0,SUV,4.5,799000,1396000,136.8,4400,101.65,6000,Maruti,1097500.0,0.032832


In [24]:
for c in fetched_data.columns:
    print(c)
    print(fetched_data[c].unique())

car_name
['Maruti Alto K10' 'Maruti Brezza' 'Mahindra Thar' 'Mahindra XUV700'
 'Mahindra Scorpio-N' 'Toyota Fortuner' 'Hyundai Creta' 'Tata Punch'
 'Mahindra Bolero' 'Maruti Swift' 'Tata Nexon' 'Hyundai Venue'
 'Kia Seltos' 'Hyundai Tucson' 'Tata Harrier' 'Maruti Baleno'
 'Toyota Innova Crysta' 'Maruti Ertiga' 'Kia Sonet' 'Maruti Dzire'
 'Honda City 4th Generation' 'Land Rover Range Rover' 'Tata Tiago'
 'Hyundai i20' 'Mahindra XUV300' 'MG Hector' 'Tata Altroz'
 'Mahindra Scorpio Classic' 'Hyundai Verna' 'Citroen C3' 'Honda City'
 'Maruti Celerio' 'Nissan Magnite' 'Renault KWID' 'Land Rover Defender'
 'Maruti Alto 800' 'Hyundai Kona Electric' 'Tata Tigor' 'Renault Kiger'
 'Maruti S-Presso' 'MG Astor' 'Lamborghini Urus' 'Honda Amaze'
 'Renault Triber' 'Hyundai Alcazar' 'Hyundai Aura' 'Volkswagen Virtus'
 'Skoda Slavia' 'Maruti Ignis' 'Mercedes-Benz GLA' 'Volvo XC90'
 'Skoda Kushaq' 'Maruti XL6' 'Jeep Wrangler' 'Maruti Eeco' 'Force Gurkha'
 'Maruti S-Cross' 'Maruti Ciaz' 'Hyundai Santro' 

In [25]:
fetched_data.columns

Index(['car_name', 'reviews_count', 'fuel_type', 'engine_displacement',
       'no_cylinder', 'seating_capacity', 'transmission_type',
       'fuel_tank_capacity', 'body_type', 'rating', 'starting_price',
       'ending_price', 'max_torque_nm', 'max_torque_rpm', 'max_power_bhp',
       'max_power_rp', 'brand', 'average_price', 'fuel_efficiency'],
      dtype='object')

In [26]:
classification_dataset = fetched_data.drop(columns=['car_name'])

In [2]:
import category_encoders as ce
encoder = ce.OrdinalEncoder(mapping=[
    {'col': 'transmission_type', 'mapping': {'Automatic': 1, 'Manual': 2, 'Electric': 3}},
    {'col': 'body_type', 'mapping': {'Hatchback': 1, 'SUV': 2, 'MUV': 3, 'Sedan': 4, 'Hybrid': 5,
                                      'Minivan': 6, 'Pickup Truck': 7, 'Coupe': 8, 'Convertible': 9,
                                      'Luxury': 10, 'Wagon': 11}},
    {'col': 'brand', 'mapping': {'Maruti': 1, 'Mahindra': 2, 'Toyota': 3, 'Hyundai': 4, 'Tata': 5,
                                  'Kia': 6, 'Honda': 7, 'Land': 8, 'MG': 9, 'Citroen': 10,
                                  'Nissan': 11, 'Renault': 12, 'Lamborghini': 13, 'Volkswagen': 14,
                                  'Skoda': 15, 'Mercedes-Benz': 16, 'Volvo': 17, 'Jeep': 18, 'Force': 19,
                                  'BMW': 20, 'Audi': 21, 'Rolls-Royce': 22, 'Bajaj': 23, 'Ferrari': 24,
                                  'Jaguar': 25, 'Datsun': 26, 'Isuzu': 27, 'Porsche': 28, 'BYD': 29,
                                  'Mini': 30, 'Lexus': 31, 'Bentley': 32, 'Maserati': 33, 'Aston': 34,
                                  'Rolls': 35, 'Strom': 36, 'Compass': 37}}
])
encoder.fit(classification_dataset)
classification_dataset = encoder.transform(classification_dataset)

ModuleNotFoundError: No module named 'category_encoders'

In [18]:
y = classification_dataset['fuel_type']
X = classification_dataset.drop(columns=['fuel_type'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


decision_tree_clf = DecisionTreeClassifier()
random_forest_clf = RandomForestClassifier()
logistic_regression_clf = LogisticRegression()

decision_tree_clf.fit(X_train, y_train)
random_forest_clf.fit(X_train, y_train)
logistic_regression_clf.fit(X_train, y_train)


dt_pred = decision_tree_clf.predict(X_test)
rf_pred = random_forest_clf.predict(X_test)
lr_pred = logistic_regression_clf.predict(X_test)

dt_f1 = f1_score(y_test, dt_pred, average='weighted')
rf_f1 = f1_score(y_test, rf_pred, average='weighted')
lr_f1 = f1_score(y_test, lr_pred, average='weighted')

print("Decision Tree F1 Score:", dt_f1)
print("Random Forest F1 Score:", rf_f1)
print("Logistic Regression F1 Score:", lr_f1)
X.columns

Decision Tree F1 Score: 0.8434303697875688
Random Forest F1 Score: 0.8989993746091307
Logistic Regression F1 Score: 0.37475803329461865


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Index(['reviews_count', 'engine_displacement', 'no_cylinder',
       'seating_capacity', 'transmission_type', 'fuel_tank_capacity',
       'body_type', 'rating', 'starting_price', 'ending_price',
       'max_torque_nm', 'max_torque_rpm', 'max_power_bhp', 'max_power_rp',
       'brand', 'average_price', 'fuel_efficiency'],
      dtype='object')

In [19]:
pickle.dump(decision_tree_clf, open('ds1_clasi.pkl', 'wb'))

In [20]:
encoder = ce.OrdinalEncoder(mapping=[
    {'col': 'fuel_type', 'mapping': {'Petrol': 1, 'Diesel': 2, 'CNG': 3, 'Electric': 4}},
])
encoder.fit(classification_dataset)
classification_dataset = encoder.transform(classification_dataset)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

y = classification_dataset['average_price']
X = classification_dataset.drop(columns=['average_price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_y_pred = linear_model.predict(X_test)
linear_rmse = mean_squared_error(y_test, linear_y_pred, squared=False)

# Initialize and train the Random Forest Regression model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_rmse = mean_squared_error(y_test, rf_y_pred, squared=False)

# Initialize and train the Gradient Boosting Regression model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)
gb_rmse = mean_squared_error(y_test, gb_y_pred, squared=False)

# Print RMSE for each model
print("Linear Regression RMSE:", linear_rmse)
print("Random Forest Regression RMSE:", rf_rmse)
print("Gradient Boosting Regression RMSE:", gb_rmse)
X.columns




Linear Regression RMSE: 9.477020610985252e-09
Random Forest Regression RMSE: 1966668.789872338
Gradient Boosting Regression RMSE: 1949888.3514248678




Index(['reviews_count', 'fuel_type', 'engine_displacement', 'no_cylinder',
       'seating_capacity', 'transmission_type', 'fuel_tank_capacity',
       'body_type', 'rating', 'starting_price', 'ending_price',
       'max_torque_nm', 'max_torque_rpm', 'max_power_bhp', 'max_power_rp',
       'brand', 'fuel_efficiency'],
      dtype='object')

In [22]:
pickle.dump(linear_model, open('ds1_reg.pkl', 'wb'))