# Walkthrough
1. **Cleaning Dataset**
    * Removing unnecessary columns.
    * Organizing columns.
    * Handling missing values.
    * Organizing values in rows.
    * Converting data types.
2. **Converting Dataset**   
    * Converting categorical object columns into numerical columns.
    * Separating numerical columns into a new dataset.
    * Merging converted dataset with the new dataset.
3. **Splitting Dataset**
    * Split merged dataset into X & y.
    * Split X & y into train & test.
4. **Model Creation**
    * Choosing the first estimator/algorithm for the problem.
    * Create the model.
    * Fit the model.
    * Score the model.
    * Choosing other estimators/algorithms for the problem.
    * Comparing the scores and selecting the best model for the problem.

##### Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn

##### Pandas options

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

##### Sklearn options

In [3]:
from sklearn.model_selection import train_test_split as t_t_s

##### Dataset

In [4]:
dataset = pd.read_csv('dataset.csv')
dataset.head()

Unnamed: 0,Manufacturer,Model,Sales_in_thousands,__year_resale_value,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Wheelbase,Width,Length,Curb_weight,Fuel_capacity,Fuel_efficiency,Latest_Launch,Power_perf_factor
0,Acura,Integra,16.919,16.36,Passenger,21.5,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,2/2/2012,58.28015
1,Acura,TL,39.384,19.875,Passenger,28.4,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,6/3/2011,91.370778
2,Acura,CL,14.114,18.225,Passenger,,3.2,225.0,106.9,70.6,192.0,3.47,17.2,26.0,1/4/2012,
3,Acura,RL,8.588,29.725,Passenger,42.0,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,3/10/2011,91.389779
4,Audi,A4,20.397,22.255,Passenger,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,10/8/2011,62.777639


## Cleaning Dataset

In [5]:
# Removing unnecessary columns

dataset.drop('Latest_Launch', axis='columns', inplace=True)

In [6]:
# Organizing columns

dataset.columns = dataset.columns.str.lower()
dataset.columns = dataset.columns.str.replace(' ', '_')

dataset.columns

Index(['manufacturer', 'model', 'sales_in_thousands', '__year_resale_value',
       'vehicle_type', 'price_in_thousands', 'engine_size', 'horsepower',
       'wheelbase', 'width', 'length', 'curb_weight', 'fuel_capacity',
       'fuel_efficiency', 'power_perf_factor'],
      dtype='object')

In [7]:
# Handling missing values

dataset.isna().sum()

manufacturer            0
model                   0
sales_in_thousands      0
__year_resale_value    36
vehicle_type            0
price_in_thousands      2
engine_size             1
horsepower              1
wheelbase               1
width                   1
length                  1
curb_weight             2
fuel_capacity           1
fuel_efficiency         3
power_perf_factor       2
dtype: int64

In [8]:
# Dropping all rows contains (Nan) values because the number of missing values are so low

dataset.dropna(axis='rows', inplace=True)

dataset.isna().sum()

manufacturer           0
model                  0
sales_in_thousands     0
__year_resale_value    0
vehicle_type           0
price_in_thousands     0
engine_size            0
horsepower             0
wheelbase              0
width                  0
length                 0
curb_weight            0
fuel_capacity          0
fuel_efficiency        0
power_perf_factor      0
dtype: int64

In [9]:
# Organizing values in rows

dataset.dtypes

manufacturer            object
model                   object
sales_in_thousands     float64
__year_resale_value    float64
vehicle_type            object
price_in_thousands     float64
engine_size            float64
horsepower             float64
wheelbase              float64
width                  float64
length                 float64
curb_weight            float64
fuel_capacity          float64
fuel_efficiency        float64
power_perf_factor      float64
dtype: object

In [10]:
# Checking if (manufacturer) is categorised

dataset['manufacturer'].value_counts()

# Categorised

Ford          10
Dodge          9
Chevrolet      8
Toyota         8
Mitsubishi     7
Mercury        6
Chrysler       5
Honda          5
Pontiac        5
Nissan         5
Volkswagen     5
Mercedes-B     4
Oldsmobile     4
Buick          4
Saturn         3
Porsche        3
Plymouth       3
Acura          3
Audi           3
Jeep           3
Hyundai        3
Cadillac       3
Lexus          3
Lincoln        2
BMW            2
Infiniti       1
Name: manufacturer, dtype: int64

In [11]:
# Checking if (model) is categorised

dataset['model'].unique()

# Categorised

array(['Integra', 'TL', 'RL', 'A4', 'A6', 'A8', '328i', '528i', 'Century',
       'Regal', 'Park Avenue', 'LeSabre', 'DeVille', 'Eldorado', 'Catera',
       'Cavalier', 'Malibu', 'Lumina', 'Monte Carlo', 'Camaro',
       'Corvette', 'Prizm', 'Metro', 'Sebring Coupe', 'Sebring Conv.',
       'Concorde', 'Cirrus', 'LHS', 'Neon', 'Avenger', 'Stratus', 'Viper',
       'Ram Pickup', 'Ram Wagon', 'Ram Van', 'Dakota', 'Caravan',
       'Escort', 'Mustang', 'Contour', 'Taurus', 'Crown Victoria',
       'Explorer', 'Windstar', 'Expedition', 'Ranger', 'F-Series',
       'Civic', 'Accord', 'CR-V', 'Passport', 'Odyssey', 'Accent',
       'Elantra', 'Sonata', 'I30', 'Wrangler', 'Cherokee',
       'Grand Cherokee', 'ES300', 'GS300', 'LS400', 'Continental',
       'Town car', 'Mirage', 'Eclipse', 'Galant', 'Diamante', '3000GT',
       'Montero', 'Montero Sport', 'Mystique', 'Cougar', 'Sable',
       'Grand Marquis', 'Mountaineer', 'Villager', 'C-Class', 'E-Class',
       'S-Class', 'SL-Class', 'Sentr

In [12]:
# Checking if (vehicle_type) is categorised

dataset['vehicle_type'].value_counts()

# Categorised

Passenger    88
Car          29
Name: vehicle_type, dtype: int64

## Converting Dataset

In [13]:
# Converting categorical object columns into numerical columns

conv_dataset = pd.get_dummies(dataset[['manufacturer', 'model', 'vehicle_type']])

conv_dataset.head(1)

Unnamed: 0,manufacturer_Acura,manufacturer_Audi,manufacturer_BMW,manufacturer_Buick,manufacturer_Cadillac,manufacturer_Chevrolet,manufacturer_Chrysler,manufacturer_Dodge,manufacturer_Ford,manufacturer_Honda,manufacturer_Hyundai,manufacturer_Infiniti,manufacturer_Jeep,manufacturer_Lexus,manufacturer_Lincoln,manufacturer_Mercedes-B,manufacturer_Mercury,manufacturer_Mitsubishi,manufacturer_Nissan,manufacturer_Oldsmobile,manufacturer_Plymouth,manufacturer_Pontiac,manufacturer_Porsche,manufacturer_Saturn,manufacturer_Toyota,manufacturer_Volkswagen,model_3000GT,model_328i,model_4Runner,model_528i,model_A4,model_A6,model_A8,model_Accent,model_Accord,model_Altima,model_Aurora,model_Avalon,model_Avenger,model_Bonneville,model_Boxter,model_Bravada,model_Breeze,model_C-Class,model_CR-V,model_Cabrio,model_Camaro,model_Camry,model_Caravan,model_Carrera Cabrio,model_Carrera Coupe,model_Catera,model_Cavalier,model_Celica,model_Century,model_Cherokee,model_Cirrus,model_Civic,model_Concorde,model_Continental,model_Contour,model_Corolla,model_Corvette,model_Cougar,model_Crown Victoria,model_Cutlass,model_Dakota,model_DeVille,model_Diamante,model_E-Class,model_ES300,model_Eclipse,model_Elantra,model_Eldorado,model_Escort,model_Expedition,model_Explorer,model_F-Series,model_Firebird,model_GS300,model_GTI,model_Galant,model_Golf,model_Grand Am,model_Grand Cherokee,model_Grand Marquis,model_Grand Prix,model_I30,model_Integra,model_Jetta,model_LHS,model_LS400,model_Land Cruiser,model_LeSabre,model_Lumina,model_Malibu,model_Maxima,model_Metro,model_Mirage,model_Monte Carlo,model_Montero,model_Montero Sport,model_Mountaineer,model_Mustang,model_Mystique,model_Neon,model_Odyssey,model_Park Avenue,model_Passat,model_Passport,model_Pathfinder,model_Prizm,model_Quest,model_RAV4,model_RL,model_Ram Pickup,model_Ram Van,model_Ram Wagon,model_Ranger,model_Regal,model_S-Class,model_SC,model_SL,model_SL-Class,model_SW,model_Sable,model_Sebring Conv.,model_Sebring Coupe,model_Sentra,model_Silhouette,model_Sonata,model_Stratus,model_Sunfire,model_TL,model_Tacoma,model_Taurus,model_Town car,model_Villager,model_Viper,model_Voyager,model_Windstar,model_Wrangler,vehicle_type_Car,vehicle_type_Passenger
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [14]:
# Dropping categorical object columns from dataset

dataset.drop(['manufacturer', 'model', 'vehicle_type'], axis='columns', inplace=True)

dataset.columns

Index(['sales_in_thousands', '__year_resale_value', 'price_in_thousands',
       'engine_size', 'horsepower', 'wheelbase', 'width', 'length',
       'curb_weight', 'fuel_capacity', 'fuel_efficiency', 'power_perf_factor'],
      dtype='object')

In [15]:
# Merging two datasets

final_dataset = pd.concat([conv_dataset, dataset], axis='columns')

final_dataset.head()

Unnamed: 0,manufacturer_Acura,manufacturer_Audi,manufacturer_BMW,manufacturer_Buick,manufacturer_Cadillac,manufacturer_Chevrolet,manufacturer_Chrysler,manufacturer_Dodge,manufacturer_Ford,manufacturer_Honda,manufacturer_Hyundai,manufacturer_Infiniti,manufacturer_Jeep,manufacturer_Lexus,manufacturer_Lincoln,manufacturer_Mercedes-B,manufacturer_Mercury,manufacturer_Mitsubishi,manufacturer_Nissan,manufacturer_Oldsmobile,manufacturer_Plymouth,manufacturer_Pontiac,manufacturer_Porsche,manufacturer_Saturn,manufacturer_Toyota,manufacturer_Volkswagen,model_3000GT,model_328i,model_4Runner,model_528i,model_A4,model_A6,model_A8,model_Accent,model_Accord,model_Altima,model_Aurora,model_Avalon,model_Avenger,model_Bonneville,model_Boxter,model_Bravada,model_Breeze,model_C-Class,model_CR-V,model_Cabrio,model_Camaro,model_Camry,model_Caravan,model_Carrera Cabrio,model_Carrera Coupe,model_Catera,model_Cavalier,model_Celica,model_Century,model_Cherokee,model_Cirrus,model_Civic,model_Concorde,model_Continental,model_Contour,model_Corolla,model_Corvette,model_Cougar,model_Crown Victoria,model_Cutlass,model_Dakota,model_DeVille,model_Diamante,model_E-Class,model_ES300,model_Eclipse,model_Elantra,model_Eldorado,model_Escort,model_Expedition,model_Explorer,model_F-Series,model_Firebird,model_GS300,model_GTI,model_Galant,model_Golf,model_Grand Am,model_Grand Cherokee,model_Grand Marquis,model_Grand Prix,model_I30,model_Integra,model_Jetta,model_LHS,model_LS400,model_Land Cruiser,model_LeSabre,model_Lumina,model_Malibu,model_Maxima,model_Metro,model_Mirage,model_Monte Carlo,model_Montero,model_Montero Sport,model_Mountaineer,model_Mustang,model_Mystique,model_Neon,model_Odyssey,model_Park Avenue,model_Passat,model_Passport,model_Pathfinder,model_Prizm,model_Quest,model_RAV4,model_RL,model_Ram Pickup,model_Ram Van,model_Ram Wagon,model_Ranger,model_Regal,model_S-Class,model_SC,model_SL,model_SL-Class,model_SW,model_Sable,model_Sebring Conv.,model_Sebring Coupe,model_Sentra,model_Silhouette,model_Sonata,model_Stratus,model_Sunfire,model_TL,model_Tacoma,model_Taurus,model_Town car,model_Villager,model_Viper,model_Voyager,model_Windstar,model_Wrangler,vehicle_type_Car,vehicle_type_Passenger,sales_in_thousands,__year_resale_value,price_in_thousands,engine_size,horsepower,wheelbase,width,length,curb_weight,fuel_capacity,fuel_efficiency,power_perf_factor
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,16.919,16.36,21.5,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,58.28015
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,39.384,19.875,28.4,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,91.370778
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8.588,29.725,42.0,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,91.389779
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,20.397,22.255,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,62.777639
5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,18.78,23.555,33.95,2.8,200.0,108.7,76.1,192.0,3.561,18.5,22.0,84.565105


## Splitting Dataset

In [16]:
# Splitting final_dataset into X & y

X = final_dataset.drop(['price_in_thousands'], axis='columns')
y = final_dataset['price_in_thousands']

In [17]:
# Converting X & y into dataframe

dfX = pd.DataFrame(X)
dfy = pd.DataFrame(y)

dfX.columns, dfy.columns

(Index(['manufacturer_Acura', 'manufacturer_Audi', 'manufacturer_BMW',
        'manufacturer_Buick', 'manufacturer_Cadillac', 'manufacturer_Chevrolet',
        'manufacturer_Chrysler', 'manufacturer_Dodge', 'manufacturer_Ford',
        'manufacturer_Honda',
        ...
        '__year_resale_value', 'engine_size', 'horsepower', 'wheelbase',
        'width', 'length', 'curb_weight', 'fuel_capacity', 'fuel_efficiency',
        'power_perf_factor'],
       dtype='object', length=155),
 Index(['price_in_thousands'], dtype='object'))

In [18]:
# Splitting X & y into train & test

X_train, X_test, y_train, y_test = t_t_s(X, y, test_size=0.2)

len(X_train), len(X_test), len(y_train), len(y_test)

(93, 24, 93, 24)

## Model Creation

In [33]:
# Choosing the first estimators/algorithms for the problem (Regressor Problem)

from sklearn.svm import SVR

# Create the model

model = SVR()

# Fit the model

model.fit(X_train, y_train)

# Score the model

SVR_model = model.score(X_test, y_test)
SVR_model

0.4675129277923924

In [34]:
# Choosing another estimators/algorithms for the problem (Regressor Problem)

from sklearn.linear_model import Ridge

# Creat the model

model = Ridge()

# Fit the model

model.fit(X_train, y_train)

# Score the model

Ridge_model = model.score(X_test, y_test)
Ridge_model

0.999897887581695

In [36]:
# Choosing another estimator/algorithm for the problem (Regressor Problem)

from sklearn.ensemble import RandomForestRegressor

# Creat the model

model = RandomForestRegressor()

# Fit the model

model.fit(X_train, y_train)

# Score the model

RandomForestRegressor_model = model.score(X_test, y_test)
RandomForestRegressor_model

0.822253511577268

In [37]:
# Comparing the scores and selecting the best model for the problem

best_model = Ridge_model > RandomForestRegressor_model > SVR_model

best_model

True