## SUPERVISED LEARNING: INTRODUCTION TO REGRESSION ANALYSIS

### Step #1 Importing the required libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn
import sklearn

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

### Step#2 Loading the dataset

In [2]:
# TODO: Get the datset ./AI_Invasion_In-Class_Dataset.xlsx form your AI Invasion
# Study Pack
# Note: You can use pandas read_excel to read file with xlsx format

df = pd.read_excel("data/Car.xlsx")

df.head()

Unnamed: 0,Location,Maker,Model,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,GLA 250,2015,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,Accent,2013,Red,1.55,Nigerian Used,
2,Lagos,Lexus,GX 460 Premium,2011,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,ES 350,2011,Gray,4.95,Foreign Used,
4,Ibadan,Toyota,Verso 1.6,2009,Silver,1.69,Nigerian Used,118906.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Location            4487 non-null   object 
 1   Maker               4487 non-null   object 
 2   Model               4487 non-null   object 
 3   Year                4487 non-null   int64  
 4   Colour              4487 non-null   object 
 5   Amount (Million ₦)  4487 non-null   float64
 6   Type                4487 non-null   object 
 7   Distance_Km         2932 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 280.6+ KB


In [4]:
df.describe()

Unnamed: 0,Year,Amount (Million ₦),Distance_Km
count,4487.0,4487.0,2932.0
mean,2011.09561,11.309795,101038.3
std,4.823362,20.585915,115091.4
min,1982.0,0.42,1.0
25%,2008.0,3.6,52378.5
50%,2011.0,5.7,79000.0
75%,2014.0,12.0,109939.2
max,2022.0,454.0,1785448.0


### Step#3 Clean the dataset

In [5]:
df.columns

Index(['Location', 'Maker', 'Model', 'Year', 'Colour', 'Amount (Million ₦)',
       'Type', 'Distance_Km'],
      dtype='object')

In [6]:
# Check for missing value
df.isnull().sum()

Location                 0
Maker                    0
Model                    0
Year                     0
Colour                   0
Amount (Million ₦)       0
Type                     0
Distance_Km           1555
dtype: int64

In [7]:
# fill up missing values in Distance_Km will the mean
mean_value = df["Distance_Km"].mean()
print(mean_value)

df["Distance_Km"].fillna(mean_value, inplace=True)

101038.32128240108


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Distance_Km"].fillna(mean_value, inplace=True)


In [8]:
# Check and make sure all missing valuen have been filled
df.isnull().sum()

Location              0
Maker                 0
Model                 0
Year                  0
Colour                0
Amount (Million ₦)    0
Type                  0
Distance_Km           0
dtype: int64

In [9]:
# The main of this section is to rename the different
# class in our categorigal feature that were not properly named.
# or chanage the data type of a column

cat_features = {
    "Location",
    "Model",
    "Maker",
    "Year",
    "Colour",
    "Type",
}

for cat_feature in cat_features:
  print(cat_feature, df[cat_feature].unique(), sep=":")
  print("#"*50)

Colour:['Brown' 'Red' 'White' 'Gray' 'Silver' 'Black' 'Blue' 'Gold' 'Green'
 'Beige' 'Purple' 'Orange' 'Burgandy' 'Ivory' 'Pink' 'Pearl' 'Yellow'
 'Luury' 'Teal']
##################################################
Type:['Foreign Used' 'Nigerian Used' 'Brand New']
##################################################
Maker:['Mercedes-Benz' 'Hyundai' 'Lexus' 'Toyota' 'Mazda' 'Honda' 'Land Rover'
 'Porsche' 'Acura' 'Nissan' 'Pontiac' 'Ford' 'Jeep' 'Kia' 'Peugeot' 'BMW'
 'Mitsubishi' 'Dodge' 'Chevrolet' 'Scion' 'Audi' 'Infiniti' 'Mini'
 'Volkswagen' 'Suzuki' 'Chrysler' 'Volvo' 'Rolls-Royce' 'JAC' 'Subaru'
 'Renault' 'GMC' 'Rover' 'IVM' 'Bentley' 'Opel' 'Lincoln' 'Hummer'
 'Saturn' 'Cadillac' 'Lamborghini' 'Buick' 'Smart' 'Jaguar' 'Ferrari'
 'Tata' 'Skoda']
##################################################
Location:['Abuja' 'Lagos' 'Ibadan']
##################################################
Model:['GLA 250' 'Accent' 'GX 460 Premium' 'ES 350' 'Verso 1.6' 'Corolla 1.8 LE'
 'E350' 'GL-Class' 'R

In [10]:
# Drop the Model feature
df.drop("Model", axis=1, inplace=True)
df.head()

Unnamed: 0,Location,Maker,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,2015,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,2013,Red,1.55,Nigerian Used,101038.321282
2,Lagos,Lexus,2011,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,2011,Gray,4.95,Foreign Used,101038.321282
4,Ibadan,Toyota,2009,Silver,1.69,Nigerian Used,118906.0


In [11]:
# Label Encoding
cat_features = ["Location","Maker","Year","Colour","Type"]

for value in cat_features:
  df[f"{value}_cat"] = df[value].astype('category')
  df[f"{value}_cat"] = df[f"{value}_cat"].cat.codes


# Read more on Pandas get_dummies

df.head()

Unnamed: 0,Location,Maker,Year,Colour,Amount (Million ₦),Type,Distance_Km,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat
0,Abuja,Mercedes-Benz,2015,Brown,14.5,Foreign Used,50000.0,0,26,22,3,1
1,Abuja,Hyundai,2013,Red,1.55,Nigerian Used,101038.321282,0,14,20,14,2
2,Lagos,Lexus,2011,White,14.0,Foreign Used,85000.0,2,23,18,17,1
3,Lagos,Lexus,2011,Gray,4.95,Foreign Used,101038.321282,2,23,18,6,1
4,Ibadan,Toyota,2009,Silver,1.69,Nigerian Used,118906.0,1,44,16,15,2


In [12]:
# Drop the reductant features since Label encoding have been done
df.drop(["Location","Maker","Year","Colour", "Type"], axis=1, inplace=True)
df.head()

Unnamed: 0,Amount (Million ₦),Distance_Km,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat
0,14.5,50000.0,0,26,22,3,1
1,1.55,101038.321282,0,14,20,14,2
2,14.0,85000.0,2,23,18,17,1
3,4.95,101038.321282,2,23,18,6,1
4,1.69,118906.0,1,44,16,15,2


### Step#4 Perform data segmentation

In [13]:
y = df["Amount (Million ₦)"] # Target
X = df.drop("Amount (Million ₦)", axis=1)

In [14]:
X

Unnamed: 0,Distance_Km,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat
0,50000.000000,0,26,22,3,1
1,101038.321282,0,14,20,14,2
2,85000.000000,2,23,18,17,1
3,101038.321282,2,23,18,6,1
4,118906.000000,1,44,16,15,2
...,...,...,...,...,...,...
4482,90282.000000,2,23,13,2,1
4483,85000.000000,2,23,14,2,1
4484,65214.000000,0,26,21,7,1
4485,45000.000000,2,23,27,1,1


In [15]:
y

0       14.50
1        1.55
2       14.00
3        4.95
4        1.69
        ...  
4482     4.60
4483     4.50
4484    10.45
4485    31.00
4486    14.00
Name: Amount (Million ₦), Length: 4487, dtype: float64

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state= 42)

### Step#5 Load your data into the Linear Regression model i.e Train your model

In [18]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## Step#6 Make predictions

In [19]:
reg.predict(X_test)

array([-10.01035516,   4.49714596,   4.92166264, ...,  39.46491808,
         1.76405796,  24.09943329], shape=(1347,))

## Step#7 Evaluate your model

In [20]:
from sklearn.metrics import mean_absolute_error

# we are using mean_absolute_error because
# this is a regression model

y_pred_LR = reg.predict(X_test)

print("MAE",mean_absolute_error(y_test,y_pred_LR))

MAE 7.6174215959735


# Other Machine Learning Algorithms

## Decision Tree

In [21]:
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
y_pred_DT = dt_reg.predict(X_test)

print("MAE",mean_absolute_error(y_test,y_pred_DT))

MAE 4.933693540760774


## SVM

In [22]:
from sklearn.svm import SVR

sv_reg = SVR()
sv_reg.fit(X_train, y_train)
y_pred_SVM = sv_reg.predict(X_test)
print("MAE",mean_absolute_error(y_test,y_pred_SVM))

MAE 7.021686771404679


## Random Forest (Activity)

In [23]:
from sklearn.ensemble import RandomForestRegressor

rfR = RandomForestRegressor()
rfR.fit(X_train, y_train)
y_pred_RF = rfR.predict(X_test)
print("MAE",mean_absolute_error(y_test,y_pred_RF))

MAE 4.437597806745424


In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [25]:
results = pd.DataFrame({
    "Model": ["Linear Regression", "Decision Tree", "SVM","Random Forest"],
    "MAE": [
        mean_absolute_error(y_test, y_pred_LR),
        mean_absolute_error(y_test, y_pred_DT),
        mean_absolute_error(y_test, y_pred_SVM),
        mean_absolute_error(y_test, y_pred_RF)
    ],
    "RMSE": [
        np.sqrt(mean_squared_error(y_test, y_pred_)),
        np.sqrt(mean_squared_error(y_test, y_pred)),
        np.sqrt(mean_squared_error(y_test, y_pred)),
        np.sqrt(mean_squared_error(y_test, y_pred))
    ],
    "R2 Score": [
        r2_score(y_test, y_pred),
        r2_score(y_test, y_pred),
         r2_score(y_test, y_pred),
        r2_score(y_test, y_pred)
    ]
})

print("\n===== Model Comparison Table =====")
print(results)


NameError: name 'y_pred_' is not defined

In [29]:
import joblib

joblib.dump(rfR, "car.pkl")
print("Model saved!")


Model saved!


In [30]:
# Sample input (use values based on your dataset)
sample_data = {
    "Distance_Km": [100],
    "Location_cat": [2],
    "Maker_cat": [23],
    "Year_cat": [10],
    "Colour_cat": [1],
    "Type_cat": [1]
}

sample_df = pd.DataFrame(sample_data)
sample_df


Unnamed: 0,Distance_Km,Location_cat,Maker_cat,Year_cat,Colour_cat,Type_cat
0,100,2,23,10,1,1


In [31]:
prediction = rfR.predict(sample_df)
print("Predicted Amount (Million ₦):", prediction[0])


Predicted Amount (Million ₦): 3.451750000000002


In [32]:
print("Decision Tree Prediction:", dt_reg.predict(sample_df)[0])
print("SVM Prediction:", sv_reg.predict(sample_df)[0])


Decision Tree Prediction: 3.3
SVM Prediction: 21.277299889606
