# Regression Models using scikit-learn

### Linear Regression

In [2]:
import seaborn as sns
import pandas as pd

data = sns.load_dataset('car_crashes')
data.head()

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA


### Preprocess the Data

#### Select features and target variable

In [3]:
X = data.drop(columns=['total'])
y = data['total']

#### One-hot encode categorical variables if any

In [4]:
X = pd.get_dummies(X, drop_first=True)

In [5]:
X

Unnamed: 0,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev_AL,abbrev_AR,abbrev_AZ,abbrev_CA,...,abbrev_SD,abbrev_TN,abbrev_TX,abbrev_UT,abbrev_VA,abbrev_VT,abbrev_WA,abbrev_WI,abbrev_WV,abbrev_WY
0,7.332,5.64,18.048,15.04,784.55,145.08,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7.421,4.525,16.29,17.014,1053.48,133.93,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,6.51,5.208,15.624,17.856,899.47,110.35,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,4.032,5.824,21.056,21.28,827.34,142.39,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4.2,3.36,10.92,10.68,878.41,165.63,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
5,5.032,3.808,10.744,12.92,835.5,139.91,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,4.968,3.888,9.396,8.856,1068.73,167.02,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,6.156,4.86,14.094,16.038,1137.87,151.48,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,2.006,1.593,5.9,5.9,1273.89,136.05,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,3.759,5.191,16.468,16.826,1160.13,144.18,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Split the Data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
X_train

Unnamed: 0,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev_AL,abbrev_AR,abbrev_AZ,abbrev_CA,...,abbrev_SD,abbrev_TN,abbrev_TX,abbrev_UT,abbrev_VA,abbrev_VT,abbrev_WA,abbrev_WI,abbrev_WV,abbrev_WY
8,2.006,1.593,5.9,5.9,1273.89,136.05,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
49,4.968,4.554,5.382,11.592,670.31,106.62,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
6,4.968,3.888,9.396,8.856,1068.73,167.02,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
47,4.452,3.498,8.692,9.116,890.03,111.62,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,4.2,3.36,10.92,10.68,878.41,165.63,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
36,6.368,5.771,18.308,18.706,881.51,178.86,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
33,6.552,5.208,15.792,13.608,708.24,127.82,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
19,5.738,4.53,13.137,12.684,661.88,96.57,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
48,8.092,6.664,23.086,20.706,992.61,152.56,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
15,2.669,3.925,15.229,13.659,649.06,114.47,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
y_train

8      5.9
49    13.8
6     10.8
47    10.6
4     12.0
36    19.9
33    16.8
19    15.1
48    23.8
15    15.7
9     17.9
16    17.8
26    21.4
44    11.3
25    16.1
11    17.5
0     18.8
45    13.6
27    14.9
34    23.9
5     13.6
29    11.6
37    12.8
1     18.1
21     8.2
2     18.6
39    11.1
35    14.1
23     9.6
41    19.4
10    15.6
22    14.1
18    20.5
50    17.4
20    12.5
7     16.2
42    19.5
14    14.5
28    14.7
38    18.2
Name: total, dtype: float64

### Standardize the Data

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
X_train

array([[-1.61624074, -1.93115252, -1.87254625, ..., -0.16012815,
        -0.16012815, -0.16012815],
       [-0.08839774, -0.14763279, -1.99882258, ...,  6.244998  ,
        -0.16012815, -0.16012815],
       [-0.08839774, -0.5487892 , -1.02030295, ..., -0.16012815,
        -0.16012815, -0.16012815],
       ...,
       [-0.78113682, -0.35784839,  0.04719517, ..., -0.16012815,
        -0.16012815, -0.16012815],
       [ 0.15455097, -0.05728224,  0.09351274, ..., -0.16012815,
        -0.16012815, -0.16012815],
       [ 2.04294842,  0.50770982,  0.94843758, ..., -0.16012815,
        -0.16012815, -0.16012815]])

In [13]:
X_test

array([[ 1.35175678,  1.54975276,  0.99280494,  0.87969515,  0.72831399,
         0.92286639, -0.16012815,  0.        , -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815,  0.        ,  0.        ,
        -0.16012815, -0.16012815,  0.        , -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
         0.        , -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815,  0.        ,  0.        , -0.16012815,  0.        ,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
         0.        , -0.16012815, -0.16012815,  1.        , -0.16012815,
         0.        , -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815],
       [ 2.03366375,  3.01162456,  2.28238362,  1.56538725, -0.1152543 ,
        -0.68856492, -0.16012815,  0.        , -0.16012815, -0.16012815,
        -0.16012815, -0.16012

###  Train and Evaluate Different Regression Models

In [14]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

#### Helper function to train and evaluate models

In [15]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

#### Initialize models

In [16]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Polynomial Regression": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42),
    "Random Forest Regression": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regression": SVR(kernel='rbf')
}

#### Train and evaluate models

In [17]:
results = {}
for name, model in models.items():
    mse, r2 = evaluate_model(model, X_train, y_train, X_test, y_test)
    results[name] = {"MSE": mse, "R^2": r2}

In [18]:
results_df = pd.DataFrame(results).T

In [19]:
results_df

Unnamed: 0,MSE,R^2
Linear Regression,3.264297,0.819512
Ridge Regression,4.764088,0.736586
Lasso Regression,2.42391,0.865978
Polynomial Regression,13.897211,0.231602
Decision Tree Regression,3.1,0.828596
Random Forest Regression,2.483351,0.862692
Support Vector Regression,12.233285,0.323603


# Now doing one by one for every model

### Common Steps: Load, Preprocess, and Split the Data

In [21]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
data = sns.load_dataset('car_crashes')
data

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA
5,13.6,5.032,3.808,10.744,12.92,835.5,139.91,CO
6,10.8,4.968,3.888,9.396,8.856,1068.73,167.02,CT
7,16.2,6.156,4.86,14.094,16.038,1137.87,151.48,DE
8,5.9,2.006,1.593,5.9,5.9,1273.89,136.05,DC
9,17.9,3.759,5.191,16.468,16.826,1160.13,144.18,FL


### Select features and target variable

In [23]:
X = data.drop(columns=['total'])
y = data['total']

###  One-hot encode categorical variables if any

In [25]:
X = pd.get_dummies(X, drop_first=True)
X

Unnamed: 0,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev_AL,abbrev_AR,abbrev_AZ,abbrev_CA,...,abbrev_SD,abbrev_TN,abbrev_TX,abbrev_UT,abbrev_VA,abbrev_VT,abbrev_WA,abbrev_WI,abbrev_WV,abbrev_WY
0,7.332,5.64,18.048,15.04,784.55,145.08,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7.421,4.525,16.29,17.014,1053.48,133.93,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,6.51,5.208,15.624,17.856,899.47,110.35,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,4.032,5.824,21.056,21.28,827.34,142.39,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4.2,3.36,10.92,10.68,878.41,165.63,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
5,5.032,3.808,10.744,12.92,835.5,139.91,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,4.968,3.888,9.396,8.856,1068.73,167.02,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,6.156,4.86,14.094,16.038,1137.87,151.48,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,2.006,1.593,5.9,5.9,1273.89,136.05,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,3.759,5.191,16.468,16.826,1160.13,144.18,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Split the dataset into training and testing sets

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Standardize the features

In [27]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
X_test

array([[ 1.35175678,  1.54975276,  0.99280494,  0.87969515,  0.72831399,
         0.92286639, -0.16012815,  0.        , -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815,  0.        ,  0.        ,
        -0.16012815, -0.16012815,  0.        , -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
         0.        , -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815,  0.        ,  0.        , -0.16012815,  0.        ,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
         0.        , -0.16012815, -0.16012815,  1.        , -0.16012815,
         0.        , -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815],
       [ 2.03366375,  3.01162456,  2.28238362,  1.56538725, -0.1152543 ,
        -0.68856492, -0.16012815,  0.        , -0.16012815, -0.16012815,
        -0.16012815, -0.16012

In [29]:
X_train

array([[-1.61624074, -1.93115252, -1.87254625, ..., -0.16012815,
        -0.16012815, -0.16012815],
       [-0.08839774, -0.14763279, -1.99882258, ...,  6.244998  ,
        -0.16012815, -0.16012815],
       [-0.08839774, -0.5487892 , -1.02030295, ..., -0.16012815,
        -0.16012815, -0.16012815],
       ...,
       [-0.78113682, -0.35784839,  0.04719517, ..., -0.16012815,
        -0.16012815, -0.16012815],
       [ 0.15455097, -0.05728224,  0.09351274, ..., -0.16012815,
        -0.16012815, -0.16012815],
       [ 2.04294842,  0.50770982,  0.94843758, ..., -0.16012815,
        -0.16012815, -0.16012815]])

# Model 1: Linear Regression

In [30]:
from sklearn.linear_model import LinearRegression

#### Initialize the model

In [31]:
linear_reg = LinearRegression()

#### Train the model

In [34]:
linear_reg.fit(X_train, y_train)

#### Make predictions

In [35]:
y_pred = linear_reg.predict(X_test)

In [36]:
y_pred

array([19.49455618, 22.88781966, 14.38767388, 15.84671344, 14.99213963,
       18.60031607, 19.09705812, 14.41097188, 23.26865421, 14.38852572,
       14.8505168 ])

#### Evaluate the model

In [37]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [40]:
print("Linear Regression")
mse

Linear Regression


3.2642972692935577

In [41]:
r2

0.8195119861156459

# Model 2: Ridge Regression

In [42]:
from sklearn.linear_model import Ridge

#### Initialize the model

In [43]:
ridge_reg = Ridge(alpha=1.0)

#### Train the model

In [45]:
ridge_reg.fit(X_train, y_train)

#### Make predictions

In [46]:
y_pred = ridge_reg.predict(X_test)

In [47]:
y_pred

array([19.41893735, 22.80944573, 12.96652096, 15.96249972, 13.62401476,
       16.26611984, 16.90477034, 12.87509419, 19.82531245, 11.39776822,
       14.57130242])

#### Evaluate the model

In [48]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [49]:
print("\nRidge Regression")
mse


Ridge Regression


4.764088469909432

In [50]:
r2

0.7365862251603721

# Model 3: Lasso Regression

In [51]:
from sklearn.linear_model import Lasso

#### Initialize the model

In [52]:
lasso_reg = Lasso(alpha=0.1)

#### Train the model

In [54]:
lasso_reg.fit(X_train, y_train)

#### Make predictions

In [55]:
y_pred = lasso_reg.predict(X_test)
y_pred

array([19.43343915, 23.33503168, 12.53313791, 15.97741644, 14.63392043,
       17.55660668, 17.67501697, 11.68550572, 22.14189875, 10.36336094,
       13.95210499])

#### Evaluate the model

In [56]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [57]:
print("\nLasso Regression")
mse


Lasso Regression


2.423910230110863

In [58]:
r2

0.8659782773517573

# Model 4: Polynomial Regression

In [59]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

#### Create polynomial features

In [60]:
poly_reg = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_reg

#### Train the model

In [62]:
poly_reg.fit(X_train, y_train)

#### Make predictions

In [63]:
y_pred = poly_reg.predict(X_test)
y_pred

array([17.82859493, 17.5254714 , 16.51046954, 16.71526367, 16.57663026,
       17.1781195 , 17.18500315, 16.10626259, 17.30953118, 15.29765449,
       16.76799527])

#### Evaluate the model

In [64]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [65]:
print("\nPolynomial Regression")
mse


Polynomial Regression


13.897210958183553

In [66]:
r2

0.23160184338319756

# Model 5: Decision Tree Regression

In [67]:
from sklearn.tree import DecisionTreeRegressor

#### Initialize the model

In [68]:
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg

#### Train the model

In [69]:
tree_reg.fit(X_train, y_train)

#### Make predictions


In [71]:
y_pred = tree_reg.predict(X_test)

In [72]:
y_pred

array([18.2, 23.8, 12.8, 17.4, 17.8, 19.9, 17.9, 11.1, 23.9,  8.2, 13.8])

#### Evaluate the model

In [73]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [74]:
print("\nDecision Tree Regression")
mse


Decision Tree Regression


3.0999999999999996

In [75]:
r2

0.8285962346920124

# Model 6: Random Forest Regression

In [76]:
from sklearn.ensemble import RandomForestRegressor

#### Initialize the model

In [77]:
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)

#### Train the model

In [78]:
forest_reg.fit(X_train, y_train)

#### Make predictions

In [79]:
y_pred = forest_reg.predict(X_test)

In [80]:
y_pred

array([19.656, 21.714, 12.342, 16.364, 15.956, 16.997, 18.032, 10.877,
       21.677,  9.928, 13.816])

#### Evaluate the model

In [81]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [82]:
print("\nRandom Forest Regression")
mse


Random Forest Regression


2.4833508181818016

In [83]:
r2

0.8626917158654733

# Model 7: Support Vector Regression (SVR)

In [84]:
from sklearn.svm import SVR

#### Initialize the model

In [85]:
svr_reg = SVR(kernel='rbf')

#### Train the model

In [86]:
svr_reg.fit(X_train, y_train)

#### Make predictions

In [87]:
y_pred = svr_reg.predict(X_test)
y_pred

array([17.23988206, 17.81409532, 14.32055631, 15.59889688, 14.68522935,
       15.71123864, 16.07185409, 14.43259865, 17.03552629, 13.94556875,
       15.07334029])

#### Evaluate the model

In [88]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [89]:
print("\nSupport Vector Regression")
mse


Support Vector Regression


12.233284618927037

In [90]:
r2

0.3236028884618114