In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
base = "/content/drive/MyDrive/auto-imports-regression-eda"
clean_csv = f"{base}/data/auto_imports_clean.csv"

In [14]:
# --- Libraries ---
import pandas as pd
import statsmodels.api as sm

In [15]:
# --- Load cleaned data ---
df2 = pd.read_csv(clean_csv)
print("Data shape:", df2.shape)

Data shape: (195, 15)


In [16]:
# --- Ensure all features are numeric ---
non_numeric_cols = df2.select_dtypes(include=['object']).columns
print("Non-numeric columns to drop:", list(non_numeric_cols))

df2 = df2.drop(columns=non_numeric_cols, errors='ignore')
df2 = df2.apply(pd.to_numeric, errors='coerce')
df2 = df2.dropna()
print("Final numeric dataset shape:", df2.shape)


Non-numeric columns to drop: []
Final numeric dataset shape: (195, 15)


In [17]:
# --- Convert everything strictly to float numpy arrays ---
df2 = df2.apply(pd.to_numeric, errors='coerce')
df2 = df2.dropna()

print("\nData types after conversion:\n", df2.dtypes)
print("Shape:", df2.shape)


Data types after conversion:
 wheel_base       float64
length           float64
width            float64
heights          float64
curb_weight        int64
engine_size        int64
bore             float64
stroke           float64
comprassion      float64
horse_power      float64
peak_rpm         float64
city_mpg           int64
highway_mpg        int64
price              int64
fuel_type_gas       bool
dtype: object
Shape: (195, 15)


In [18]:
# --- Define target and predictors ---
y = df2['price'].to_numpy(dtype=float)
X = df2.drop(columns=['price']).to_numpy(dtype=float)

In [19]:
# --- Add constant manually ---
X = sm.add_constant(X)

In [20]:
# --- Fit model ---
model1 = sm.OLS(y, X).fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.849
Method:                 Least Squares   F-statistic:                     78.89
Date:                Fri, 17 Oct 2025   Prob (F-statistic):           5.84e-69
Time:                        18:06:48   Log-Likelihood:                -1838.5
No. Observations:                 195   AIC:                             3707.
Df Residuals:                     180   BIC:                             3756.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -4.45e+04   1.84e+04     -2.419      0.0

Creating Model 2 to drop all the  variables which are not significant, not having 90% confidence level.

In [21]:
columns_dropped = ['city_mpg', 'highway_mpg', 'bore', 'wheel_base', 'length', 'curb_weight', 'horse_power', 'comprassion']

df3 = df2.drop(columns=columns_dropped, axis=1)
df3.info

In [22]:
df3.head()

Unnamed: 0,width,heights,engine_size,stroke,peak_rpm,price,fuel_type_gas
0,64.1,48.8,130,2.68,5000.0,13495,True
1,64.1,48.8,130,2.68,5000.0,16500,True
2,65.5,52.4,152,3.47,5000.0,16500,True
3,66.2,54.3,109,3.4,5500.0,13950,True
4,66.4,54.3,136,3.4,5500.0,17450,True


In [24]:
y = df3['price'].to_numpy(dtype=float)
X = df3.drop(columns=['price']).to_numpy(dtype=float)

X = sm.add_constant(X)
model2 = sm.OLS(y, X).fit()

print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.848
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     174.8
Date:                Fri, 17 Oct 2025   Prob (F-statistic):           4.02e-74
Time:                        18:11:02   Log-Likelihood:                -1846.4
No. Observations:                 195   AIC:                             3707.
Df Residuals:                     188   BIC:                             3730.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.288e+04   1.14e+04     -5.526      0.0

In [25]:
print("\nAdjusted R² — Model 1:", round(model1.rsquared_adj,4))
print("Adjusted R² — Model 2:", round(model2.rsquared_adj,4))


Adjusted R² — Model 1: 0.849
Adjusted R² — Model 2: 0.8432
