In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Disable low_memory option
data = pd.read_csv('vehicles.csv', low_memory=False)

In [3]:
print(data.head())

   barrels08  barrelsA08  charge120  charge240  city08  city08U  cityA08  \
0  14.167143         0.0        0.0        0.0      19      0.0        0   
1  27.046364         0.0        0.0        0.0       9      0.0        0   
2  11.018889         0.0        0.0        0.0      23      0.0        0   
3  27.046364         0.0        0.0        0.0      10      0.0        0   
4  15.658421         0.0        0.0        0.0      17      0.0        0   

   cityA08U  cityCD  cityE  ...  mfrCode  c240Dscr  charge240b  c240bDscr  \
0       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
1       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
2       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
3       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
4       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   

                      createdOn                    modifiedOn  startStop  \
0  T

In [4]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46839 entries, 0 to 46838
Data columns (total 84 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barrels08        46839 non-null  float64
 1   barrelsA08       46839 non-null  float64
 2   charge120        46839 non-null  float64
 3   charge240        46839 non-null  float64
 4   city08           46839 non-null  int64  
 5   city08U          46839 non-null  float64
 6   cityA08          46839 non-null  int64  
 7   cityA08U         46839 non-null  float64
 8   cityCD           46839 non-null  float64
 9   cityE            46839 non-null  float64
 10  cityUF           46839 non-null  float64
 11  co2              46839 non-null  int64  
 12  co2A             46839 non-null  int64  
 13  co2TailpipeAGpm  46839 non-null  float64
 14  co2TailpipeGpm   46839 non-null  float64
 15  comb08           46839 non-null  int64  
 16  comb08U          46839 non-null  float64
 17  combA08     

In [5]:
data.describe()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,UCity,UCityA,UHighway,UHighwayA,year,youSaveSpend,charge240b,phevCity,phevHwy,phevComb
count,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,...,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0
mean,15.254678,0.190728,0.0,0.137831,19.350755,8.321708,0.878819,0.740117,0.000501,0.800725,...,24.618617,1.170996,35.637203,0.961148,2004.014902,-5538.5416,0.01566,0.298512,0.301544,0.298533
std,4.395047,0.978736,0.0,1.118495,11.026435,14.2998,6.690077,6.476217,0.036873,6.002955,...,15.747436,9.455378,14.200534,6.368523,12.394946,4428.850517,0.323361,3.940135,3.815051,3.858476
min,0.047081,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1984.0,-39000.0,0.0,0.0,0.0,0.0
25%,12.39625,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,...,18.6024,0.0,28.1,0.0,1992.0,-8250.0,0.0,0.0,0.0,0.0
50%,14.8755,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,...,22.0,0.0,33.8,0.0,2005.0,-5500.0,0.0,0.0,0.0,0.0
75%,17.500588,0.0,0.0,0.0,21.0,17.27275,0.0,0.0,0.0,0.0,...,26.6,0.0,39.8986,0.0,2015.0,-2750.0,0.0,0.0,0.0,0.0
max,42.501429,16.528333,0.0,19.0,153.0,155.8242,145.0,145.0835,5.35,122.0,...,224.8,207.2622,187.1,173.1436,2024.0,7000.0,9.6,97.0,81.0,89.0


In [6]:
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

Missing Values:
barrels08         0
barrelsA08        0
charge120         0
charge240         0
city08            0
              ...  
modifiedOn        0
startStop     31689
phevCity          0
phevHwy           0
phevComb          0
Length: 84, dtype: int64


In [7]:
duplicates = data.duplicated()
print("Number of Duplicate Rows:", duplicates.sum())

Number of Duplicate Rows: 0


In [8]:
data.head(5)

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,14.167143,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,27.046364,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,11.018889,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,27.046364,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,15.658421,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [9]:
print(data.head())

   barrels08  barrelsA08  charge120  charge240  city08  city08U  cityA08  \
0  14.167143         0.0        0.0        0.0      19      0.0        0   
1  27.046364         0.0        0.0        0.0       9      0.0        0   
2  11.018889         0.0        0.0        0.0      23      0.0        0   
3  27.046364         0.0        0.0        0.0      10      0.0        0   
4  15.658421         0.0        0.0        0.0      17      0.0        0   

   cityA08U  cityCD  cityE  ...  mfrCode  c240Dscr  charge240b  c240bDscr  \
0       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
1       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
2       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
3       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   
4       0.0     0.0    0.0  ...      NaN       NaN         0.0        NaN   

                      createdOn                    modifiedOn  startStop  \
0  T

In [10]:
data.describe()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,UCity,UCityA,UHighway,UHighwayA,year,youSaveSpend,charge240b,phevCity,phevHwy,phevComb
count,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,...,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0,46839.0
mean,15.254678,0.190728,0.0,0.137831,19.350755,8.321708,0.878819,0.740117,0.000501,0.800725,...,24.618617,1.170996,35.637203,0.961148,2004.014902,-5538.5416,0.01566,0.298512,0.301544,0.298533
std,4.395047,0.978736,0.0,1.118495,11.026435,14.2998,6.690077,6.476217,0.036873,6.002955,...,15.747436,9.455378,14.200534,6.368523,12.394946,4428.850517,0.323361,3.940135,3.815051,3.858476
min,0.047081,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1984.0,-39000.0,0.0,0.0,0.0,0.0
25%,12.39625,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,...,18.6024,0.0,28.1,0.0,1992.0,-8250.0,0.0,0.0,0.0,0.0
50%,14.8755,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,...,22.0,0.0,33.8,0.0,2005.0,-5500.0,0.0,0.0,0.0,0.0
75%,17.500588,0.0,0.0,0.0,21.0,17.27275,0.0,0.0,0.0,0.0,...,26.6,0.0,39.8986,0.0,2015.0,-2750.0,0.0,0.0,0.0,0.0
max,42.501429,16.528333,0.0,19.0,153.0,155.8242,145.0,145.0835,5.35,122.0,...,224.8,207.2622,187.1,173.1436,2024.0,7000.0,9.6,97.0,81.0,89.0


In [11]:
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

Missing Values:
barrels08         0
barrelsA08        0
charge120         0
charge240         0
city08            0
              ...  
modifiedOn        0
startStop     31689
phevCity          0
phevHwy           0
phevComb          0
Length: 84, dtype: int64


In [12]:
data = data.dropna(subset=['startStop'])

In [13]:
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

Missing Values:
barrels08     0
barrelsA08    0
charge120     0
charge240     0
city08        0
             ..
modifiedOn    0
startStop     0
phevCity      0
phevHwy       0
phevComb      0
Length: 84, dtype: int64


In [14]:
# Select relevant columns
data = data[['displ', 'cylinders', 'comb08']]


# Encode categorical variables if needed
# For example, 'cylinders' can be one-hot encoded
data = pd.get_dummies(data, columns=['cylinders'], drop_first=True)


KeyError: "['weight'] not in index"