In [233]:
# %pip install streamlit

In [234]:
# Import necessary modules.
import pandas as pd
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [235]:
# Load the dataset.
car_data = pd.read_csv("/Users/tonyx/Downloads/CarPriceData.csv")
car_data

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


# Data Cleaning

In [236]:
# Extract the name of the car manufactures.
car_name = pd.Series([car.split(" ")[0] for car in car_data['CarName']], index = car_data.index)
car_name

0      alfa-romero
1      alfa-romero
2      alfa-romero
3             audi
4             audi
          ...     
200          volvo
201          volvo
202          volvo
203          volvo
204          volvo
Length: 205, dtype: object

In [237]:
# Create new column for car company names.
car_data['car_company'] = car_name

# Replace misspelled car company names
corrections = {
    "vw": "volkswagen",
    "vokswagen": "volkswagen",
    "porcshce": "porsche",
    "toyouta": "toyota",
    "Nissan": "nissan",
    "maxda": "mazda"
}
car_data['car_company'].replace(corrections, inplace=True)

cylinder_mapping = {
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'ten': 10,
    'twelve': 12
}

car_data['cylindernumber'].replace(cylinder_mapping, inplace=True)

car_body_type = {
    "hardtop": 1,
    "hatchback": 2,
    "sedan": 3,
    "wagon": 4,
    "convertible":5
}
car_data['carbody'].replace(car_body_type, inplace=True)

# Drop the 'CarName' column
car_data.drop(columns=['CarName'], inplace=True)

# Select numeric columns
cars_numeric_data = car_data.select_dtypes(include=['int64', 'float64'])

# Drop the 'car_ID' column
cars_numeric_data.drop(columns=['car_ID'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_data['car_company'].replace(corrections, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_data['cylindernumber'].replace(cylinder_mapping, inplace=True)
  car_data['cylindernumber'].replace(cylinder_mapping, inplace=True)
The behavior will change in pandas 3.

In [238]:
car_data

Unnamed: 0,car_ID,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,car_company
0,1,3,gas,std,two,5,rwd,front,88.6,168.8,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,2,3,gas,std,two,5,rwd,front,88.6,168.8,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,3,1,gas,std,two,2,rwd,front,94.5,171.2,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,4,2,gas,std,four,3,fwd,front,99.8,176.6,...,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0,audi
4,5,2,gas,std,four,3,4wd,front,99.4,176.6,...,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0,audi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,gas,std,four,3,rwd,front,109.1,188.8,...,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0,volvo
201,202,-1,gas,turbo,four,3,rwd,front,109.1,188.8,...,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0,volvo
202,203,-1,gas,std,four,3,rwd,front,109.1,188.8,...,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0,volvo
203,204,-1,diesel,turbo,four,3,rwd,front,109.1,188.8,...,idi,3.01,3.40,23.0,106,4800,26,27,22470.0,volvo


In [239]:
rows_with_null= car_data[car_data.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,car_ID,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,car_company


In [240]:
columns_to_drop = ['wheelbase','carwidth','carheight','curbweight','boreratio','stroke','compressionratio']
car_data.drop(columns=columns_to_drop, inplace=True)
mean_mpg = car_data[['citympg', 'highwaympg']].mean(axis=1)
car_data['average_mpg'] = mean_mpg
car_data.columns
print(car_data.head(10))

   car_ID  symboling fueltype aspiration doornumber  carbody drivewheel  \
0       1          3      gas        std        two        5        rwd   
1       2          3      gas        std        two        5        rwd   
2       3          1      gas        std        two        2        rwd   
3       4          2      gas        std       four        3        fwd   
4       5          2      gas        std       four        3        4wd   
5       6          2      gas        std        two        3        fwd   
6       7          1      gas        std       four        3        fwd   
7       8          1      gas        std       four        4        fwd   
8       9          1      gas      turbo       four        3        fwd   
9      10          0      gas      turbo        two        2        4wd   

  enginelocation  carlength enginetype  cylindernumber  enginesize fuelsystem  \
0          front      168.8       dohc               4         130       mpfi   
1          f

In [241]:
columns_to_drop = ['peakrpm','citympg','highwaympg','symboling','fueltype','aspiration','doornumber','enginelocation','fuelsystem','peakrpm','citympg','highwaympg']
car_data.columns

Index(['car_ID', 'symboling', 'fueltype', 'aspiration', 'doornumber',
       'carbody', 'drivewheel', 'enginelocation', 'carlength', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'horsepower', 'peakrpm',
       'citympg', 'highwaympg', 'price', 'car_company', 'average_mpg'],
      dtype='object')

In [242]:
X = car_data[['carbody', 'carlength','cylindernumber','enginesize','horsepower','average_mpg']]  
y = car_data['price']  # 选择你的目标变量列
car_data['carbody']

0      5
1      5
2      2
3      3
4      3
      ..
200    3
201    3
202    3
203    3
204    3
Name: carbody, Length: 205, dtype: int64

In [243]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

In [244]:
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print('MSE:', mse)
print('R^2:', r2)

MSE: 15845895.917584207
R^2: 0.7992768072741101
