# [과제 2] 회귀분석
### - Ch 1, Ch 2를 토대로 자유롭게 회귀분석과 회귀진단을 진행해주세요.
### - 주석으로 설명 및 근거 자세하게 달아주시면 감사하겠습니다. :)

In [147]:
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')
import statsmodels.api as sm

# Data 

데이터 출처 : https://www.kaggle.com/avikasliwal/used-cars-price-prediction 

< y > 
* **Price** : The price of the used car in INR Lakhs.



< X > 
* Name : The brand and model of the car
* Location : The location in which the car is being sold or is available for purchase.
* Year : The year or edition of the model.
* Kilometers_Driven : The total kilometres driven in the car by the previous owner(s) in KM.
* Fuel_Type : The type of fuel used by the car. (Petrol, Diesel, Electric, CNG, LPG)
* Transmission : The type of transmission used by the car. (Automatic / Manual)
* Owner_Type : Whether the ownership is Firsthand, Second hand or other.
* Mileage : The standard mileage offered by the car company in kmpl or km/kg
* Engine : The displacement volume of the engine in CC.
* Power : The maximum power of the engine in bhp.
* Seats : The number of seats in the car.
* New_Price : The price of a new car of the same model.

In [148]:
# Load Data 
data = pd.read_csv("assignment2_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [149]:
data.shape

(6019, 14)

In [150]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB


In [151]:
data.isnull().sum() # 결측치 확인

Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [152]:
data = data.drop(["Unnamed: 0","New_Price","Name","Location","Seats","Mileage","Power"], axis=1) # car price에 영향을 안 미친다고 생각하는 칼럼 삭제

In [165]:
data.head()

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Engine,Price
0,2010,72000,CNG,Manual,First,998.0,1.75
1,2015,41000,Diesel,Manual,First,1582.0,12.5
2,2011,46000,Petrol,Manual,First,1199.0,4.5
3,2012,87000,Diesel,Manual,First,1248.0,6.0
4,2013,40670,Diesel,Automatic,Second,1968.0,17.74


### 데이터 전처리

In [154]:
data['Engine'] = data['Engine'].replace('CC','',regex=True)
data['Engine']= data['Engine'].astype('float')
data['Engine'] = data['Engine'].fillna(data['Engine'].mean())   

In [167]:
data['Fuel_Type'].unique()

array(['CNG', 'Diesel', 'Petrol', 'LPG', 'Electric'], dtype=object)

In [168]:
def encode_Fuel_Typeint(Fuel_Type):
    if Fuel_Type == 'CNG':
        return 1
    if Fuel_Type == 'Diesel':
        return 2
    if Fuel_Type == 'Petrol':
        return 3
    if Fuel_Type == 'LPG':
        return 4
    if Fuel_Type == 'Electric':
        return 5

In [169]:
Fuel_Type = data['Fuel_Type']
encode_Fuel_Type = Fuel_Type.apply(encode_Fuel_Typeint)

In [170]:
print(encode_Fuel_Type[:5])

0    1
1    2
2    3
3    2
4    2
Name: Fuel_Type, dtype: int64


In [171]:
data['Transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [172]:
def encode_Transmissionint(Transmission):
    if Transmission == 'Manual':
        return 1
    if Transmission == 'Automatic':
        return 2

In [173]:
Transmission = data['Transmission']
encode_Transmission = Transmission.apply(encode_Transmissionint)

In [174]:
print(encode_Transmission[:5])

0    1
1    1
2    1
3    1
4    2
Name: Transmission, dtype: int64


In [159]:
data['Owner_Type'].unique()

array(['First', 'Second', 'Fourth & Above', 'Third'], dtype=object)

In [175]:
def encode_Owner_Typeint(Owner_Type):
    if Owner_Type == 'First':
        return 1
    if Owner_Type == 'Second':
        return 2
    if Owner_Type == 'Fourth & Above':
        return 3
    if Owner_Type == 'Third':
        return 4

In [177]:
Owner_Type = data['Owner_Type']
encode_Owner_Type = Owner_Type.apply(encode_Owner_Typeint)

In [178]:
print(encode_Owner_Type[:5])

0    1
1    1
2    1
3    1
4    2
Name: Owner_Type, dtype: int64


In [179]:
data['Fuel_Type'] = encode_Fuel_Type
data['Transmission'] = encode_Transmission
data['Owner_Type'] = encode_Owner_Type


In [180]:
data.head()

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Engine,Price
0,2010,72000,1,1,1,998.0,1.75
1,2015,41000,2,1,1,1582.0,12.5
2,2011,46000,3,1,1,1199.0,4.5
3,2012,87000,2,1,1,1248.0,6.0
4,2013,40670,2,2,2,1968.0,17.74


In [181]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               6019 non-null   int64  
 1   Kilometers_Driven  6019 non-null   int64  
 2   Fuel_Type          6019 non-null   int64  
 3   Transmission       6019 non-null   int64  
 4   Owner_Type         6019 non-null   int64  
 5   Engine             6019 non-null   float64
 6   Price              6019 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 329.3 KB


In [182]:
data.isnull().sum()

Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Engine               0
Price                0
dtype: int64

### OLS처리

In [183]:
data1 = data.drop(['Price'],axis=1)   # Feature에서 Target Data 분리

y = data[['Price']] # Target 변수저장


data1.head()

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Engine
0,2010,72000,1,1,1,998.0
1,2015,41000,2,1,1,1582.0
2,2011,46000,3,1,1,1199.0
3,2012,87000,2,1,1,1248.0
4,2013,40670,2,2,2,1968.0


In [185]:
x_data1 = sm.add_constant(data1, has_constant = "add")

In [186]:
multi_model = sm.OLS(y, x_data1)
fitted_multi_model = multi_model.fit()

# summary함수를 통해 OLS 결과 출력
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.611
Model:,OLS,Adj. R-squared:,0.611
Method:,Least Squares,F-statistic:,1575.0
Date:,"Wed, 01 Feb 2023",Prob (F-statistic):,0.0
Time:,19:30:26,Log-Likelihood:,-20232.0
No. Observations:,6019,AIC:,40480.0
Df Residuals:,6012,BIC:,40530.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2093.0601,62.588,-33.442,0.000,-2215.755,-1970.365
Year,1.0323,0.031,33.304,0.000,0.972,1.093
Kilometers_Driven,-1.145e-07,1.01e-06,-0.113,0.910,-2.1e-06,1.87e-06
Fuel_Type,-0.4207,0.192,-2.196,0.028,-0.796,-0.045
Transmission,7.3388,0.234,31.330,0.000,6.880,7.798
Owner_Type,0.0589,0.182,0.323,0.747,-0.299,0.417
Engine,0.0097,0.000,50.711,0.000,0.009,0.010

0,1,2,3
Omnibus:,4673.085,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,264110.95
Skew:,3.23,Prob(JB):,0.0
Kurtosis:,34.802,Cond. No.,75500000.0


In [None]:
# Kilometers_Driven 과 Owner_Type의 P-value가 0.910,0747으로 Price에 영향을 많이 주지않는 변수로 확인