# Assignment8

In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

# How Much is Your Car Worth?

Data about the retail price of 2005 General Motors cars can be found in `car_data.csv`.

The columns are:

1. Price: suggested retail price of the used 2005 GM car in excellent condition.
2. Mileage: number of miles the car has been driven
3. Make: manufacturer of the car such as Saturn, Pontiac, and Chevrolet
4. Model: specific models for each car manufacturer such as Ion, Vibe, Cavalier
5. Trim (of car): specific type of car model such as SE Sedan 4D, Quad Coupe 2D          
6. Type: body type such as sedan, coupe, etc.      
7. Cylinder: number of cylinders in the engine        
8. Liter: a more specific measure of engine size     
9. Doors: number of doors           
10. Cruise: indicator variable representing whether the car has cruise control (1 = cruise)
11. Sound: indicator variable representing whether the car has upgraded speakers (1 = upgraded)
12. Leather: indicator variable representing whether the car has leather seats (1 = leather)

## Tasks, Part 1

1. Find the linear regression equation for mileage vs price.
2. Chart the original data and the equation on the chart.
3. Find the equation's $R^2$ score (use the `.score` method) to determine whether the
equation is a good fit for this data. (0.8 and greater is considered a strong correlation.)

## Tasks, Part 2

1. Use mileage, cylinders, liters, doors, cruise, sound, and leather to find the linear regression equation.
2. Find the equation's $R^2$ score (use the `.score` method) to determine whether the
equation is a good fit for this data. (0.8 and greater is considered a strong correlation.)
3. Find the combination of the factors that is the best predictor for price.

## Tasks, Hard Mode

1. Research dummy variables in scikit-learn to see how to use the make, model, and body type.
2. Find the best combination of factors to predict price.

# Part-1 Solution

In [33]:
df = pd.read_csv("car_data.csv")

In [34]:
df.shape

(804, 12)

In [35]:
df.head()

Unnamed: 0,Price,Mileage,Make,Model,Trim,Type,Cylinder,Liter,Doors,Cruise,Sound,Leather
0,17314.103129,8221,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,1,1
1,17542.036083,9135,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,1,0
2,16218.847862,13196,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,1,0
3,16336.91314,16342,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,0,0
4,16339.170324,19832,Buick,Century,Sedan 4D,Sedan,6,3.1,4,1,0,1


In [36]:
df.isnull().sum()

Price       0
Mileage     0
Make        0
Model       0
Trim        0
Type        0
Cylinder    0
Liter       0
Doors       0
Cruise      0
Sound       0
Leather     0
dtype: int64

In [37]:
X = df.iloc[:,1]
y = df.iloc[:,0]

X = np.array(X).reshape((-1,1))
y = np.array(y).reshape((-1,1))

In [38]:
model = LinearRegression()
model.fit(X,y)

LinearRegression()

In [39]:
print("y = ",float(model.coef_),"x+",float(model.intercept_))

y =  -0.1725205027912493 x+ 24764.559006061685


In [40]:
pred = model.predict(X)
print("Score with Mileage column:",r2_score(y, pred))

Score with Mileage column: 0.02046344732348926


# Part-2 Solution

In [41]:
X2 = df.iloc[:,[1,6,7,8,9,10,11]]
y2 = df.iloc[:,0]

X2 = np.array(X2).reshape((-1,7))
y2 = np.array(y2).reshape((-1,1))

In [43]:
X2[0]

array([8.221e+03, 6.000e+00, 3.100e+00, 4.000e+00, 1.000e+00, 1.000e+00,
       1.000e+00])

In [44]:
model2 = LinearRegression()
model2.fit(X2,y2)

LinearRegression()

In [47]:
print("y = ",float(model2.coef_[0][0]),"x1+",
      float(model2.coef_[0][1]),"x2+",
      float(model2.coef_[0][2]),"x3+",
      float(model2.coef_[0][3]),"x4+",
      float(model2.coef_[0][4]),"x5+",
      float(model2.coef_[0][5]),"x6+",
      float(model2.coef_[0][6]),"x7+",
      float(model2.intercept_))

y =  -0.16974783233303692 x1+ 3792.378930782515 x2+ -787.2207316338321 x3+ -1542.745846382068 x4+ 6288.997149849932 x5+ -1993.795275698427 x6+ 3349.361619613555 x7+ 6758.755143598106


In [48]:
pred2 = model2.predict(X2)
print("Score with all columns:",r2_score(y2, pred2))

Score with all columns: 0.4462643536728379


In [56]:
l = []
for i in range(1,8):
  pca = PCA(n_components=i,random_state=0)
  pca.fit(X2)
  x=pca.transform(X2)

  model3 = LinearRegression()
  model3.fit(x,y2)

  preds3 = model3.predict(x)

  l.append(r2_score(y, preds3))
  print("With",i,"columns:",l[i-1])

With 1 columns: 0.020463447494586617
With 2 columns: 0.3504497977596356
With 3 columns: 0.3646064800564265
With 4 columns: 0.3673863089598691
With 5 columns: 0.37068136274290675
With 6 columns: 0.43879954665861454
With 7 columns: 0.446264353672838


# Part-3 Solution

In [8]:
print(df['Make'].unique())
print(df['Model'].unique())
print(df['Trim'].unique())
print(df['Type'].unique())

['Buick' 'Cadillac' 'Chevrolet' 'Pontiac' 'SAAB' 'Saturn']
['Century' 'Lacrosse' 'Lesabre' 'Park Avenue' 'CST-V' 'CTS' 'Deville'
 'STS-V6' 'STS-V8' 'XLR-V8' 'AVEO' 'Cavalier' 'Classic' 'Cobalt'
 'Corvette' 'Impala' 'Malibu' 'Monte Carlo' 'Bonneville' 'G6' 'Grand Am'
 'Grand Prix' 'GTO' 'Sunfire' 'Vibe' '9_3' '9_3 HO' '9_5' '9_5 HO'
 '9-2X AWD' 'Ion' 'L Series']
['Sedan 4D' 'CX Sedan 4D' 'CXL Sedan 4D' 'CXS Sedan 4D' 'Custom Sedan 4D'
 'Limited Sedan 4D' 'Special Ed Ultra 4D' 'DHS Sedan 4D' 'DTS Sedan 4D'
 'Hardtop Conv 2D' 'LS Hatchback 4D' 'LS Sedan 4D' 'LT Hatchback 4D'
 'LT Sedan 4D' 'SVM Hatchback 4D' 'SVM Sedan 4D' 'Coupe 2D' 'LS Coupe 2D'
 'LS Sport Coupe 2D' 'LS Sport Sedan 4D' 'Conv 2D' 'SS Sedan 4D'
 'LS MAXX Hback 4D' 'LT MAXX Hback 4D' 'MAXX Hback 4D' 'LT Coupe 2D'
 'SS Coupe 2D' 'GXP Sedan 4D' 'SE Sedan 4D' 'SLE Sedan 4D' 'GT Sedan 4D'
 'GT Coupe 2D' 'GTP Sedan 4D' 'AWD Sportwagon 4D' 'GT Sportwagon'
 'Sportwagon 4D' 'Linear Conv 2D' 'Linear Sedan 4D' 'Aero Conv 2D'
 'Aero 

In [9]:
encoded_df = pd.get_dummies(df, columns = ['Make','Model','Trim','Type'])

In [10]:
encoded_df.head(10)

Unnamed: 0,Price,Mileage,Cylinder,Liter,Doors,Cruise,Sound,Leather,Make_Buick,Make_Cadillac,...,Trim_SVM Hatchback 4D,Trim_SVM Sedan 4D,Trim_Sedan 4D,Trim_Special Ed Ultra 4D,Trim_Sportwagon 4D,Type_Convertible,Type_Coupe,Type_Hatchback,Type_Sedan,Type_Wagon
0,17314.103129,8221,6,3.1,4,1,1,1,1,0,...,0,0,1,0,0,0,0,0,1,0
1,17542.036083,9135,6,3.1,4,1,1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
2,16218.847862,13196,6,3.1,4,1,1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
3,16336.91314,16342,6,3.1,4,1,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
4,16339.170324,19832,6,3.1,4,1,0,1,1,0,...,0,0,1,0,0,0,0,0,1,0
5,15709.052821,22236,6,3.1,4,1,1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
6,15230.00339,22576,6,3.1,4,1,1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
7,15048.042184,22964,6,3.1,4,1,1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
8,14862.09387,24021,6,3.1,4,1,0,1,1,0,...,0,0,1,0,0,0,0,0,1,0
9,15295.018267,27325,6,3.1,4,1,1,1,1,0,...,0,0,1,0,0,0,0,0,1,0


In [61]:
X3 = encoded_df.iloc[:,1:99]
y3 = encoded_df.iloc[:,0]

X3 = np.array(X3).reshape((-1,97))
y3 = np.array(y3).reshape((-1,1))

In [70]:
l = []
for i in range(1,98):
  pca2 = PCA(n_components=i,random_state=0)
  pca2.fit(X3)
  x2=pca2.transform(X3)

  model4 = LinearRegression()
  model4.fit(x2,y3)

  preds4 = model4.predict(x2)

  l.append(r2_score(y3, preds4))
  print("With",i,"columns:",l[i-1])

With 1 columns: 0.020463447476670615
With 2 columns: 0.3463746179965892
With 3 columns: 0.354844220694815
With 4 columns: 0.48399829069977074
With 5 columns: 0.5245233822364945
With 6 columns: 0.7513895392062465
With 7 columns: 0.7533272011494492
With 8 columns: 0.8157224608984136
With 9 columns: 0.8285585281684755
With 10 columns: 0.8533899276476711
With 11 columns: 0.8533802490274434
With 12 columns: 0.853538925134046
With 13 columns: 0.8606554599708126
With 14 columns: 0.9009467871524242
With 15 columns: 0.901733068102241
With 16 columns: 0.9029650988621798
With 17 columns: 0.9180193209118185
With 18 columns: 0.9184534308134052
With 19 columns: 0.9214494192319863
With 20 columns: 0.9226204802718533
With 21 columns: 0.9221552964316928
With 22 columns: 0.9236437276500394
With 23 columns: 0.9234788577343084
With 24 columns: 0.9253637402029359
With 25 columns: 0.9256564901542834
With 26 columns: 0.9304535534119325
With 27 columns: 0.9306495777117311
With 28 columns: 0.9336821836403763
W

In [82]:
for i in l:
  if i>=0.992:
    print("With",l.index(i)+1,"columns:",i)

With 71 columns: 0.992176471462652
With 72 columns: 0.9921857928152665
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805
With 73 columns: 0.9924082614213805


# With 73 columns out of 98, we are able to achieve a score of 99.24%