# Importing the dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('car_prices.csv')

# Learning the dataset

In [3]:
df.head(3)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558837 entries, 0 to 558836
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          558837 non-null  int64  
 1   make          548536 non-null  object 
 2   model         548438 non-null  object 
 3   trim          548186 non-null  object 
 4   body          545642 non-null  object 
 5   transmission  493485 non-null  object 
 6   vin           558833 non-null  object 
 7   state         558837 non-null  object 
 8   condition     547017 non-null  float64
 9   odometer      558743 non-null  float64
 10  color         558088 non-null  object 
 11  interior      558088 non-null  object 
 12  seller        558837 non-null  object 
 13  mmr           558799 non-null  float64
 14  sellingprice  558825 non-null  float64
 15  saledate      558825 non-null  object 
dtypes: float64(4), int64(1), object(11)
memory usage: 68.2+ MB


In [5]:
#df['year'].value_counts()
df['make'].value_counts()

make
Ford          93554
Chevrolet     60197
Nissan        53946
Toyota        39871
Dodge         30710
              ...  
airstream         1
ford tk           1
chev truck        1
hyundai tk        1
Lotus             1
Name: count, Length: 96, dtype: int64

In [6]:
df['body'].value_counts()

body
Sedan          199437
SUV            119292
sedan           41906
suv             24552
Hatchback       21380
                ...  
cab plus 4          1
g37 coupe           1
CTS-V Wagon         1
Ram Van             1
cts wagon           1
Name: count, Length: 87, dtype: int64

In [7]:
df['state'].value_counts()

state
fl                   82945
ca                   73148
pa                   53907
tx                   45913
ga                   34750
                     ...  
3vwd17aj2fm285365        1
3vwd17aj3fm276741        1
3vwd17aj2fm258506        1
3vwd17aj4fm201708        1
3vwd17aj2fm261566        1
Name: count, Length: 64, dtype: int64

In [8]:
df['transmission'].value_counts()

transmission
automatic    475915
manual        17544
sedan            15
Sedan            11
Name: count, dtype: int64

In [9]:
df['sellingprice'].value_counts()

sellingprice
11000.0     4453
12000.0     4450
13000.0     4334
10000.0     4029
14000.0     3899
            ... 
28350.0        1
30201.0        1
131500.0       1
31550.0        1
27840.0        1
Name: count, Length: 1887, dtype: int64

In [10]:
df['mmr'].value_counts()

mmr
12500.0     1761
11600.0     1751
11650.0     1746
12150.0     1722
11850.0     1717
            ... 
172000.0       1
134000.0       1
113000.0       1
154000.0       1
164000.0       1
Name: count, Length: 1101, dtype: int64

#### Here we can see that there are values in the "transmission" column which do not make sense; a sedan is a the body of the vehicle and can be either manual or automatic. Considering that the dataset is so large we can afford to drop those rows with an incorrect value for the "transmission" column. There are still rows with empty cells so part of the preprocessing would be to populate those cells with null values and then remove later. 

#### Here we see that some values in the "state" column are out of place, we can remove those rows. 

# Data Preprocessing 

#### Firstly, we remove all the column which we are not using in the dataset. These include: model,trim , vin, condition, odometer, color , interior and saledate .

In [11]:
#code
df.drop('model', axis=1, inplace=True)
df.drop('trim', axis=1,inplace=True)
df.drop('vin', axis=1,inplace=True)
df.drop('condition', axis=1,inplace=True)
df.drop('odometer', axis=1,inplace=True)
df.drop('color', axis=1,inplace=True)
df.drop('interior', axis=1,inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558837 entries, 0 to 558836
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          558837 non-null  int64  
 1   make          548536 non-null  object 
 2   body          545642 non-null  object 
 3   transmission  493485 non-null  object 
 4   state         558837 non-null  object 
 5   seller        558837 non-null  object 
 6   mmr           558799 non-null  float64
 7   sellingprice  558825 non-null  float64
 8   saledate      558825 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 38.4+ MB


#### Secondly , we clean the 'Transmission' column by removing the values 'sedan' and 'Sedan', then we will remove all the rows with null values in the column.

In [17]:
#code
df.drop(df.loc[df['transmission']=="sedan"].index, inplace=True)
df.drop(df.loc[df['transmission']=="Sedan"].index, inplace=True)

#### Thirdly, we clean the 'body', 'sellingprice',  'mmr' and 'transmission' column by removing all the null values

In [33]:
#code
df.dropna()

Unnamed: 0,year,make,body,transmission,state,seller,mmr,sellingprice,saledate
0,2015,Kia,SUV,automatic,ca,kia motors america inc,20500.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,SUV,automatic,ca,kia motors america inc,20800.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,Sedan,automatic,ca,financial services remarketing (lease),31900.0,30000.0,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,Sedan,automatic,ca,volvo na rep/world omni,27500.0,27750.0,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,Sedan,automatic,ca,financial services remarketing (lease),66000.0,67000.0,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)
...,...,...,...,...,...,...,...,...,...
558831,2011,BMW,Sedan,automatic,fl,lauderdale imports ltd bmw pembrok pines,20300.0,22800.0,Tue Jul 07 2015 06:15:00 GMT-0700 (PDT)
558833,2012,Ram,Crew Cab,automatic,wa,i -5 uhlmann rv,30200.0,30800.0,Wed Jul 08 2015 09:30:00 GMT-0700 (PDT)
558834,2012,BMW,SUV,automatic,ca,financial services remarketing (lease),29800.0,34000.0,Wed Jul 08 2015 09:30:00 GMT-0700 (PDT)
558835,2015,Nissan,sedan,automatic,ga,enterprise vehicle exchange / tra / rental / t...,15100.0,11100.0,Thu Jul 09 2015 06:45:00 GMT-0700 (PDT)


#### Then we clean the 'state' column by removing the rows which have nonsensical values.

In [34]:
#code 
df['state'].unique()

array(['ca', 'tx', 'pa', 'mn', 'az', 'wi', 'tn', 'md', 'fl', 'ne', 'nj',
       'nv', 'oh', 'mi', 'ga', 'va', 'sc', 'nc', 'in', 'il', 'co', 'ut',
       'mo', 'ny', 'ma', 'pr', 'or', 'la', 'wa', 'hi', 'qc', 'ab', 'on',
       'ok', 'ms', 'nm', 'al', 'ns'], dtype=object)

usa: ALABAMA -AL
ALASKA 	-AK
ARIZONA -AZ
CALIFORNIA -CA
COLORADO -CO
FLORIDA -FL
GEORGIA -GA
HAWAII- HI
ILLINOIS -IL
INDIANA -IN
LOUISIANA -LA
MARYLAND -MD
MASSACHUSETTS -MA
MICHIGAN -MI
MINNESOTA -MN
MISSISSIPPI -MS
MISSOURI -MO
NEVADA -NV
NEW JERSEY -NJ
NEW MEXICO -NM
NEW YORK -NY
NORTH CAROLINA -NC
OHIO -OH
OKLAHOMA -OK
OREGON -OR
PENNSYLVANIA -PA
PUERTO RICO-PR 
SOUTH CAROLINA -SC
TENNESSEE -TN
TEXAS -TX
UTAH- UT
VIRGINIA -VA
WASHINGTON -WA
WISCONSIN -WI

canada: QUEBEC-QC, ALBERTA-AB, ONTARIO-ON

In [44]:
df.loc[df['state'] == "on"]

Unnamed: 0,year,make,body,transmission,state,seller,mmr,sellingprice,saledate
9463,2011,Ford,SuperCrew,notApp,on,canadian auto remarketing,22100.0,21000.0,Thu Dec 18 2014 18:30:00 GMT-0800 (PST)
9824,2011,Honda,SUV,notApp,on,coconut grove fleet and lease,17200.0,18100.0,Thu Dec 18 2014 18:30:00 GMT-0800 (PST)
10508,2011,MINI,Wagon,notApp,on,mercedes-benz,15950.0,14200.0,Thu Dec 18 2014 18:30:00 GMT-0800 (PST)
11364,2011,Toyota,Minivan,notApp,on,mercedes-benz,13200.0,12700.0,Thu Dec 18 2014 18:30:00 GMT-0800 (PST)
11381,2011,volkswagen,,notApp,on,auction direct,10300.0,6900.0,Thu Dec 18 2014 18:30:00 GMT-0800 (PST)
...,...,...,...,...,...,...,...,...,...
551944,2014,Toyota,sedan,notApp,on,eagle north leasing inc,14900.0,14000.0,Tue Jun 16 2015 03:00:00 GMT-0700 (PDT)
557812,2004,Jeep,suv,notApp,on,oakville chrysler dodge jeep ram,3000.0,3600.0,Thu Jun 18 2015 11:30:00 GMT-0700 (PDT)
558322,2011,Jeep,suv,notApp,on,oakville chrysler dodge jeep ram,26800.0,25000.0,Thu Jun 18 2015 11:30:00 GMT-0700 (PDT)
558351,2010,Jeep,suv,notApp,on,robinson buick gmc ltd,5825.0,8400.0,Thu Jun 18 2015 11:30:00 GMT-0700 (PDT)


In [47]:
#df.loc[df['state'] == "ab", 'state'] = "mn"
#df.drop(df['state'] == "ne", inplace=True)

In [42]:
df['state'].value_counts()

state
fl    82945
ca    73148
pa    53907
tx    45913
ga    34750
nj    27784
il    23486
nc    21845
oh    21575
tn    20895
mo    16013
mi    15511
nv    12685
va    12027
md    11158
mn    10357
wi     9851
az     8741
co     7775
wa     7416
ma     6729
ny     5699
in     4325
sc     4251
ne     4013
on     3442
pr     2725
la     2191
ms     1851
ut     1836
qc     1245
hi     1237
or     1155
nm      171
ok       72
ns       61
al       26
Name: count, dtype: int64