# Question: How are the cars in this dataset grouped together?

# Import Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans

# Load Dataset

In [3]:
# Pull seaborn's bundled 'mpg' example data into a DataFrame.
Mpg = sns.load_dataset(name='mpg')

In [4]:
# Preview the first five rows of the raw data.
Mpg.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


# Drop Categorical Variables and Missing Values

In [5]:
# Remove the two non-numeric columns ('origin' and 'name') before clustering.
MpgTrimmed = Mpg.drop(columns=['origin', 'name'])

In [6]:
# Discard every row that still contains a missing value.
# Rebinding the name (instead of inplace=True) yields the same frame while
# avoiding in-place mutation of an object an earlier cell created.
MpgTrimmed = MpgTrimmed.dropna()

In [7]:
# Show the cleaned frame via the notebook's rich display (pandas truncates long
# frames to the first/last rows) rather than print(), which wraps columns as text.
MpgTrimmed

      mpg  cylinders  displacement  horsepower  weight  acceleration  \
0    18.0          8         307.0       130.0    3504          12.0   
1    15.0          8         350.0       165.0    3693          11.5   
2    18.0          8         318.0       150.0    3436          11.0   
3    16.0          8         304.0       150.0    3433          12.0   
4    17.0          8         302.0       140.0    3449          10.5   
..    ...        ...           ...         ...     ...           ...   
393  27.0          4         140.0        86.0    2790          15.6   
394  44.0          4          97.0        52.0    2130          24.6   
395  32.0          4         135.0        84.0    2295          11.6   
396  28.0          4         120.0        79.0    2625          18.6   
397  31.0          4         119.0        82.0    2720          19.4   

     model_year  
0            70  
1            70  
2            70  
3            70  
4            70  
..          ...  
393      

In [9]:
# Number of non-missing entries per column (all equal once NaN rows are gone).
MpgTrimmed.notna().sum()

mpg             392
cylinders       392
displacement    392
horsepower      392
weight          392
acceleration    392
model_year      392
dtype: int64

In [10]:
# Preview the first five rows of the cleaned, numeric-only frame.
MpgTrimmed.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


# Prepare Data for k-Means

# Convert floats into integers

In [11]:
# Print the column dtypes and non-null counts to see which columns are floats.
MpgTrimmed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
dtypes: float64(4), int64(3)
memory usage: 24.5 KB


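*Looks like mpg, displacement, horsepower, and acceleration are all floats. K-means works on floats directly, so converting them to integers is optional; it is done here only to simplify the values. Note that `astype(int)` truncates the fractional part rather than rounding.*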

In [12]:
# Cast mpg to integers; astype(int) drops the fractional part rather than rounding.
MpgTrimmed['mpg'] = MpgTrimmed['mpg'].astype(int)

In [13]:
# Cast the remaining float columns to integers in one call.
# astype(int) truncates the fractional part (e.g. an acceleration of 11.5 becomes 11).
MpgTrimmed = MpgTrimmed.astype({
    'displacement': int,
    'horsepower': int,
    'acceleration': int,
})

# Perform K-Means Clustering

## Testing 2 Clusters

In [14]:
# Cluster on the car attributes only. If this cell is re-run after a 'Group'
# column has been added, that column must not be fed back in as a feature;
# errors='ignore' keeps the cell working on a fresh kernel where it is absent.
cluster_features = MpgTrimmed.drop(columns='Group', errors='ignore')

# A fixed seed and explicit n_init make the cluster labels reproducible across runs.
# NOTE(review): the features are unscaled, so large-valued columns such as weight
# dominate the Euclidean distance -- consider StandardScaler (already imported).
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(cluster_features)

KMeans(n_clusters=2)

In [15]:
# Attach each car's cluster label from the most recent fit as the 'Group' column.
MpgTrimmed = MpgTrimmed.assign(Group=kmeans.labels_)

In [16]:
# Preview the first five rows, now including the 'Group' label.
MpgTrimmed.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,Group
0,18,8,307,130,3504,12,70,0
1,15,8,350,165,3693,11,70,0
2,18,8,318,150,3436,11,70,0
3,16,8,304,150,3433,12,70,0
4,17,8,302,140,3449,10,70,0


In [17]:
# Average of every attribute within each cluster.
group_means = MpgTrimmed.groupby(by='Group').mean()
group_means

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,16.314103,7.237179,301.653846,137.564103,3879.532051,14.237179,74.762821
1,27.889831,4.305085,123.521186,82.59322,2381.381356,15.813559,76.783898


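*If you use two clusters, it looks like the first cluster (Group 0) contains cars that get fewer miles per gallon, have more cylinders on average, have greater displacement, more horsepower, are heavier, and are slightly older. Note that `mpg` measures fuel economy rather than speed, and `acceleration` is the time in seconds to reach 60 mph, so Group 0's lower value means those cars actually accelerate faster. In summation: older, larger vehicles — trucks perhaps.*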

## Testing 3 Clusters

In [19]:
# Cluster on the car attributes only. MpgTrimmed already carries a 'Group'
# column from the previous clustering run; fitting on the whole frame would
# feed those old labels back in as a feature. errors='ignore' keeps the cell
# working if the column is absent.
cluster_features = MpgTrimmed.drop(columns='Group', errors='ignore')

# A fixed seed and explicit n_init make the cluster labels reproducible across runs.
# NOTE(review): the features are unscaled, so large-valued columns such as weight
# dominate the Euclidean distance -- consider StandardScaler (already imported).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(cluster_features)

KMeans(n_clusters=3)

In [20]:
# Overwrite 'Group' with the labels from the most recent fit.
MpgTrimmed = MpgTrimmed.assign(Group=kmeans.labels_)

In [21]:
# Average of every attribute within each cluster.
group_means = MpgTrimmed.groupby(by='Group').mean()
group_means

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,20.590164,5.819672,212.614754,105.401639,3162.581967,15.516393,76.352459
1,14.533333,7.866667,344.144444,157.811111,4236.322222,13.2,74.011111
2,29.483333,4.038889,107.205556,77.166667,2222.827778,15.955556,76.711111


*Ok, now with three clusters, it looks like you have a group that gets better mpg and has fewer cylinders, is low on horsepower and is new and light. Probably little sedans.*

*Then you have the original big, heavy, low-mpg group (Group 1).*

*The third one seems to be midrange cars*

## Testing 4 Clusters

In [22]:
# Cluster on the car attributes only. MpgTrimmed already carries a 'Group'
# column from the previous clustering run; fitting on the whole frame would
# feed those old labels back in as a feature. errors='ignore' keeps the cell
# working if the column is absent.
cluster_features = MpgTrimmed.drop(columns='Group', errors='ignore')

# A fixed seed and explicit n_init make the cluster labels reproducible across runs.
# NOTE(review): the features are unscaled, so large-valued columns such as weight
# dominate the Euclidean distance -- consider StandardScaler (already imported).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(cluster_features)

KMeans(n_clusters=4)

In [23]:
# Overwrite 'Group' with the labels from the most recent fit.
MpgTrimmed = MpgTrimmed.assign(Group=kmeans.labels_)

In [24]:
# Average of every attribute within each cluster.
group_means = MpgTrimmed.groupby(by='Group').mean()
group_means

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,18.179775,6.640449,259.966292,116.808989,3484.483146,15.337079,75.58427
1,13.898551,8.0,356.536232,165.130435,4366.594203,12.782609,73.608696
2,24.418367,4.704082,154.346939,94.295918,2746.438776,15.326531,77.112245
3,30.566176,3.977941,98.125,72.948529,2107.705882,16.205882,76.625


*Adding a fourth group means that Group 1 becomes the oldest, heaviest, least fuel-efficient group yet!*

*Can probably stop now, since there doesn't seem to be a lot of differentiation between groups. It is subjective, but it looks like three clusters may be optimal.*