In [232]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# Notebook-wide display configuration (applies to every cell below).
# pandas: widen the text repr, allow up to 100 columns before truncating,
# and render DataFrames as HTML tables in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn: white background with grid lines, "poster" context for plot element sizing.
sns.set_style("whitegrid")
sns.set_context("poster")

# Data taken from https://www.kaggle.com/toramky/automobile-dataset

# Load the raw table; '?' entries in the file are parsed as missing values.
df = pd.read_csv("./Automobile_data.csv", sep=',', na_values='?')
df.head(10)

# Replace hyphens with underscores in every column label so that columns
# can be accessed as attributes (e.g. df.drive_wheels).
old_names = df.columns.values
new_names = [label.replace('-', '_') for label in old_names]
df.rename(
    columns={old: new for old, new in zip(old_names, new_names)},
    inplace=True,
)
df.columns.values

array(['symboling', 'normalized_losses', 'make', 'fuel_type',
       'aspiration', 'num_of_doors', 'body_style', 'drive_wheels',
       'engine_location', 'wheel_base', 'length', 'width', 'height',
       'curb_weight', 'engine_type', 'num_of_cylinders', 'engine_size',
       'fuel_system', 'bore', 'stroke', 'compression_ratio', 'horsepower',
       'peak_rpm', 'city_mpg', 'highway_mpg', 'price'], dtype=object)

In [233]:
# Discard rows with a missing value in any column that the
# recommendation stages below filter, rank or cluster on.
df = df.dropna(
    subset=[
        'price',
        'drive_wheels',
        'highway_mpg',
        'city_mpg',
        'body_style',
        'horsepower',
        'num_of_doors',
        'aspiration',
    ]
)
df.head(10)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
5,2,,audi,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,15250.0
6,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
7,1,,audi,gas,std,four,wagon,fwd,front,105.8,192.7,71.4,55.7,2954,ohc,five,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,18920.0
8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,ohc,five,131,mpfi,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,2,192.0,bmw,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,ohc,four,108,mpfi,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


In [237]:
def first_stage(df, price, drive_wheels):
    """
    Hard filter: keep only the cars within the price range and with an
    acceptable drive-wheel type.

    Inputs:
    - df: the data frame to filter; must have `price` and `drive_wheels` columns
    - price: tuple of two floats -- the lower bound and upper bound (both inclusive)
    - drive_wheels: a list of strings of the preferred types of drive_wheels
      (a single string is accepted and treated as a one-element list)

    Outputs:
    - filtered: a dataframe containing the rows satisfying both conditions,
      with the original index and column order preserved
    """
    # `Series.isin` rejects a bare string, so wrap it instead of raising.
    if isinstance(drive_wheels, str):
        drive_wheels = [drive_wheels]

    # `between` is inclusive on both ends; rows with a missing price compare False.
    in_budget = df.price.between(price[0], price[1])
    wheels_ok = df.drive_wheels.isin(drive_wheels)
    filtered = df[in_budget & wheels_ok]
    return filtered
# Test: cars priced between 10000 and 30000 with rear- or front-wheel drive.
# The result `fil1` is displayed as the cell output.
fil1 = first_stage(df, (10000,30000), ['rwd','fwd'])
fil1


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
5,2,,audi,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.40,8.5,110.0,5500.0,19,25,15250.0
6,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.40,8.5,110.0,5500.0,19,25,17710.0
7,1,,audi,gas,std,four,wagon,fwd,front,105.8,192.7,71.4,55.7,2954,ohc,five,136,mpfi,3.19,3.40,8.5,110.0,5500.0,19,25,18920.0
8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,ohc,five,131,mpfi,3.13,3.40,8.3,140.0,5500.0,17,20,23875.0
10,2,192.0,bmw,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,ohc,four,108,mpfi,3.50,2.80,8.8,101.0,5800.0,23,29,16430.0
11,0,192.0,bmw,gas,std,four,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,ohc,four,108,mpfi,3.50,2.80,8.8,101.0,5800.0,23,29,16925.0


In [238]:
def second_stage(df, num_of_doors, aspiration, keep_fraction=0.99):
    """
    Soft filter: ranks the cars by how well they match the preferred number
    of doors and aspiration type, then keeps the best-ranked share of them.

    Each row scores 10 if its door count lies within `num_of_doors` and 100
    if its aspiration is one of `aspiration`; rows are sorted by the summed
    score (highest first, ties in original order).

    Inputs:
    - df: the data frame to rank; must have `num_of_doors` ('two'/'four')
      and `aspiration` columns
    - num_of_doors: tuple of two numbers -- the lower bound and upper bound
      (both inclusive) on the door count
    - aspiration: a list of strings of the preferred types of aspiration
      (a single string is accepted and treated as a one-element list)
    - keep_fraction: share of the rows to keep, truncated to a whole number
      of rows; the default 0.99 drops roughly the worst 1%

    Outputs:
    - filtered: a dataframe with the top-ranked rows, best match first,
      carrying the same columns as `df`
    """
    door_weight, aspiration_weight = 10, 100
    doors_dict = {'two': 2,
                  'four': 4}

    # `Series.isin` rejects a bare string, so wrap it instead of raising.
    if isinstance(aspiration, str):
        aspiration = [aspiration]

    # Unknown or missing door labels map to NaN, which `between` treats as
    # "not in range" instead of raising a KeyError.
    door_counts = df['num_of_doors'].map(doors_dict)
    doors_score = door_counts.between(num_of_doors[0], num_of_doors[1]).astype(int)
    aspiration_score = df['aspiration'].isin(aspiration).astype(int)
    ranks = door_weight * doors_score + aspiration_weight * aspiration_score

    n_keep = int(keep_fraction * len(df.index))
    # `.values` attaches the scores positionally, so a non-unique index is safe.
    ranked = df.assign(ranks=ranks.values)
    filtered = ranked.nlargest(n_keep, 'ranks', keep='first').drop(columns='ranks')
    return filtered
# Test: rank the first-stage result (`fil1`), preferring two/three-door
# turbo cars. The previous call referenced `fil`, which is not defined
# anywhere in the visible notebook.
fil2 = second_stage(fil1, (2,3), ['turbo'])
fil2

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
29,3,145.0,dodge,gas,turbo,two,hatchback,fwd,front,95.9,173.2,66.3,50.2,2811,ohc,four,156,mfi,3.6,3.9,7.0,145.0,5000.0,19,24,12964.0
82,3,,mitsubishi,gas,turbo,two,hatchback,fwd,front,95.9,173.2,66.3,50.2,2833,ohc,four,156,spdi,3.58,3.86,7.0,145.0,5000.0,19,24,12629.0
83,3,,mitsubishi,gas,turbo,two,hatchback,fwd,front,95.9,173.2,66.3,50.2,2921,ohc,four,156,spdi,3.59,3.86,7.0,145.0,5000.0,19,24,14869.0
84,3,,mitsubishi,gas,turbo,two,hatchback,fwd,front,95.9,173.2,66.3,50.2,2926,ohc,four,156,spdi,3.59,3.86,7.0,145.0,5000.0,19,24,14489.0
124,3,,plymouth,gas,turbo,two,hatchback,rwd,front,95.9,173.2,66.3,50.2,2818,ohc,four,156,spdi,3.59,3.86,7.0,145.0,5000.0,19,24,12764.0
108,0,161.0,peugot,diesel,turbo,four,sedan,rwd,front,107.9,186.7,68.4,56.7,3197,l,four,152,idi,3.7,3.52,21.0,95.0,4150.0,28,33,13200.0
110,0,,peugot,diesel,turbo,four,wagon,rwd,front,114.2,198.9,68.4,58.7,3430,l,four,152,idi,3.7,3.52,21.0,95.0,4150.0,25,25,13860.0
174,-1,65.0,toyota,diesel,turbo,four,sedan,fwd,front,102.4,175.6,66.5,54.9,2480,ohc,four,110,idi,3.27,3.35,22.5,73.0,4500.0,30,33,10698.0
192,0,,volkswagen,diesel,turbo,four,sedan,fwd,front,100.4,180.2,66.9,55.1,2579,ohc,four,97,idi,3.01,3.4,23.0,68.0,4500.0,33,38,13845.0
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0


In [263]:
from kmodes.kmodes.kprototypes import KPrototypes

# kmodes library: https://github.com/nicodv/kmodes

def third_stage(df, highway_mpg, city_mpg, body_style, horsepower,
                n_clusters=4, random_state=None):
    """
    Recommends the cars that fall in the same cluster as the requested
    highway_mpg, city_mpg, body_style and horsepower, using the
    k-prototypes algorithm (mixed numeric/categorical clustering).

    Inputs:
    - df: the data frame to filter; must have `highway_mpg`, `city_mpg`,
      `body_style` and `horsepower` columns
    - highway_mpg, city_mpg, horsepower: tuples of two floats -- the lower
      bound and upper bound; the midpoint (mean) of each is used as the query
    - body_style: string, the preferred type of body_style
    - n_clusters: number of clusters to fit (default 4)
    - random_state: seed forwarded to KPrototypes; set it to make the
      clustering, and therefore the recommendation, reproducible

    Outputs:
    - a dataframe with the rows of `df` assigned to the same cluster as the
      query, with the same columns as `df`
    """
    feature_columns = ['highway_mpg', 'city_mpg', 'body_style', 'horsepower']
    categorical = [2]  # position of `body_style` within feature_columns

    X_train = df[feature_columns].values
    kproto = KPrototypes(n_clusters=n_clusters, init='Huang', verbose=0,
                         random_state=random_state)
    clusters = kproto.fit(X_train, categorical=categorical).labels_

    # Object dtype keeps the numeric entries as floats next to the string.
    X_test = np.array(
        [[np.mean(highway_mpg), np.mean(city_mpg), body_style, np.mean(horsepower)]],
        dtype=object,
    )
    labels = kproto.predict(X_test, categorical=categorical)

    # Mask the frame that was actually clustered (`df`), not a notebook global.
    return df[clusters == labels[0]]
# Test: cluster the second-stage result and keep the cars grouped with a
# hatchback of roughly 25 highway mpg, 40 city mpg and 135 horsepower
# (the midpoints of the ranges passed below).
# NOTE(review): the city_mpg range (30,50) is higher than the highway_mpg
# range (20,30) -- possibly swapped; confirm the intended values.
fil3 = third_stage(fil2, (20,30), (30,50), 'hatchback', (120,150))
fil3

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
29,3,145.0,dodge,gas,turbo,two,hatchback,fwd,front,95.9,173.2,66.3,50.2,2811,ohc,four,156,mfi,3.6,3.9,7.0,145.0,5000.0,19,24,12964.0
82,3,,mitsubishi,gas,turbo,two,hatchback,fwd,front,95.9,173.2,66.3,50.2,2833,ohc,four,156,spdi,3.58,3.86,7.0,145.0,5000.0,19,24,12629.0
83,3,,mitsubishi,gas,turbo,two,hatchback,fwd,front,95.9,173.2,66.3,50.2,2921,ohc,four,156,spdi,3.59,3.86,7.0,145.0,5000.0,19,24,14869.0
84,3,,mitsubishi,gas,turbo,two,hatchback,fwd,front,95.9,173.2,66.3,50.2,2926,ohc,four,156,spdi,3.59,3.86,7.0,145.0,5000.0,19,24,14489.0
124,3,,plymouth,gas,turbo,two,hatchback,rwd,front,95.9,173.2,66.3,50.2,2818,ohc,four,156,spdi,3.59,3.86,7.0,145.0,5000.0,19,24,12764.0
101,0,128.0,nissan,gas,std,four,sedan,fwd,front,100.4,181.7,66.5,55.1,3095,ohcv,six,181,mpfi,3.43,3.27,9.0,152.0,5200.0,17,22,13499.0
102,0,108.0,nissan,gas,std,four,wagon,fwd,front,100.4,184.6,66.5,56.1,3296,ohcv,six,181,mpfi,3.43,3.27,9.0,152.0,5200.0,17,22,14399.0
103,0,108.0,nissan,gas,std,four,sedan,fwd,front,100.4,184.6,66.5,55.1,3060,ohcv,six,181,mpfi,3.43,3.27,9.0,152.0,5200.0,19,25,13499.0
