# Creating fake data for car_sales (to make it a bit bigger)

This notebook will manufacture data for the car_sales dataframe to make it usable to explain different techniques for missing data and converting things to numbers.

In [62]:
import pandas as pd
import numpy as np

In [63]:
car_sales = pd.read_csv('../data/car-sales.csv')
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [64]:
car_sales.Make.unique()

array(['Toyota', 'Honda', 'BMW', 'Nissan'], dtype=object)

In [65]:
car_sales.Make.value_counts()

Make
Toyota    4
Honda     3
Nissan    2
BMW       1
Name: count, dtype: int64

## Create fake "Make" data

In [66]:
# Create fake "Make" data

toyota = ["Toyota" for i in range(0, 393)]
len(toyota)

393

In [67]:
print(toyota[:10])

['Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota', 'Toyota']


In [68]:
honda = ["Honda" for i in range(0, 304)]
len(honda)

304

In [69]:
print(honda[:10])

['Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda', 'Honda']


In [70]:
nissan = ["Nissan" for i in range(0, 198)]
len(nissan)

198

In [71]:
print(nissan[:10])

['Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan', 'Nissan']


In [72]:
bmw = ["BMW" for i in range(0, 105)]
# bmw = ["BMW" for i in range(0, 100)]
len(bmw), bmw[:10]

(105, ['BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW'])

In [73]:
makes = bmw+nissan+toyota+honda
len(makes)

1000

## Create fake "Colour" data

In [74]:
car_sales.Colour.unique()

array(['White', 'Red', 'Blue', 'Black', 'Green'], dtype=object)

In [75]:
car_sales.Colour.value_counts()

Colour
White    4
Blue     3
Red      1
Black    1
Green    1
Name: count, dtype: int64

In [76]:
white = ["White" for i in range(0, 407)]
len(white), white[:3]

(407, ['White', 'White', 'White'])

In [77]:
blue = ["Blue" for i in range(0, 321)]
len(blue), blue[:3]

(321, ['Blue', 'Blue', 'Blue'])

In [78]:
green = ["Green" for i in range(0, 79)]
len(green), green[:3]

(79, ['Green', 'Green', 'Green'])

In [79]:
black = ["Black" for i in range(0, 99)]
len(black), black[:3]

(99, ['Black', 'Black', 'Black'])

In [80]:
red = ["Red" for i in range(0, 94)]
len(red), red[:3]

(94, ['Red', 'Red', 'Red'])

In [81]:
colours = white+blue+green+black+red
len(colours)

1000

In [82]:
import random
colours_shuffled = random.sample(colours, len(colours))
len(colours_shuffled)

1000

In [83]:
print(colours_shuffled[:10])

['White', 'White', 'Blue', 'Red', 'Blue', 'White', 'Blue', 'White', 'Green', 'White']


## Create fake Odometer (KM) data

In [84]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [85]:
odometer = [random.randint(9789, 250000) for i in range(0, 1000)]
len(odometer)

1000

In [86]:
print(odometer[:10])

[195674, 66058, 82832, 135913, 18964, 97939, 233087, 177770, 36613, 189139]


## Create fake "Doors" data

In [87]:
five_doors = [5 for i in range(0, 79)]
three_doors = [3 for i in range(0, 65)]
four_doors = [4 for i in range(0, 856)]
doors = five_doors + three_doors + four_doors
doors_shuffled = random.sample(doors, len(doors))

In [88]:
print(doors_shuffled)

[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 3, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 3, 4, 4, 4, 4, 3, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 3, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 5, 3, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 4, 5, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 5, 4, 4, 4, 4, 5, 4, 4, 4, 4, 

## Create fake "Price" data

In [89]:
makes_series = pd.Series(makes)
makes_series.value_counts()

Toyota    393
Honda     304
Nissan    198
BMW       105
Name: count, dtype: int64

In [90]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [91]:
car_sales[car_sales["Make"] == "Toyota"]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
5,Toyota,Green,99213,4,"$4,500.00"
8,Toyota,White,60000,4,"$6,250.00"


In [92]:
car_sales[car_sales["Make"] == "Honda"]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
1,Honda,Red,87899,4,"$5,000.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"


In [93]:
car_sales[car_sales["Make"] == "Nissan"]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
4,Nissan,White,213095,4,"$3,500.00"
9,Nissan,White,31600,4,"$9,700.00"


In [94]:
prices = [random.randint(5000, 30000) for i in range(0, 1000)]
len(prices)

1000

In [95]:
print(prices[:30])

[9125, 21770, 28444, 16843, 6076, 25703, 15157, 18823, 15135, 10887, 8808, 5469, 21789, 18682, 23551, 7768, 28638, 11899, 5843, 5709, 27952, 25746, 27422, 18881, 28360, 11636, 12537, 25322, 24579, 11206]


## Create base dataframe with manufactured data

In [96]:
fake_sales = pd.DataFrame(columns = ["Make", "Colour", "Odometer (KM)", "Doors", "Price"])
fake_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price


In [97]:
len(colours_shuffled), len(odometer), len(doors), len(prices), len(makes)

(1000, 1000, 1000, 1000, 1000)

In [98]:
fake_sales["Make"] = makes
fake_sales["Colour"] = colours_shuffled
fake_sales["Odometer (KM)"] = odometer
fake_sales["Doors"] = doors
fake_sales["Price"] = prices

In [99]:
fake_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,BMW,White,195674,5,9125
1,BMW,White,66058,5,21770
2,BMW,Blue,82832,5,28444
3,BMW,Red,135913,5,16843
4,BMW,Blue,18964,5,6076


## Adjust the price column

For the price column:
* Generate random numbers between the certain values
* If the Odometer reading is above 100K, multiply price by 0.75
* If the Odometer reading is above 150K, multiply price by 0.6
* If the Odometer reading is above 200K, multiply price by 0.5
* If the Make column is BMW, multiply price by 1.5 + 2500
* If the Make column is Toyota, multuply price by 1.2
* If the Make is Nissan, multiply price by 1.1
* If the Make is Honda, add $1000 to price

In [100]:
fake_sales["Price"].describe()

count     1000.000000
mean     17478.029000
std       7210.549494
min       5020.000000
25%      11338.500000
50%      17361.000000
75%      23963.750000
max      29995.000000
Name: Price, dtype: float64

In [101]:
def price_od(price, odometer):
    """
    Changes price according to Odometer values.
    """
    if 100000 <= odometer <= 150000:
        return round(price * 0.75)
    elif 150001 <= odometer <= 200000:
        return round(price * 0.6)
    elif 200001 <= odometer:
        return round(price * 0.5)
    else:
        return price

fake_sales["Price"] = fake_sales.apply(lambda x: price_od(x["Price"], 
                                                          x["Odometer (KM)"]), 
                                                          axis=1)

fake_sales["Price"].describe()

count     1000.000000
mean     13522.083000
std       6802.528028
min       2549.000000
25%       8014.250000
50%      12686.000000
75%      17851.000000
max      29992.000000
Name: Price, dtype: float64

In [102]:
def price_make(price, make):
    """
    Manipulates the price base on the cars make.
    """
    if make == "BMW":
        return round((price * 1.5) + random.randint(3000, 10000))
    elif make == "Toyota":
        return round(price * 1.2)
    elif make == "Nissan":
        return round(price * 1.1)
    elif make == "Honda":
        return round(price + 1000)
    else:
        return price

fake_sales["Price"] = fake_sales.apply(lambda x: price_make(x["Price"], 
                                                            x["Make"]), 
                                                            axis=1)

fake_sales["Price"].describe()

count     1000.000000
mean     16543.549000
std       8632.092249
min       2804.000000
25%       9831.500000
50%      15071.000000
75%      21381.750000
max      49214.000000
Name: Price, dtype: float64

In [103]:
fake_sales = fake_sales.sample(frac=1)

In [104]:
fake_sales.reset_index(drop=True, inplace=True)
fake_sales.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,Green,208514,4,10135
1,Toyota,Black,238616,4,8844
2,Toyota,Blue,171655,4,19921
3,Nissan,Blue,202539,3,12234
4,Toyota,Green,172702,4,20132
5,Honda,White,47875,4,25269
6,Honda,White,44245,4,27490
7,Honda,Black,156914,4,11247
8,BMW,Black,71265,5,49214
9,Honda,Black,107013,4,23392


# NEXT:
* Drop some values at random (to manufacture missing data)
* Build a random forest model to predict (this will involve changing categories to numerical data)

In [105]:
# Export the data
fake_sales.to_csv("../data/car-sales-extended.csv")

## Make missing data in car_sales_extended

In [106]:
sales_ext = pd.read_csv("../data/car-sales-extended.csv")

In [107]:
len(sales_ext)

1000

In [108]:
sales_ext

Unnamed: 0.1,Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,0,Toyota,Green,208514,4,10135
1,1,Toyota,Black,238616,4,8844
2,2,Toyota,Blue,171655,4,19921
3,3,Nissan,Blue,202539,3,12234
4,4,Toyota,Green,172702,4,20132
...,...,...,...,...,...,...
995,995,Toyota,Black,153178,4,14032
996,996,Toyota,Red,107610,4,26966
997,997,Toyota,Blue,152855,4,4717
998,998,Honda,Red,132146,4,17280


### What we want to do
* Remove some rows values or replace them at random
    * E.g. replace strings with empty strings ("")
    * And numbers with NaN or something similar...
* Want to keep the number of samples the same, order the same, just put some holes in it

One way to do it would be to generate 50 random integers for each column and then drop/replace the indicies.

In [109]:
# Replicate the df
sales_ext_dropped = sales_ext

In [110]:
# Make column
np.random.seed(10)
make_idx = np.random.randint(0, 1000, 50)

In [111]:
make_idx

array([265, 125, 996, 527, 320, 369, 123, 156, 985, 733, 496, 925, 881,
         8,  73, 256, 490,  40, 502, 420, 371, 528, 356, 239, 395,  54,
       344, 363, 122, 574, 545, 200, 868, 974, 689, 691,  54,  77, 453,
        13, 755, 409, 382, 653, 860, 342, 798, 670,  89, 652])

In [112]:
for value in make_idx:
    sales_ext_dropped.loc[value, "Make"] = ""

In [113]:
sales_ext_dropped["Make"][266]

'Toyota'

In [114]:
# Colour column
np.random.seed(42)
colour_idx = np.random.randint(0, 1000, 50)
for value in colour_idx:
    sales_ext_dropped.loc[value, "Colour"] = ""

In [115]:
# Odometer (KM) column
np.random.seed(1)
odom_idx = np.random.randint(0, 1000, 50)
for value in odom_idx:
    sales_ext_dropped.loc[value, "Odometer (KM)"] = None

In [116]:
# Doors column
np.random.seed(2)
door_idx = np.random.randint(0, 1000, 50)
for value in door_idx:
    sales_ext_dropped.loc[value, "Doors"] = None

In [117]:
# Price column
np.random.seed(3)
price_idx = np.random.randint(0, 1000, 50)
for value in price_idx:
    sales_ext_dropped.loc[value, "Price"] = None

In [118]:
sales_ext_dropped.head()

Unnamed: 0.1,Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,0,Toyota,Green,208514.0,4.0,10135.0
1,1,Toyota,Black,238616.0,4.0,8844.0
2,2,Toyota,Blue,171655.0,4.0,19921.0
3,3,Nissan,Blue,202539.0,3.0,12234.0
4,4,Toyota,Green,172702.0,4.0,20132.0


In [119]:
# Check how many of our values are missing/NaN
sales_ext_dropped.isna().sum()

Unnamed: 0        0
Make              0
Colour            0
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [120]:
# Export dataframe with random missing values
sales_ext_dropped.to_csv("../data/car-sales-extended-missing-data.csv", index=False)