In [108]:
import pandas as pd
import numpy as np

# Questions

1. Create a series from a numeric column that has the value of 'high' if it is equal to or above the mean and 'low' if it is below the mean using .apply. 
2. Create a series from a numeric column that has the value of 'high' if it is equal to or above the mean and 'low' if it is below the mean using np.select. 
3. Time the differences between the previous two solutions to see which is faster. 
4. Replace the missing values of a numeric series with the median value. 
5. Clip the values of a numeric series to between the 10th and 90th percentiles.
6. Using a categorical column, replace any value that is not in the top 5 most frequent values with 'Other'.
7. Using a categorical column, replace any value that is not in the top 10 most frequent values with 'Other'.
8. Make a function that takes a categorical series and a number (n) and returns a new series in which any value that is not among the n most frequent values is replaced with 'Other'.
9. Using a numeric column, bin it into 10 groups that have the same width.
10. Using a numeric column, bin it into 10 groups that each contain roughly the same number of values.

In [109]:
# Load the dataset that every exercise below reads from.
# The path is relative, so vehicles.csv must sit in the notebook's working directory.
vehicles = pd.read_csv('vehicles.csv')

  vehicles = pd.read_csv('vehicles.csv')


In [110]:
# Numeric column used for Questions 1-3.
city_mpg = vehicles.city08

### Question 1: Create a series from a numeric column that has the value of 'high' if it is equal to or above the mean and 'low' if it is below the mean using `.apply`

#### Create the Function

In [111]:
def high_low(val, threshold=None):
    """Label a value 'high' or 'low' relative to a threshold.

    Parameters
    ----------
    val : number
        The value to classify.
    threshold : number, optional
        Cut-off to compare against. When omitted, the mean of
        ``vehicles.city08`` is computed on every call, so existing calls such
        as ``city_mpg.apply(high_low)`` behave exactly as before. Pass a
        precomputed value, e.g.
        ``city_mpg.apply(high_low, threshold=city_mpg.mean())``, to avoid
        recomputing the mean for each element.

    Returns
    -------
    str
        'high' when ``val >= threshold``, otherwise 'low'. A NaN ``val``
        compares False and is therefore labelled 'low'.
    """
    if threshold is None:
        # Fallback keeps the original behaviour: compare against the mean of
        # the notebook-global city08 column. ``is None`` (not truthiness) so
        # that an explicit threshold of 0 is honoured.
        threshold = vehicles.city08.mean()
    return 'high' if val >= threshold else 'low'


#### Write some tests
There are three tests here:
1. Check when the value is above the mean
2. Check when the value is equal to the mean
3. Check when the value is below the mean

In [112]:
def test_high_low_high_value_above_mean():
    """A value one unit above the city08 mean must be labelled 'high'."""
    above_mean = vehicles.city08.mean() + 1
    label = high_low(above_mean)
    assert label == 'high'


# Run the check immediately; it is silent when the assertion holds.
test_high_low_high_value_above_mean()

In [113]:
def test_high_low_high_value_equal_mean():
    """A value exactly equal to the city08 mean must be labelled 'high'."""
    exactly_mean = vehicles.city08.mean()
    label = high_low(exactly_mean)
    assert label == 'high'


# Run the check immediately; it is silent when the assertion holds.
test_high_low_high_value_equal_mean()

In [114]:
def test_high_low_low_value_below_mean():
    """A value one unit below the city08 mean must be labelled 'low'."""
    below_mean = vehicles.city08.mean() - 1
    label = high_low(below_mean)
    assert label == 'low'


# Run the check immediately; it is silent when the assertion holds.
test_high_low_low_value_below_mean()

In [115]:
# Label every value by calling high_low once per element.
city_mpg.apply(high_low)

0        high
1         low
2        high
3         low
4         low
         ... 
41139    high
41140    high
41141     low
41142     low
41143     low
Name: city08, Length: 41144, dtype: object

### Question 2: Create a series from a numeric column that has the value of 'high' if it is equal to or above the mean and 'low' if it is below the mean using `np.select`

In working on this exercise, I was a bit confused by the example given in Section 9.2. Matt compressed too much of what was going on in the `np.select` call, which made it difficult for me to understand.

Once I got to the [numpy documentation](https://numpy.org/doc/stable/reference/generated/numpy.select.html) it was a bit clearer. This is why I have written the answer below the way I did: to hopefully provide a bit more clarity for myself in the future.

In [116]:
# np.select takes parallel lists: the i-th choice is used wherever the i-th
# condition is True, and the default fills every position no condition matched.
condition_list = [city_mpg.ge(city_mpg.mean())]  # single boolean mask: at or above the mean
choice_list = ['high']  # label paired with that mask
default_value = 'low'  # label for every value below the mean
pd.Series(
    np.select(
        condlist=condition_list,
        choicelist=choice_list,
        default=default_value,
    )
)

0        high
1         low
2        high
3         low
4         low
         ... 
41139    high
41140    high
41141     low
41142     low
41143     low
Length: 41144, dtype: object

### Question 3: Time the differences between the previous two solutions to see which is faster. 

In [117]:
%%timeit
# Time the element-wise .apply solution from Question 1.
city_mpg.apply(high_low)

3.63 s ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [118]:
%%timeit
# Time the np.select solution from Question 2 (reuses the lists defined in that cell).
pd.Series(np.select(condition_list, choice_list, default_value))

3.17 ms ± 496 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Answer 3
`city_mpg.apply(high_low)` took on average 3.63 seconds to complete

`pd.Series(np.select(condition_list, choice_list, default_value))` took on average 3.17 ms to complete

This makes the np.select solution more than **1000x faster** (3.63 s / 3.17 ms ≈ 1,145x)

### Question 4: Replace the missing values of a numeric series with the median value. 

In [60]:
displ = vehicles.displ  # numeric column reused in Questions 4, 5, 9 and 10
missing = displ.isna()  # boolean mask of the NaN entries (not referenced again in the cells shown)
displ_median = displ.median()  # fill value for the missing entries
# Show a slice that contains missing values (rows 7138 and 7139 in the output below).
displ.loc[7136:7142]

7136    3.3
7137    3.3
7138    NaN
7139    NaN
7140    3.0
7141    3.2
7142    1.8
Name: displ, dtype: float64

In [59]:
# Same slice as above with the NaNs replaced by the median; the result is only displayed, not stored.
displ.fillna(displ_median).loc[7136:7142]

7136    3.3
7137    3.3
7138    3.0
7139    3.0
7140    3.0
7141    3.2
7142    1.8
Name: displ, dtype: float64

### Question 5: Clip the values of a numeric series to between the 10th and 90th percentiles.

In [65]:
# 10th and 90th percentiles of displ, used as the clipping limits in the cells below.
lower_bound = displ.quantile(.1)
upper_bound = displ.quantile(.9)
lower_bound, upper_bound

(1.8, 5.3)

Examples of clipping on upper bound

In [68]:
# Values above upper_bound are replaced by upper_bound (row 47: 5.9 -> 5.3).
displ.loc[46:47].clip(lower_bound, upper_bound)

46    3.0
47    5.3
Name: displ, dtype: float64

In [69]:
# The same rows before clipping, for comparison.
displ.loc[46:47]

46    3.0
47    5.9
Name: displ, dtype: float64

Examples of clipping on lower bound

In [73]:
# Values below lower_bound are raised to lower_bound (rows 7 and 8: 1.6 -> 1.8).
displ.loc[5:12].clip(lower_bound, upper_bound)

5     1.8
6     1.8
7     1.8
8     1.8
9     1.8
10    1.8
11    2.0
12    2.0
Name: displ, dtype: float64

In [72]:
# The same rows before clipping, for comparison.
displ.loc[5:12]

5     1.8
6     1.8
7     1.6
8     1.6
9     1.8
10    1.8
11    2.0
12    2.0
Name: displ, dtype: float64

### Question 6: Using a categorical column, replace any value that is not in the top 5 most frequent values with 'Other'.

In [86]:
eng_dscr = vehicles.eng_dscr  # string-valued column used in Questions 6-8
# value_counts sorts by frequency, most common first, so the top-n values
# are the first n entries of its index.
vc = vehicles.eng_dscr.value_counts()
vc

(FFS)                               8827
SIDI                                5526
(FFS) CA model                       926
(FFS)      (MPFI)                    734
FFV                                  701
                                    ... 
B308E5 FFS,TURBO                       1
5.4E-R FFS MPFI                        1
V-6 FFS                                1
(GUZZLER)  (FFS)      (S-CHARGE)       1
R-ENG (FFS,TRBO)                       1
Name: eng_dscr, Length: 557, dtype: int64

In [82]:
# vc is ordered by descending count, so slicing its index yields the most frequent values.
top5 = vc.index[:5]
top10 = vc.index[:10]
def generalize(val, top=None):
    """Keep a value if it is one of the allowed values, else return 'Other'.

    Parameters
    ----------
    val : object
        The value to check (one element of the series when used with ``.apply``).
    top : collection, optional
        Values to keep. Defaults to the notebook-global ``top5`` so that
        ``eng_dscr.apply(generalize)`` behaves exactly as before. Pass another
        collection (for example ``top10``) to reuse the function for a
        different cut-off instead of copying it.

    Returns
    -------
    object
        ``val`` unchanged when it is in ``top``, otherwise the string 'Other'.
    """
    if top is None:
        # Resolved at call time, so the default tracks the current top5.
        # ``is None`` (not truthiness) so an explicit empty collection is honoured.
        top = top5
    return val if val in top else 'Other'

In [85]:
# Replace every value outside the five most frequent ones with 'Other'.
eng_dscr.apply(generalize)

0        (FFS)
1        Other
2        (FFS)
3        Other
4        Other
         ...  
41139    (FFS)
41140    (FFS)
41141    (FFS)
41142    (FFS)
41143    Other
Name: eng_dscr, Length: 41144, dtype: object

### Question 7: Using a categorical column, replace any value that is not in the top 10 most frequent values with 'Other'.

In [87]:
def generalize10(val):
    """Return ``val`` when it is one of the ``top10`` values, otherwise 'Other'."""
    return val if val in top10 else 'Other'

In [89]:
# Replace every value outside the ten most frequent ones with 'Other'.
eng_dscr.apply(generalize10)

0             (FFS)
1             Other
2             (FFS)
3             Other
4        (FFS,TRBO)
            ...    
41139         (FFS)
41140         (FFS)
41141         (FFS)
41142         (FFS)
41143    (FFS,TRBO)
Name: eng_dscr, Length: 41144, dtype: object

### Question 8: Make a function that takes a categorical series and a number (n) and returns a new series in which any value that is not among the n most frequent values is replaced with 'Other'.

In [106]:
def generalize_n(s: pd.Series, n: int, other: str = 'Other') -> pd.Series:
    """Replace every value outside the ``n`` most frequent ones with ``other``.

    Parameters
    ----------
    s : pd.Series
        Series of categorical data (plain strings/objects or ``category`` dtype).
        It is not modified.
    n : int
        Number of most frequent values to keep. Must be non-negative; ``0``
        replaces everything.
    other : str, default 'Other'
        Replacement label for the values that are not kept.

    Returns
    -------
    pd.Series
        A new Series with the same index as ``s``. Missing values never count
        as frequent (``value_counts`` drops them), so they become ``other`` too.

    Raises
    ------
    ValueError
        If ``n`` is negative. A negative slice bound would otherwise silently
        keep "all but the last |n|" values instead of the top ``n``.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if isinstance(s.dtype, pd.CategoricalDtype) and other not in s.cat.categories:
        # A category-dtype Series rejects values outside its categories, so
        # register the replacement label first (returns a copy; s is untouched).
        s = s.cat.add_categories([other])
    # value_counts is sorted by descending frequency: the first n index
    # entries are the n most common values.
    top_n = s.value_counts().index[:n]
    return s.where(s.isin(top_n), other)

Now, we write some tests to ensure that the function does what we expect it to

In [107]:
# generalize_n must reproduce the hand-written top-5 and top-10 versions:
# no position may differ between the two results.
assert not (generalize_n(eng_dscr, 5) != eng_dscr.apply(generalize)).any()
assert not (generalize_n(eng_dscr, 10) != eng_dscr.apply(generalize10)).any()

### Question 9: Using a numeric column, bin it into 10 groups that have the same width.

In [95]:
# An integer bin count makes pd.cut split the range of displ into 10 intervals of equal width.
pd.cut(displ, 10)

0        (1.68, 2.52]
1         (4.2, 5.04]
2        (1.68, 2.52]
3        (5.04, 5.88]
4        (1.68, 2.52]
             ...     
41139    (1.68, 2.52]
41140    (1.68, 2.52]
41141    (1.68, 2.52]
41142    (1.68, 2.52]
41143    (1.68, 2.52]
Name: displ, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(-0.0084, 0.84] < (0.84, 1.68] < (1.68, 2.52] < (2.52, 3.36] ... (5.04, 5.88] < (5.88, 6.72] < (6.72, 7.56] < (7.56, 8.4]]

### Question 10: Using a numeric column, bin it into 10 groups that each contain roughly the same number of values.

In [96]:
# qcut places the bin edges at quantiles, so the 10 bins hold roughly equal numbers of rows (their widths vary).
pd.qcut(displ, 10)

0        (1.8, 2.0]
1        (4.6, 5.3]
2        (2.0, 2.4]
3        (4.6, 5.3]
4        (2.0, 2.4]
            ...    
41139    (2.0, 2.4]
41140    (2.0, 2.4]
41141    (2.0, 2.4]
41142    (2.0, 2.4]
41143    (2.0, 2.4]
Name: displ, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(-0.001, 1.8] < (1.8, 2.0] < (2.0, 2.4] < (2.4, 2.5] ... (3.5, 3.9] < (3.9, 4.6] < (4.6, 5.3] < (5.3, 8.4]]

In [122]:
def test_function(my_list)->int:
    return my_list[0] + my_list[1]

In [126]:
# Exercise test_function with a one-element list and check that it agrees with the built-in sum.
my_list = [1]
test_function(my_list)
assert sum(my_list) == test_function(my_list)

IndexError: list index out of range