# Chapter 9: Manipulation Methods

In [1]:
import pandas as pd
import numpy as np

# Vehicle dataset used throughout this chapter. Fetched over HTTPS rather
# than plain HTTP so the download cannot be tampered with in transit.
url = "https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip"
df = pd.read_csv(url)
# Columns reused by later cells.
city_mpg = df.city08
highway_mpg = df.highway08

  df = pd.read_csv(url)


Create a series from a numeric column that has the value of 'high' if it is equal to or above
the mean and 'low' if it is below the mean using np.select.

In [2]:
s= pd.Series([3,5,2,7, 1,9,10, 23, 43, 12])

In [3]:
# The exercise asks for np.select: 'high' when the value is at or above the
# mean, 'low' otherwise. np.select returns a plain ndarray, so the original
# index is re-attached when wrapping the result in a Series.
pd.Series(np.select([s >= s.mean()], ['high'], default='low'), index=s.index)

Unnamed: 0,0
0,low
1,low
2,low
3,low
4,low
5,low
6,low
7,high
8,high
9,high


In [4]:
s.mean()

11.5

Replace the missing values of a numeric series with the median value.

In [9]:
s.loc[1] = np.nan

In [10]:
s

Unnamed: 0,0
0,3.0
1,
2,2.0
3,7.0
4,1.0
5,9.0
6,10.0
7,23.0
8,43.0
9,12.0


In [13]:
s.median()

9.0

In [12]:
s.fillna(s.median())

Unnamed: 0,0
0,3.0
1,9.0
2,2.0
3,7.0
4,1.0
5,9.0
6,10.0
7,23.0
8,43.0
9,12.0


Clip the values of a numeric series to between the 10th and 90th percentiles.

In [14]:
# Bound the series by its own 10th and 90th percentiles.
s.clip(lower=s.quantile(0.1), upper=s.quantile(0.9))

Unnamed: 0,0
0,3.0
1,
2,2.0
3,7.0
4,1.8
5,9.0
6,10.0
7,23.0
8,27.0
9,12.0


Using a categorical column, replace any value that is not in the top 5 most frequent values
with 'Other'.

In [17]:
make = df.make

In [18]:
make.value_counts()

Unnamed: 0_level_0,count
make,Unnamed: 1_level_1
Chevrolet,4003
Ford,3371
Dodge,2583
GMC,2494
Toyota,2071
...,...
Volga Associated Automobile,1
Panos,1
Mahindra,1
Excalibur Autos,1


In [19]:
top5 = make.value_counts().index[:5]

In [20]:
make.where(make.isin(top5), 'Other')

Unnamed: 0,make
0,Other
1,Other
2,Dodge
3,Dodge
4,Other
...,...
41139,Other
41140,Other
41141,Other
41142,Other


In [22]:
s.loc[1]=20

In [23]:
s

Unnamed: 0,0
0,3.0
1,20.0
2,2.0
3,7.0
4,1.0
5,9.0
6,10.0
7,23.0
8,43.0
9,12.0


In [24]:
pd.cut(s, 10)

Unnamed: 0,0
0,"(0.958, 5.2]"
1,"(17.8, 22.0]"
2,"(0.958, 5.2]"
3,"(5.2, 9.4]"
4,"(0.958, 5.2]"
5,"(5.2, 9.4]"
6,"(9.4, 13.6]"
7,"(22.0, 26.2]"
8,"(38.8, 43.0]"
9,"(9.4, 13.6]"


In [25]:
pd.cut(s, 10).nunique()

6

In [26]:
5.2-0.958

4.242

In [36]:
df2 = pd.DataFrame({'values': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

In [39]:
df2['bins'] = pd.cut(df2['values'], bins = 3)
print(df2)

   values          bins
0       1  (0.991, 4.0]
1       2  (0.991, 4.0]
2       3  (0.991, 4.0]
3       4  (0.991, 4.0]
4       5    (4.0, 7.0]
5       6    (4.0, 7.0]
6       7    (4.0, 7.0]
7       8   (7.0, 10.0]
8       9   (7.0, 10.0]
9      10   (7.0, 10.0]


In [32]:
# Six sample values spread between 1 and 25
df2 = pd.DataFrame({'values': [1, 5, 10, 15, 20, 25]})

# Split the value range into 3 intervals of equal width
df2 = df2.assign(bins=pd.cut(df2['values'], bins=3))

print(df2)

   values          bins
0       1  (0.976, 9.0]
1       5  (0.976, 9.0]
2      10   (9.0, 17.0]
3      15   (9.0, 17.0]
4      20  (17.0, 25.0]
5      25  (17.0, 25.0]


In [34]:
df2['bins_with_labels'] = pd.cut(df2['values'], bins=3, labels=['Low', 'Medium', 'High'])

print(df2)

   values bins_with_labels
0       1              Low
1       5              Low
2      10           Medium
3      15           Medium
4      20             High
5      25             High


In [41]:
data = {'values': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
df3 = pd.DataFrame(data)

# Create 3 equal-width bins: pd.cut divides the value range evenly, so the
# bins may hold different numbers of rows (use pd.qcut for equal-sized bins)
df3['bins'] = pd.cut(df3['values'], bins=3)
print(df3)

   values          bins
0       1  (0.991, 4.0]
1       2  (0.991, 4.0]
2       3  (0.991, 4.0]
3       4  (0.991, 4.0]
4       5    (4.0, 7.0]
5       6    (4.0, 7.0]
6       7    (4.0, 7.0]
7       8   (7.0, 10.0]
8       9   (7.0, 10.0]
9      10   (7.0, 10.0]


In [16]:
s.quantile(0.1), s.quantile(0.9)

(1.8, 27.000000000000004)

## 9.1 .apply, .where and .mask

- ``.apply`` allows you to apply a function element-wise to every value.
- If we pass in a numpy function that works on an array, it will broadcast the operation to the series
- However, ``.apply`` is not very efficient because the function is called once for every value. This breaks out of the fast vectorized code paths we can leverage in pandas.

In [None]:
def gt20(val):
    """Return whether ``val`` is strictly greater than 20."""
    threshold = 20
    return val > threshold

In [None]:
%%timeit
# using apply function
city_mpg.apply(gt20)

4.59 ms ± 39.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# using broadcasting .gt method
city_mpg.gt(20)

78.1 µs ± 166 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
make = df.make

In [None]:
make.value_counts()

Chevrolet                      4003
Ford                           3371
Dodge                          2583
GMC                            2494
Toyota                         2071
                               ... 
Volga Associated Automobile       1
Panos                             1
Mahindra                          1
Excalibur Autos                   1
London Coach Co Inc               1
Name: make, Length: 136, dtype: int64

- In the example  below, the function ``generalize_top5`` is being called once for every value
- A faster way is using the ``.where`` method.
- The ``.where`` method keeps values where the condition (a boolean series) is True and uses the ``other`` parameter to supply the replacement wherever the condition is False

In [None]:
# keep the first five entries in the index and replace everything else with other
top5 = make.value_counts().index[:5]

In [None]:
def generalize_top5(val):
    """Return ``val`` unchanged if it is one of ``top5``, else ``"Other"``.

    ``top5`` is read from the enclosing (notebook) namespace.
    """
    return val if val in top5 else "Other"

In [None]:
%%timeit
# slow
make.apply(generalize_top5)

5.74 ms ± 23.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# better
make.where(make.isin(top5), other='Other')

1.8 ms ± 7.85 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


- ``.mask`` is the complement of the ``.where`` method.
- Wherever the condition is False, it keeps the original values. Where it is True, it replaces the value with the ``other`` parameter.

In [None]:
make.mask(~make.isin(top5), other='Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

## 9.2 If Else and .select

In [None]:
vc = make.value_counts()
top5 = vc.index[:5]
top10 = vc.index[:10]

In [None]:
def generalize(val):
    """Bucket ``val`` into itself, ``'Top10'`` or ``'Other'``.

    Values found in ``top5`` are returned unchanged; values found only in
    ``top10`` become ``'Top10'``; everything else becomes ``'Other'``.
    Both ``top5`` and ``top10`` are read from the enclosing (notebook)
    namespace.
    """
    if val in top5:
        return val
    return 'Top10' if val in top10 else 'Other'

In [None]:
make.apply(generalize)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [None]:
# Two-pass relabel: everything outside the top 5 becomes 'Top10' first, then
# everything outside the top 10 is overwritten with 'Other'.
(make
 .mask(~make.isin(top5), 'Top10')
 .mask(~make.isin(top10), 'Other'))

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

- ``np.select`` takes a list of boolean arrays and a list of corresponding replacement values; the final argument is the default used where no condition is True

In [None]:
# np.select returns a bare ndarray, so make's index is re-attached explicitly.
# Without it the result gets a fresh 0..n-1 index and would misalign with
# `make` whenever make's index is not the default one.
pd.Series(
    np.select(
        [make.isin(top5), make.isin(top10)],  # conditions, checked in order
        [make, 'Top10'],                      # value used for each condition
        'Other',                              # default when no condition holds
    ),
    index=make.index,
)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Length: 41144, dtype: object

## 9.3 Missing Data

In [None]:
cyl = df.cylinders

In [None]:
(cyl
.isna()
.sum())

206

- Hard to determine why these values are missing from just the ``cylinders`` series alone
- We use the ``make`` column to give us some insight

In [None]:
missing = cyl.isna()
make.loc[missing]

7138     Nissan
7139     Toyota
8143     Toyota
8144       Ford
8146       Ford
          ...  
34563     Tesla
34564     Tesla
34565     Tesla
34566     Tesla
34567     Tesla
Name: make, Length: 206, dtype: object

## 9.4 Filling in Missing Data

- The ``.fillna`` method allows us to specify a replacement value for any missing data
- ``.dropna``
- ``.ffill``
- ``.interpolate()``

In [None]:
cyl[cyl.isna()]

7138    NaN
7139    NaN
8143    NaN
8144    NaN
8146    NaN
         ..
34563   NaN
34564   NaN
34565   NaN
34566   NaN
34567   NaN
Name: cylinders, Length: 206, dtype: float64

In [None]:
# fillna method
cyl.fillna(0).loc[7136:7141]

7136    6.0
7137    6.0
7138    0.0
7139    0.0
7140    6.0
7141    6.0
Name: cylinders, dtype: float64

## 9.5 Interpolating Data

- Another option for replacing missing data is the ``.interpolate`` method
- Handy if the data is ordered (e.g., a time series) and there are holes in the data
- In the example below, index label 2 was missing. We use the index label 1 and index label 3 to infer index label 2

In [None]:
temp = pd.Series([32, 40, None, 42, 39,32])
temp

0    32.0
1    40.0
2     NaN
3    42.0
4    39.0
5    32.0
dtype: float64

In [None]:
temp.interpolate()

0    32.0
1    40.0
2    41.0
3    42.0
4    39.0
5    32.0
dtype: float64

## 9.6 Clipping Data

- If we have outliers in our data, we want to use the ``.clip`` method
- We can trim the values to be between 5th and 95th quantile

In [None]:
city_mpg.loc[:446]

0      19
1       9
2      23
3      10
4      17
       ..
442    15
443    15
444    15
445    15
446    31
Name: city08, Length: 447, dtype: int64

In [None]:
# trim values between 5th and 95th quantile
(city_mpg
.loc[:446]
.clip(lower=city_mpg.quantile(0.05),
      upper=city_mpg.quantile(0.95)))

0      19
1      11
2      23
3      11
4      17
       ..
442    15
443    15
444    15
445    15
446    27
Name: city08, Length: 447, dtype: int64

## 9.7 Sorting Values

In [None]:
city_mpg.sort_values()

7901       6
34557      6
37161      6
21060      6
35887      6
        ... 
34563    138
34564    140
32599    150
31256    150
33423    150
Name: city08, Length: 41144, dtype: int64

In [None]:
# Arithmetic between two Series pairs values up by index label, so sorting
# city_mpg first does not change which city/highway values get averaged.
city_mpg.sort_values().add(highway_mpg).div(2)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64

## 9.8 Sorting the Index

In [None]:
city_mpg.sort_values().sort_index()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

## 9.9 Dropping Duplicates

- ``.drop_duplicates`` method will remove values that appear more than once
- We can determine whether to keep the first or last duplicate value found using the ``keep`` parameter
- Default value is *'first'*
- Setting it to *'last'*  will use the last value
- If we set it to ``False`` (the boolean, not the string ``'False'``), it will drop every value that appears more than once

In [None]:
city_mpg.drop_duplicates()

0         19
1          9
2         23
3         10
4         17
        ... 
34364    127
34409    114
34564    140
34565    115
34566    104
Name: city08, Length: 105, dtype: int64

## 9.10 Ranking Data

https://dataindependent.com/pandas/pandas-rank-rank-your-data-pd-df-rank/

- The rank() function is used to compute numerical data ranks (1 through n) along axis. By default, equal values are assigned a rank that is the average of the ranks of those values
- ``.rank`` method will return a series that keeps the original index but uses the ranks of values from the original series
- Control how ranking occurs with the ``method`` parameter
- By default, if two values are the same, their rank will be the average of the positions they take.
- Specify ``min`` to put equal values in the same rank
- Specify ``dense`` to not skip any positions

In [None]:
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [None]:
city_mpg.rank()

0        27060.5
1          235.5
2        35830.0
3          607.5
4        19484.0
          ...   
41139    27060.5
41140    29719.5
41141    23528.0
41142    23528.0
41143    15479.0
Name: city08, Length: 41144, dtype: float64

In [None]:
city_mpg.rank(method='min')

0        25555.0
1          136.0
2        35119.0
3          336.0
4        17467.0
          ...   
41139    25555.0
41140    28567.0
41141    21502.0
41142    21502.0
41143    13492.0
Name: city08, Length: 41144, dtype: float64

## 9.11 Replacing Data

- The ``to_replace`` parameter's value can contain a regular expression if you provide ``regex=True`` parameter

In [None]:
make.replace('Subaru', "Sub")

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4               Sub
            ...    
41139           Sub
41140           Sub
41141           Sub
41142           Sub
41143           Sub
Name: make, Length: 41144, dtype: object

In [None]:
make.replace(r'(Fer)ra(r.*)',
        value=r'\2-other-\1', regex=True)


0          Alfa Romeo
1        ri-other-Fer
2               Dodge
3               Dodge
4              Subaru
             ...     
41139          Subaru
41140          Subaru
41141          Subaru
41142          Subaru
41143          Subaru
Name: make, Length: 41144, dtype: object

In [None]:
# example 1
s = pd.Series([40, 20, 30, 20, 10])
s

0    40
1    20
2    30
3    20
4    10
dtype: int64

In [None]:
s.replace(to_replace=[40, 10], value=[42, 9.8])

0    42.0
1    20.0
2    30.0
3    20.0
4     9.8
dtype: float64

In [None]:
s.replace(to_replace={40: 42,
                      10: 9.8})

0    42.0
1    20.0
2    30.0
3    20.0
4     9.8
dtype: float64

In [None]:
# example 2
s = pd.Series(["Dave", "Suzy", "Adam", "Livz"])

In [None]:
s.replace(to_replace="Suzy", value="Suzanne")

0       Dave
1    Suzanne
2       Adam
3       Livz
dtype: object

In [None]:
s.replace(to_replace={'Suzy': 'Suzanne'})

0       Dave
1    Suzanne
2       Adam
3       Livz
dtype: object

In [None]:
s.replace(to_replace='z.*', value='zanne', regex=True)

0        Dave
1     Suzanne
2        Adam
3    Livzanne
dtype: object

## 9.12 Binning Data

- Create bins of equal width

In [None]:
pd.cut(city_mpg, 10)

0        (5.856, 20.4]
1        (5.856, 20.4]
2         (20.4, 34.8]
3        (5.856, 20.4]
4        (5.856, 20.4]
             ...      
41139    (5.856, 20.4]
41140    (5.856, 20.4]
41141    (5.856, 20.4]
41142    (5.856, 20.4]
41143    (5.856, 20.4]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.856, 20.4] < (20.4, 34.8] < (34.8, 49.2] < (49.2, 63.6] ... (92.4, 106.8] < (106.8, 121.2] < (121.2, 135.6] < (135.6, 150.0]]

In [None]:
# specific sizes for bin edges
pd.cut(city_mpg, [0, 10, 20, 40, 70, 150])

0        (10, 20]
1         (0, 10]
2        (20, 40]
3         (0, 10]
4        (10, 20]
           ...   
41139    (10, 20]
41140    (10, 20]
41141    (10, 20]
41142    (10, 20]
41143    (10, 20]
Name: city08, Length: 41144, dtype: category
Categories (5, interval[int64, right]): [(0, 10] < (10, 20] < (20, 40] < (40, 70] < (70, 150]]

- By default each bin excludes its left edge but includes its right edge, i.e. the intervals are of the form ``(left, right]``.
- We can bin data with quantiles instead
- If we want 10 bins that have approximately the same number of entries in each bin (rather than each bin having the same width), we can use ``qcut``

In [None]:
pd.qcut(city_mpg, 10)

0         (18.0, 20.0]
1        (5.999, 13.0]
2         (21.0, 24.0]
3        (5.999, 13.0]
4         (16.0, 17.0]
             ...      
41139     (18.0, 20.0]
41140     (18.0, 20.0]
41141     (17.0, 18.0]
41142     (17.0, 18.0]
41143     (15.0, 16.0]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.999, 13.0] < (13.0, 14.0] < (14.0, 15.0] < (15.0, 16.0] ... (18.0, 20.0] < (20.0, 21.0] < (21.0, 24.0] < (24.0, 150.0]]

In [None]:
pd.qcut(city_mpg, 10, labels=list(range(1,11)))

0        7
1        1
2        9
3        1
4        5
        ..
41139    7
41140    7
41141    6
41142    6
41143    4
Name: city08, Length: 41144, dtype: category
Categories (10, int64): [1 < 2 < 3 < 4 ... 7 < 8 < 9 < 10]