# Dummies in Pandas

Given a categorical column, I want to get a data frame in which each column is named for one of the categorical values, and the values in each of those resulting columns will be `True` or `False`.

In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# A small categorical Series to experiment with: one letter per row
s = Series(['a', 'b', 'c', 'b', 'c', 'b', 'c', 'b'])
s

0    a
1    b
2    c
3    b
4    c
5    b
6    c
7    b
dtype: object

In [3]:
# Goal: a data frame with one column per distinct value (a, b, and c), holding True/False
# to show which of those values appeared at each index.

pd.get_dummies(data=s)

Unnamed: 0,a,b,c
0,True,False,False
1,False,True,False
2,False,False,True
3,False,True,False
4,False,False,True
5,False,True,False
6,False,False,True
7,False,True,False


In [4]:
# Read the taxi data from the CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer='taxi.csv')
df.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [5]:
# The column of numeric payment-type codes
df.loc[:, 'payment_type']

0       2
1       1
2       1
3       1
4       2
       ..
9994    2
9995    1
9996    2
9997    2
9998    1
Name: payment_type, Length: 9999, dtype: int64

In [6]:
# How many rides used each payment-type code, most common first
df['payment_type'].value_counts(sort=True, ascending=False)

payment_type
1    5968
2    3991
3      34
4       6
Name: count, dtype: int64

In [7]:
# payment_type codes (per the NYC TLC yellow-taxi data dictionary):
# 1 -- credit card
# 2 -- cash
# 3 -- no charge
# 4 -- dispute

In [8]:
# One True/False column per payment-type code
pd.get_dummies(data=df['payment_type'])

Unnamed: 0,1,2,3,4
0,False,True,False,False
1,True,False,False,False
2,True,False,False,False
3,True,False,False,False
4,False,True,False,False
...,...,...,...,...
9994,False,True,False,False
9995,True,False,False,False
9996,False,True,False,False
9997,False,True,False,False


In [11]:
# Preview the original data with the dummy columns appended on the right
payment_dummies = pd.get_dummies(df['payment_type'])
pd.concat([df, payment_dummies], axis=1)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,1,2,3,4
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,...,0.0,0.5,0.00,0.0,0.3,17.80,False,True,False,False
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,...,0.0,0.5,1.00,0.0,0.3,8.30,True,False,False,False
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,...,0.0,0.5,2.20,0.0,0.3,11.00,True,False,False,False
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,...,0.0,0.5,2.86,0.0,0.3,17.16,True,False,False,False
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,...,0.0,0.5,0.00,0.0,0.3,10.30,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,1,2015-06-01 00:12:59,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,...,0.5,0.5,0.00,0.0,0.3,12.30,False,True,False,False
9995,1,2015-06-01 00:12:59,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,...,0.5,0.5,3.00,0.0,0.3,20.30,True,False,False,False
9996,2,2015-06-01 00:13:00,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,...,0.5,0.5,0.00,0.0,0.3,22.30,False,True,False,False
9997,2,2015-06-01 00:13:02,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,...,0.5,0.5,0.00,0.0,0.3,7.80,False,True,False,False


In [17]:
# Append the dummy columns, naming each one payment_type_<code> so its origin is clear
prefixed_dummies = pd.get_dummies(
    df['payment_type'],
    prefix='payment_type',
    prefix_sep='_',
)
pd.concat([df, prefixed_dummies], axis=1)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type_1,payment_type_2,payment_type_3,payment_type_4
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,...,0.0,0.5,0.00,0.0,0.3,17.80,False,True,False,False
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,...,0.0,0.5,1.00,0.0,0.3,8.30,True,False,False,False
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,...,0.0,0.5,2.20,0.0,0.3,11.00,True,False,False,False
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,...,0.0,0.5,2.86,0.0,0.3,17.16,True,False,False,False
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,...,0.0,0.5,0.00,0.0,0.3,10.30,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,1,2015-06-01 00:12:59,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,...,0.5,0.5,0.00,0.0,0.3,12.30,False,True,False,False
9995,1,2015-06-01 00:12:59,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,...,0.5,0.5,3.00,0.0,0.3,20.30,True,False,False,False
9996,2,2015-06-01 00:13:00,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,...,0.5,0.5,0.00,0.0,0.3,22.30,False,True,False,False
9997,2,2015-06-01 00:13:02,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,...,0.5,0.5,0.00,0.0,0.3,7.80,False,True,False,False


In [20]:
# Keep the dummy columns by assigning the combined frame back to df.
# Any dummy columns left over from an earlier run of this cell are dropped first,
# so re-running it does not append a duplicate set of 1/2/3/4 columns.
payment_dummies = pd.get_dummies(df['payment_type'])
df = pd.concat([df.drop(columns=payment_dummies.columns, errors='ignore'),
                payment_dummies],
               axis='columns')

In [21]:
# Confirm the dummy columns are now part of df
df.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,1,2,3,4
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,...,0.0,0.5,0.0,0.0,0.3,17.8,False,True,False,False
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,...,0.0,0.5,1.0,0.0,0.3,8.3,True,False,False,False
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,...,0.0,0.5,2.2,0.0,0.3,11.0,True,False,False,False
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,...,0.0,0.5,2.86,0.0,0.3,17.16,True,False,False,False
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,...,0.0,0.5,0.0,0.0,0.3,10.3,False,True,False,False


In [22]:
# Going the other way: collapse the dummy columns back into a single categorical column

pd.from_dummies(df.loc[:, [1, 2, 3, 4]])

Unnamed: 0,Unnamed: 1
0,2
1,1
2,1
3,1
4,2
...,...
9994,2
9995,1
9996,2
9997,2
