In [1]:
import pandas as pd
import numpy as np

## Remove duplicates

In [6]:
# Seven-row demo frame; the final two rows are identical ("two", 4).
data = pd.DataFrame(
    {
        "k1": ["one", "two"] * 3 + ["two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [7]:
# Flag each row that repeats an earlier row across all columns; the first occurrence stays False.
data.duplicated(keep="first")

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [8]:
# Show `data` with fully duplicated rows removed (row 6 repeats row 5, so rows 0-5 remain).
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


### Check for duplicates on a specific column

In [10]:
# Add a column v1 holding 0..n-1. Sizing the range from the frame itself avoids a
# length-mismatch ValueError if the number of rows ever changes.
data["v1"] = range(len(data))
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [11]:
# Judge duplicates on the k1 column alone: every row after the first "one" and the first "two" is flagged.
data.duplicated(subset=["k1"])

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

In [12]:
# Keep only the first row seen for each distinct k1 value (rows 0 and 1).
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


### Keep the last occurrence instead of the first

In [13]:
# Treat rows as duplicates when both k1 and k2 match, and retain the last such row.
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## Transforming Data Using a Function or Mapping

In [14]:
# Nine food items with their weights in ounces.
data = pd.DataFrame(
    {
        "food": [
            "bacon", "pulled pork", "bacon",
            "pastrami", "corned beef", "bacon",
            "pastrami", "honey ham", "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [15]:
# Lookup table from each food name to the animal it comes from.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [16]:
# Add an "animal" column by looking each food up in the meat_to_animal dict.
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [17]:
def get_animal(x):
    """Look up the animal for the food name ``x`` in ``meat_to_animal``.

    Raises ``KeyError`` when ``x`` is not a key of the mapping.
    """
    animal = meat_to_animal[x]
    return animal
    

In [18]:
# Same column as before, this time built by passing a function to map instead of a dict.
data["animal"] = data["food"].map(get_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### Replacing Values

In [19]:
# Float series in which -999 and -1000 act as placeholder values to be replaced.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [20]:
# Swap every -999 for 200; the result is only displayed, `data` is not reassigned.
data.replace(-999, 200)

0       1.0
1     200.0
2       2.0
3     200.0
4   -1000.0
5       3.0
dtype: float64

In [23]:
# Turn the -999 placeholder into a missing value.
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [24]:
# A list of targets sends each listed value (-999 and -1000) to the single replacement NaN.
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [25]:
 # A dict pairs each target with its own replacement: -999 -> NaN, -1000 -> 0.
 data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Renaming Axis Indexes

In [27]:
# 3x4 frame of the integers 0..11 with state names as row labels.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)
def transform(x):
    """Return the first four characters of ``x`` converted to upper case."""
    head = x[:4]
    return head.upper()

In [29]:
# Replace each row label with transform(label): its first four characters, uppercased.
# Gotcha: "New York"[:4] is "New " (with a trailing space), so that label becomes "NEW ".
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [30]:
 # Apply str.title to the row labels and str.upper to the column names.
 # The result is only displayed; it is not assigned back to `data`.
 data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


### Discretization and Binning

In [2]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Bin edges; each bin is open on the left and closed on the right, e.g. (18, 25].
bins = [18, 25, 35, 60, 100]
pd.cut(ages, bins=bins)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

### Give the bins labels

In [4]:
# One label per bin, in the same order as the bin edges.
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

### Pass a number of bins instead of explicit edges

In [5]:
# Seed a local generator so the 20 uniform draws, and the bins derived from them,
# are the same on every Restart & Run All.
rng = np.random.default_rng(12345)
data = rng.uniform(size=20)
# An integer bin count asks cut for 4 equal-width bins over the data's range;
# precision=2 limits the decimal digits used in the bin labels.
pd.cut(data, 4, precision=2)

[(0.45, 0.67], (0.22, 0.45], (0.67, 0.89], (0.22, 0.45], (0.22, 0.45], ..., (0.67, 0.89], (0.22, 0.45], (-0.00027, 0.22], (-0.00027, 0.22], (0.45, 0.67]]
Length: 20
Categories (4, interval[float64, right]): [(-0.00027, 0.22] < (0.22, 0.45] < (0.45, 0.67] < (0.67, 0.89]]

### Quantile-based binning with qcut (quartiles)

In [7]:
# Seed a local generator so the 50 normal draws and their quartile bins are reproducible.
rng = np.random.default_rng(12345)
data = rng.standard_normal(50)
# qcut with an integer splits at sample quantiles (here quartiles), so the bins hold
# roughly equal numbers of points; precision=2 limits the digits in the bin labels.
quartiles = pd.qcut(data, 4, precision=2)
quartiles

[(-2.8699999999999997, -0.29], (-2.8699999999999997, -0.29], (0.41, 0.74], (-2.8699999999999997, -0.29], (0.41, 0.74], ..., (0.41, 0.74], (0.41, 0.74], (-0.29, 0.41], (-0.29, 0.41], (0.74, 3.13]]
Length: 50
Categories (4, interval[float64, right]): [(-2.8699999999999997, -0.29] < (-0.29, 0.41] < (0.41, 0.74] < (0.74, 3.13]]

In [10]:
 # Count how many of the values fell into each quartile bin.
 pd.Series(quartiles).value_counts()

(-2.8699999999999997, -0.29]    13
(0.74, 3.13]                    13
(-0.29, 0.41]                   12
(0.41, 0.74]                    12
Name: count, dtype: int64

In [11]:
 # Pass explicit quantile edges (0, 0.1, 0.5, 0.9, 1) instead of a bin count, then tally each bin.
 pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-2.8609999999999998, -0.733]     5
(-0.733, 0.407]                  20
(0.407, 1.265]                   20
(1.265, 3.125]                    5
Name: count, dtype: int64

### Detecting and Filtering Outliers

In [12]:
# Seed a local generator so the 1000x4 standard-normal sample, and therefore the
# outliers found in it, are the same on every run.
rng = np.random.default_rng(12345)
data = pd.DataFrame(rng.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.032639,-0.030564,0.043867,-0.035133
std,0.984288,1.002682,0.967754,1.040314
min,-2.568208,-3.358193,-3.292675,-3.053239
25%,-0.653998,-0.692705,-0.635994,-0.711245
50%,0.074729,-0.05824,0.005708,-0.067432
75%,0.739909,0.627498,0.726701,0.655288
max,3.075438,3.650662,2.747222,3.345053


#### To select all rows that have a value greater than 3 or less than -3, use the `any` method on a Boolean DataFrame:

In [13]:
# Keep the rows in which at least one column has an absolute value greater than 3.
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
208,0.921254,3.650662,-3.292675,-1.230424
251,0.62778,-3.358193,-0.178761,1.665518
313,0.660253,-0.530416,0.707482,3.151251
518,3.075438,0.000413,-0.610002,-0.681865
747,-1.51767,2.152462,0.578336,3.086494
888,0.872306,0.811687,0.397043,-3.053239
932,0.485397,-1.250501,1.199538,3.345053


#### The outlying values can then be capped (winsorized) to the interval [-3, 3]:

In [15]:
# Winsorize: cap every value to the closed interval [-3, 3]. Values already inside
# the interval are left unchanged, and the frame is rebound rather than mutated
# through a boolean mask.
data = data.clip(lower=-3, upper=3)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.032563,-0.030857,0.04416,-0.035663
std,0.984057,0.99937,0.966787,1.038385
min,-2.568208,-3.0,-3.0,-3.0
25%,-0.653998,-0.692705,-0.635994,-0.711245
50%,0.074729,-0.05824,0.005708,-0.067432
75%,0.739909,0.627498,0.726701,0.655288
max,3.0,3.0,2.747222,3.0


### Permutation and Random Sampling

In [16]:
# 5x7 frame holding the integers 0..34 in row-major order.
df = pd.DataFrame(np.arange(35).reshape(5, 7))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [25]:
# Seed a local generator so the random row order is reproducible across runs.
rng = np.random.default_rng(12345)
# A random ordering of the integers 0..4, used as positional row indices.
sampler = rng.permutation(5)
sampler

array([4, 2, 1, 3, 0])

In [29]:
# Reorder the rows of df by position, following the order given in sampler.
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13
3,21,22,23,24,25,26,27
0,0,1,2,3,4,5,6


In [31]:
# The same positional row selection, written with iloc.
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13
3,21,22,23,24,25,26,27
0,0,1,2,3,4,5,6


In [35]:
# Seed a local generator so the random column order is reproducible across runs.
rng = np.random.default_rng(12345)
# A random ordering of the integers 0..6, used as positional column indices.
column_sampler = rng.permutation(7)
df.take(column_sampler, axis="columns")

Unnamed: 0,2,6,5,3,4,0,1
0,2,6,5,3,4,0,1
1,9,13,12,10,11,7,8
2,16,20,19,17,18,14,15
3,23,27,26,24,25,21,22
4,30,34,33,31,32,28,29


### Computing Indicator/Dummy Variables

#### Dummies

In [39]:
df = pd.DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "b"],
        "data1": range(6),
    }
)
# One indicator column per distinct value of "key", named key_<value>.
pd.get_dummies(df["key"], prefix="key")

Unnamed: 0,key_a,key_b,key_c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [40]:
# Keep data1 and attach the key indicator columns alongside it.
df_with_dummy = df[["data1"]].join(
    pd.get_dummies(df["key"], prefix="key")
)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,False,True,False
1,1,False,True,False
2,2,True,False,False
3,3,False,False,True
4,4,True,False,False
5,5,False,True,False


#### If a row in a DataFrame belongs to multiple categories, we have to use a different approach to create the dummy variables

In [44]:
# Column names for the header-less, "::"-delimited movies file.
mnames = ["movie_id", "title", "genres"]
# A multi-character separator requires the Python parsing engine.
movies = pd.read_table(
    "https://raw.githubusercontent.com/wesm/pydata-book/3rd-edition/datasets/movielens/movies.dat",
    sep="::",
    header=None,
    names=mnames,
    engine="python",
)
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [46]:
# Split each genres string on "|" and build one 0/1 indicator column per distinct genre.
dummies = movies["genres"].str.get_dummies("|")
# Preview the first ten rows and the first six genre columns.
dummies.iloc[:10, :6]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime
0,0,0,1,1,1,0
1,0,1,0,1,0,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0
5,1,0,0,0,0,1
6,0,0,0,0,1,0
7,0,1,0,1,0,0
8,1,0,0,0,0,0
9,1,1,0,0,0,0


In [47]:
# Prefix every indicator column with "Genre_" and attach the columns to the movies frame.
movies_windic = movies.join(dummies.add_prefix("Genre_"))
movies_windic

Unnamed: 0,movie_id,title,genres,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Crime,Genre_Documentary,...,Genre_Fantasy,Genre_Film-Noir,Genre_Horror,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
