# DATA TRANSFORMATION
## REMOVING DUPLICATES

In [1]:
import pandas as pd 
import numpy as np 
from pandas import Series, DataFrame


In [2]:
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [3]:
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [4]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


## TRANSFORMING DATA USING FUNCTIONS or MAPPING

In [5]:
data = DataFrame({'food': ['a','b','c','d'], 
                  'weight': [2,4,6,8]})
data

Unnamed: 0,food,weight
0,a,2
1,b,4
2,c,6
3,d,8


In [6]:
food_to_code = {'a': '001A', 'b': '001b', 'c': '001C', 'd': '001d'}
data['code'] = data['food'].map(food_to_code)
data

Unnamed: 0,food,weight,code
0,a,2,001A
1,b,4,001b
2,c,6,001C
3,d,8,001d


In [7]:
data['food'].map(lambda x: food_to_code[x])

0    001A
1    001b
2    001C
3    001d
Name: food, dtype: object

## REPLACE VALUES

In [8]:
data = Series([1., -999., 999., 2.])
data.replace([-999,999], np.nan)

0    1.0
1    NaN
2    NaN
3    2.0
dtype: float64

## DISCRETIONIZATION and BINNING

In [9]:
ages = [20,22,25,27,21,23,27,37,31,62,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 13
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [10]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     4
(35, 60]     3
(60, 100]    1
dtype: int64

In [11]:
groups = ['a','b','c','d']
pd.cut(ages, bins, labels=groups)

[a, a, a, b, a, ..., b, d, c, c, b]
Length: 13
Categories (4, object): [a < b < c < d]

In [12]:
#pd.cut?

In [13]:
data = np.random.randn(10)

In [14]:
data1 = pd.cut(data, 5, precision=2)
pd.value_counts(data1)

(-0.4, 0.37]      4
(-1.93, -1.16]    3
(1.13, 1.9]       1
(0.37, 1.13]      1
(-1.16, -0.4]     1
dtype: int64

In [15]:
data2 = pd.qcut(data, 5, precision=2)
pd.value_counts(data2)


(0.44, 1.9]       2
(0.026, 0.44]     2
(-0.47, 0.026]    2
(-1.49, -0.47]    2
(-1.94, -1.49]    2
dtype: int64

## OUTLIERS

In [16]:
np.random.seed(12345) # next line generates the same random numbers
data = DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [17]:
data.head()

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [18]:
col = data[3]
col[np.abs(col) > 3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

In [19]:
# show any index with values > abs(3)
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [20]:
data[np.abs(data) > 3] = np.sign(data)
np.sign(data)

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,-1.0
3,1.0,1.0,1.0,1.0
4,-1.0,-1.0,1.0,-1.0
...,...,...,...,...
995,1.0,1.0,-1.0,1.0
996,-1.0,-1.0,-1.0,-1.0
997,-1.0,1.0,1.0,1.0
998,-1.0,-1.0,-1.0,-1.0


In [21]:
# dataframe cleared of values > abs(3)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.070473,0.023153,-8.1e-05
std,0.987408,0.986061,0.983883,0.977526
min,-2.969411,-2.989741,-2.925113,-2.881858
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,2.666744,2.653656,2.954439,2.735527
