## Dataframe column transformations

* change the type of a column (int to str, and back)
* convert columns to category
* create a new column from n characters of another column
* combine two columns into a new column


In [1]:
import pandas as pd
# Sample data: one (parent_id, child_id) pair per row.
id_pairs = [
    (3111, 4321), (2010, 3102), (3000, 4023), (1000, 2010), (4023, 5321),
    (3011, 4200), (3033, 4113), (5010, 6525), (3011, 4010), (3102, 4001),
    (2010, 3011), (4023, 5010), (2110, 3000), (2100, 3033), (1000, 2110),
    (5010, 6100), (2110, 3111), (1000, 2100), (5010, 6016), (3033, 4311),
]
df = pd.DataFrame(id_pairs, columns=['parent_id', 'child_id'])

df.head()

Unnamed: 0,parent_id,child_id
0,3111,4321
1,2010,3102
2,3000,4023
3,1000,2010
4,4023,5321


In [2]:
# Inspect the column dtypes; both id columns start out as int64.
df.dtypes

parent_id    int64
child_id     int64
dtype: object

## 1. change type of a column
* int to str
* str to int

In [3]:
# Cast the integer ids to strings.
df['parent_id'] = df['parent_id'].astype(str)

In [4]:
# parent_id is now reported as `object`, the dtype shown here for string values.
df.dtypes

parent_id    object
child_id      int64
dtype: object

In [5]:
# Cast the string ids back to integers.
df['parent_id'] = df['parent_id'].astype(int)

In [6]:
# Confirm that parent_id is back to int64.
df.dtypes

parent_id    int64
child_id     int64
dtype: object

In [7]:
# Summary statistics; both columns are numeric at this point, so both are summarised.
df.describe()

Unnamed: 0,parent_id,child_id
count,20.0,20.0
mean,2885.85,3971.4
std,1263.501787,1327.918014
min,1000.0,2010.0
25%,2077.5,3027.5
50%,3011.0,4016.5
75%,3339.0,4493.25
max,5010.0,6525.0


## 2. convert column to a category

Two reasons to do this:
* performance - a column with a small number of distinct values (lots of repetition) is stored and processed more efficiently as a category
* sorting - when the lexical order of the values is not the same as their logical order

In [8]:
# Convert parent_id to the categorical dtype.
df['parent_id'] = df['parent_id'].astype('category')

In [9]:
# parent_id is now reported as `category`; child_id is still int64.
df.dtypes

parent_id    category
child_id        int64
dtype: object

In [10]:
# By default describe() summarises only numeric columns when any are present,
# so the categorical parent_id no longer appears in the result.
df.describe()

Unnamed: 0,child_id
count,20.0
mean,3971.4
std,1327.918014
min,2010.0
25%,3027.5
50%,4016.5
75%,4493.25
max,6525.0


In [11]:
# Convert child_id to the categorical dtype as well.
df['child_id'] = df['child_id'].astype('category')

In [12]:
# The categories are the distinct child_id values, listed in sorted order.
df['child_id'].cat.categories

Int64Index([2010, 2100, 2110, 3000, 3011, 3033, 3102, 3111, 4001, 4010, 4023,
            4113, 4200, 4311, 4321, 5010, 5321, 6016, 6100, 6525],
           dtype='int64')

In [13]:
# The categories are the distinct parent_id values, listed in sorted order
# (fewer entries than rows, because parent ids repeat).
df['parent_id'].cat.categories

Int64Index([1000, 2010, 2100, 2110, 3000, 3011, 3033, 3102, 3111, 4023, 5010], dtype='int64')

In [None]:
# Convert both id columns from category back to plain integers so the
# following sections work with numeric values again.
for column in ['parent_id', 'child_id']:
    # Assign back to the same column that is being converted.
    df[column] = df[column].astype('int')

## 3. create a new column from n characters of another column

* get last n characters
* get first n characters
* get n characters from the middle

In [14]:
# Last three characters of parent_id. The vectorised .str accessor slices the
# whole column at once instead of building a row object per record.
df['parent_id_last'] = df['parent_id'].astype(str).str[-3:]

In [15]:
# Preview the frame with the new parent_id_last column.
df.head()

Unnamed: 0,parent_id,child_id,parent_id_last
0,3111,4321,111
1,2010,3102,10
2,3000,4023,0
3,1000,2010,0
4,4023,5321,23


In [16]:
# First two characters of parent_id, sliced column-wise via the .str accessor.
df['parent_id_first'] = df['parent_id'].astype(str).str[:2]

In [17]:
# Preview the frame with the new parent_id_first column.
df.head()

Unnamed: 0,parent_id,child_id,parent_id_last,parent_id_first
0,3111,4321,111,31
1,2010,3102,10,20
2,3000,4023,0,30
3,1000,2010,0,10
4,4023,5321,23,40


In [19]:
# Second and third characters (slice [1:3]) of the id string.
# NOTE(review): the column is named parent_id_middle but is derived from
# child_id - confirm which was intended, then rename the column or switch
# the source column and re-run the cells below.
df['parent_id_middle'] = df['child_id'].astype(str).str[1:3]

In [20]:
# Preview the new column; its values are the middle digits of child_id
# (e.g. 4321 -> 32), not of parent_id.
df.head()

Unnamed: 0,parent_id,child_id,parent_id_last,parent_id_first,parent_id_middle
0,3111,4321,111,31,32
1,2010,3102,10,20,10
2,3000,4023,0,30,2
3,1000,2010,0,10,1
4,4023,5321,23,40,32


## 4. combine two columns into another column

In [21]:
# Concatenate the two ids as text (3111 and 4321 -> '31114321') with
# column-wise string addition rather than a row-by-row apply.
df['combined'] = df['parent_id'].astype(str) + df['child_id'].astype(str)

In [22]:
# 'combined' holds the two ids joined as text (e.g. 3111 and 4321 -> 31114321).
df.head()

Unnamed: 0,parent_id,child_id,parent_id_last,parent_id_first,parent_id_middle,combined
0,3111,4321,111,31,32,31114321
1,2010,3102,10,20,10,20103102
2,3000,4023,0,30,2,30004023
3,1000,2010,0,10,1,10002010
4,4023,5321,23,40,32,40235321


In [23]:
# Numeric sum of the two ids, stored as text (3111 + 4321 -> '7432').
# This overwrites the 'combined' column created by the previous cell.
# Both columns are cast to int first so the addition also works if they are
# still categorical, which does not support `+`.
df['combined'] = (df['parent_id'].astype(int) + df['child_id'].astype(int)).astype(str)

In [24]:
# 'combined' now holds the sum of the two ids (e.g. 3111 + 4321 -> 7432)
# instead of their concatenation.
df.head()

Unnamed: 0,parent_id,child_id,parent_id_last,parent_id_first,parent_id_middle,combined
0,3111,4321,111,31,32,7432
1,2010,3102,10,20,10,5112
2,3000,4023,0,30,2,7023
3,1000,2010,0,10,1,3010
4,4023,5321,23,40,32,9344


In [25]:
def comb(x, y):
    """Return the string forms of ``x`` and ``y`` joined end to end.

    Both arguments are converted with ``str()`` first, so ``comb(3111, 4321)``
    gives ``'31114321'`` rather than a numeric sum.
    """
    return ''.join((str(x), str(y)))

# Apply comb() to each (parent_id, child_id) pair. Zipping the two columns
# avoids constructing a full row object per record as df.apply(axis=1) does.
df['comb'] = [
    comb(parent, child)
    for parent, child in zip(df['parent_id'], df['child_id'])
]

In [26]:
# 'comb' holds the concatenated ids produced by comb(); 'combined' still
# holds the numeric sum from the earlier cell.
df.head()

Unnamed: 0,parent_id,child_id,parent_id_last,parent_id_first,parent_id_middle,combined,comb
0,3111,4321,111,31,32,7432,31114321
1,2010,3102,10,20,10,5112,20103102
2,3000,4023,0,30,2,7023,30004023
3,1000,2010,0,10,1,3010,10002010
4,4023,5321,23,40,32,9344,40235321
