# Objective : 12. Data Transformation using Map, Apply & GroupBy
<hr>

1. Transforming Series using Map
2. Transforming across multiple Series using apply
3. GroupBy - Splitting, Applying & Combining

<hr>

In [1]:
import pandas as pd
import numpy as np
# Load the HR dataset and rename the misleadingly named 'sales' column to
# 'department' in one chained expression, so the cell yields the same frame
# every time it is re-run and never mutates an already-bound DataFrame.
hr_data = (
    pd.read_csv('../Data/HR_comma_sep.csv.txt')
    .rename(columns={'sales': 'department'})
)

### 1. Transforming Series using Map

In [3]:
# Preview the first five rows to confirm the load and the 'sales' -> 'department' rename.
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


* Use `map` to translate the numeric `left` column into categorical labels

In [5]:
# Series.map looks each value of `left` up in the dict; any value that is
# not a key of the dict would become NaN.
hr_data['left_categorical'] = hr_data['left'].map({
    1: 'True',
    0: 'False',
})

In [6]:
# Preview again to check that the new left_categorical column was appended.
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary,left_categorical
0,0.38,0.53,2,157,3,0,1,0,sales,low,True
1,0.8,0.86,5,262,6,0,1,0,sales,medium,True
2,0.11,0.88,7,272,4,0,1,0,sales,medium,True
3,0.72,0.87,5,223,5,0,1,0,sales,low,True
4,0.37,0.52,2,159,3,0,1,0,sales,low,True


### 2. Transforming data across multiple Series
1. If satisfaction_level > .9, increase number_project by 1
2. `map` operates on a single Series; logic that reads several columns at once needs `DataFrame.apply` with `axis=1`

In [8]:
def increase_proj(r):
    """Return the project count for one row, bumped by one for highly satisfied rows.

    Parameters
    ----------
    r : row-like object
        Must expose ``satisfaction_level`` and ``number_project`` attributes
        (for example a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    ``r.number_project + 1`` when ``r.satisfaction_level`` is strictly greater
    than 0.9, otherwise ``r.number_project`` unchanged.
    """
    return r.number_project + 1 if r.satisfaction_level > .9 else r.number_project

# Row-wise apply: each row is handed to increase_proj, which reads two of its
# columns; the returned values form the new column.
hr_data['new_number_project'] = hr_data.apply(increase_proj, axis='columns')

* Filter the employees whose project count was increased

In [10]:
# Keep only the rows where the derived project count differs from the original one.
hr_data.loc[hr_data['number_project'].ne(hr_data['new_number_project'])].head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary,left_categorical,new_number_project
7,0.92,0.85,5,259,5,0,1,0,sales,low,True,6
106,0.91,1.0,4,257,5,0,1,0,accounting,medium,True,5
191,0.92,0.87,4,226,6,1,1,0,technical,medium,True,5
231,0.92,0.99,5,255,6,0,1,0,sales,low,True,6
352,0.91,0.91,4,262,6,0,1,0,support,low,True,5


### 3. GroupBy

In [17]:
# Group by a single column using the scalar label rather than a one-element
# list: with a length-1 list, pandas 2.x reports group keys as 1-tuples
# (e.g. ('IT',)) when iterating or calling get_group, whereas the scalar form
# keeps plain string keys such as 'IT'.
grouped = hr_data.groupby('department')

* Compute the first, last, and n-th row of each group

In [12]:
# One row per department: the first entry of every column within each group.
grouped.first()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IT,0.11,0.93,7,308,4,0,1,0,medium
RandD,0.12,1.0,3,278,4,0,1,0,medium
accounting,0.41,0.46,2,128,3,0,1,0,low
hr,0.45,0.57,2,134,3,0,1,0,low
management,0.85,0.91,5,226,5,0,1,0,medium
marketing,0.4,0.54,2,137,3,0,1,0,medium
product_mng,0.43,0.54,2,153,3,0,1,0,medium
sales,0.38,0.53,2,157,3,0,1,0,low
support,0.4,0.55,2,147,3,0,1,0,low
technical,0.1,0.94,6,255,4,0,1,0,low


In [13]:
# One row per department: the last entry of every column within each group.
grouped.last()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IT,0.9,0.92,4,271,5,0,1,0,medium
RandD,0.81,0.92,5,239,5,0,1,0,medium
accounting,0.36,0.54,2,153,3,0,1,0,medium
hr,0.4,0.47,2,144,3,0,1,0,medium
management,0.42,0.57,2,147,3,1,1,0,low
marketing,0.44,0.52,2,149,3,0,1,0,low
product_mng,0.46,0.55,2,147,3,0,1,0,medium
sales,0.39,0.45,2,140,3,0,1,0,medium
support,0.37,0.52,2,158,3,0,1,0,low
technical,0.43,0.57,2,159,3,1,1,0,low


In [18]:
# nth is zero-based, so nth(2) picks the third row of each department group.
grouped.nth(2)

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IT,0.36,0.56,2,132,3,0,1,0,medium
RandD,0.37,0.55,2,127,3,0,1,0,medium
accounting,0.09,0.62,6,294,4,0,1,0,low
hr,0.45,0.55,2,140,3,0,1,0,low
management,0.42,0.48,2,129,3,0,1,0,low
marketing,0.11,0.77,6,291,4,0,1,0,low
product_mng,0.76,0.86,5,223,5,1,1,0,medium
sales,0.11,0.88,7,272,4,0,1,0,medium
support,0.4,0.54,2,148,3,0,1,0,low
technical,0.45,0.5,2,126,3,0,1,0,low


In [15]:
# Mapping from each group label to the index labels of the rows in that group.
grouped.groups

{'IT': Int64Index([   61,    62,    63,    64,    65,    70,   138,   139,   140,
               141,
             ...
             14808, 14809, 14810, 14815, 14929, 14930, 14931, 14932, 14933,
             14938],
            dtype='int64', length=1227),
 'RandD': Int64Index([  301,   302,   303,   304,   305,   453,   454,   455,   456,
               457,
             ...
             14816, 14817, 14818, 14819, 14820, 14939, 14940, 14941, 14942,
             14943],
            dtype='int64', length=787),
 'accounting': Int64Index([   28,    29,    30,    79,   105,   106,   107,   155,   181,
               182,
             ...
             14849, 14850, 14851, 14896, 14897, 14898, 14946, 14972, 14973,
             14974],
            dtype='int64', length=767),
 'hr': Int64Index([   31,    32,    33,    34,   108,   109,   110,   111,   184,
               185,
             ...
             14854, 14855, 14899, 14900, 14901, 14902, 14975, 14976, 14977,
             14978],
    

In [18]:
# Grouping by two columns: each key is a (department, salary) tuple mapped to
# the index labels of the rows in that group.
hr_data.groupby(['department','salary']).groups

{('IT',
  'high'): Int64Index([ 1281,  1359,  1437,  1515,  3192,  3193,  3194,  3195,  3200,
              3270,  3504,  3799,  3802,  4260,  4264,  4269,  4720,  5097,
              5098,  5557,  5558,  5559,  5560,  5561,  5634,  5712,  5790,
              5865,  5943,  6024,  6093,  6547,  6550,  7009,  7011,  7012,
              7087,  7474,  7544,  7845,  7846,  7847,  7848,  7849,  7998,
              8076,  8154,  8229,  8307,  8308,  8309,  8314,  8385,  8772,
              8917,  8919,  9375,  9835, 10213, 10593, 10671, 10672, 10673,
             10678, 10747, 10749, 10980, 11268, 11601, 11700, 11707, 12804,
             12882, 12883, 12884, 12889, 12958, 12960, 13191, 13479, 13812,
             13911, 13918],
            dtype='int64'),
 ('IT',
  'low'): Int64Index([  138,   139,   140,   141,   142,   147,   214,   215,   216,
               217,
             ...
             14731, 14732, 14733, 14734, 14806, 14807, 14808, 14809, 14810,
             14815],
            dty

* Selecting a group

In [24]:
# Rebind `grouped` to a two-level grouping; every group is now identified by a
# (department, salary) pair.
grouped = hr_data.groupby(by=['department', 'salary'])

In [27]:
# Pull out a single group as a DataFrame; with two grouping columns the key is
# a (department, salary) tuple. Only the first five rows are shown.
grouped.get_group(('technical','low')).head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
35,0.1,0.94,6,255,4,0,1,0,technical,low
36,0.38,0.46,2,137,3,0,1,0,technical,low
37,0.45,0.5,2,126,3,0,1,0,technical,low
38,0.11,0.89,6,306,4,0,1,0,technical,low
39,0.41,0.54,2,152,3,0,1,0,technical,low


### Aggregation 
* Once the GroupBy object has been created, several methods are available to perform a computation on the grouped data. 
* These operations are similar to the aggregating API, window functions API, and resample API.

In [28]:
# Mean number of projects per (department, salary) group. The string alias
# 'mean' is used instead of np.mean: passing NumPy callables to groupby
# aggregate is deprecated in pandas >= 2.1, and the alias dispatches straight
# to the optimized groupby implementation.
grouped['number_project'].aggregate('mean')

department   salary
IT           high      3.867470
             low       3.794745
             medium    3.833645
RandD        high      3.764706
             low       3.804945
             medium    3.913978
accounting   high      3.905405
             low       3.801676
             medium    3.832836
hr           high      3.888889
             low       3.692537
             medium    3.590529
management   high      3.777778
             low       3.777778
             medium    4.008889
marketing    high      3.425000
             low       3.751244
             medium    3.675532
product_mng  high      3.705882
             low       3.824834
             medium    3.804178
sales        high      3.858736
             low       3.757980
             medium    3.785553
support      high      3.794326
             low       3.787086
             medium    3.825902
technical    high      3.651741
             low       3.910350
             medium    3.878814
Name: number_project

In [36]:
# Mean and max of every numeric column per group.
# - Only numeric columns are selected: the mean of a string column such as
#   left_categorical is undefined, and pandas >= 2.0 raises a TypeError
#   instead of silently dropping such columns.
# - String aliases replace np.mean / np.max, which are deprecated as groupby
#   aggregators in pandas >= 2.1. The result columns are labelled
#   'mean' / 'max' (previously 'mean' / 'amax').
numeric_cols = hr_data.select_dtypes(include='number').columns.tolist()
grouped[numeric_cols].agg(['mean', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,satisfaction_level,satisfaction_level,last_evaluation,last_evaluation,number_project,number_project,average_montly_hours,average_montly_hours,time_spend_company,time_spend_company,Work_accident,Work_accident,left,left,promotion_last_5years,promotion_last_5years
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,amax,mean,amax,mean,amax,mean,amax,mean,amax,mean,amax,mean,amax,mean,amax
department,salary,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
IT,high,0.638193,0.99,0.716627,0.99,3.86747,6,194.927711,275,3.072289,6,0.048193,1,0.048193,1,0.0,0
IT,low,0.610099,1.0,0.715665,1.0,3.794745,7,201.382594,308,3.438424,10,0.146141,1,0.28243,1,0.003284,1
IT,medium,0.624187,1.0,0.718187,1.0,3.833645,7,204.295327,308,3.564486,10,0.13271,1,0.181308,1,0.001869,1
RandD,high,0.586667,0.97,0.700588,0.95,3.764706,6,199.745098,287,3.529412,8,0.176471,1,0.078431,1,0.019608,1
RandD,low,0.623929,1.0,0.714176,1.0,3.804945,7,198.747253,308,3.381868,8,0.195055,1,0.151099,1,0.008242,1
RandD,medium,0.620349,1.0,0.711694,1.0,3.913978,7,202.954301,301,3.330645,6,0.145161,1,0.166667,1,0.061828,1
accounting,high,0.614054,0.97,0.724595,1.0,3.905405,6,205.905405,277,3.216216,8,0.202703,1,0.067568,1,0.081081,1
accounting,low,0.574162,1.0,0.713883,1.0,3.801676,7,199.899441,308,3.438547,10,0.111732,1,0.276536,1,0.005587,1
accounting,medium,0.583642,1.0,0.720299,1.0,3.832836,7,201.465672,310,3.680597,10,0.122388,1,0.298507,1,0.01791,1
hr,high,0.673111,0.99,0.743778,0.99,3.888889,6,209.066667,289,2.911111,6,0.088889,1,0.133333,1,0.044444,1


### Descriptive statistics of grouped data

In [30]:
# Number of rows in each group.
grouped.size()

department   salary
IT           high        83
             low        609
             medium     535
RandD        high        51
             low        364
             medium     372
accounting   high        74
             low        358
             medium     335
hr           high        45
             low        335
             medium     359
management   high       225
             low        180
             medium     225
marketing    high        80
             low        402
             medium     376
product_mng  high        68
             low        451
             medium     383
sales        high       269
             low       2099
             medium    1772
support      high       141
             low       1146
             medium     942
technical    high       201
             low       1372
             medium    1147
dtype: int64

In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column within each group.
grouped.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,satisfaction_level,satisfaction_level,satisfaction_level,satisfaction_level,satisfaction_level,satisfaction_level,satisfaction_level,satisfaction_level,last_evaluation,last_evaluation,...,left,left,promotion_last_5years,promotion_last_5years,promotion_last_5years,promotion_last_5years,promotion_last_5years,promotion_last_5years,promotion_last_5years,promotion_last_5years
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
department,salary,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
IT,high,83.0,0.638193,0.223749,0.15,0.525,0.65,0.78,0.99,83.0,0.716627,...,0.0,1.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IT,low,609.0,0.610099,0.258915,0.09,0.41,0.65,0.82,1.0,609.0,0.715665,...,1.0,1.0,609.0,0.003284,0.05726,0.0,0.0,0.0,0.0,1.0
IT,medium,535.0,0.624187,0.243297,0.09,0.49,0.66,0.81,1.0,535.0,0.718187,...,0.0,1.0,535.0,0.001869,0.043234,0.0,0.0,0.0,0.0,1.0
RandD,high,51.0,0.586667,0.228785,0.1,0.44,0.6,0.745,0.97,51.0,0.700588,...,0.0,1.0,51.0,0.019608,0.140028,0.0,0.0,0.0,0.0,1.0
RandD,low,364.0,0.623929,0.242586,0.09,0.47,0.675,0.82,1.0,364.0,0.714176,...,0.0,1.0,364.0,0.008242,0.090534,0.0,0.0,0.0,0.0,1.0
RandD,medium,372.0,0.620349,0.250293,0.09,0.4775,0.65,0.83,1.0,372.0,0.711694,...,0.0,1.0,372.0,0.061828,0.241167,0.0,0.0,0.0,0.0,1.0
accounting,high,74.0,0.614054,0.237319,0.11,0.5,0.62,0.83,0.97,74.0,0.724595,...,0.0,1.0,74.0,0.081081,0.274823,0.0,0.0,0.0,0.0,1.0
accounting,low,358.0,0.574162,0.25225,0.09,0.4,0.59,0.78,1.0,358.0,0.713883,...,1.0,1.0,358.0,0.005587,0.074639,0.0,0.0,0.0,0.0,1.0
accounting,medium,335.0,0.583642,0.262273,0.09,0.4,0.63,0.8,1.0,335.0,0.720299,...,1.0,1.0,335.0,0.01791,0.132824,0.0,0.0,0.0,0.0,1.0
hr,high,45.0,0.673111,0.250616,0.09,0.55,0.73,0.86,0.99,45.0,0.743778,...,0.0,1.0,45.0,0.044444,0.208409,0.0,0.0,0.0,0.0,1.0


In [4]:
# Back to a single-column grouping. The scalar label is used rather than a
# one-element list so that group keys stay plain strings ('IT', 'hr', ...)
# instead of 1-tuples under pandas 2.x.
grouped = hr_data.groupby('department')

In [40]:
# Named aggregation: each keyword becomes an output column, computed from the
# given source column with the given aggregation function.
grouped.agg(
    mean_projects=pd.NamedAgg(column='number_project', aggfunc='mean'),
    mean_satisfaction=pd.NamedAgg(column='satisfaction_level', aggfunc='mean'),
)

Unnamed: 0_level_0,mean_projects,mean_satisfaction
department,Unnamed: 1_level_1,Unnamed: 2_level_1
IT,3.816626,0.618142
RandD,3.853875,0.619822
accounting,3.825293,0.582151
hr,3.654939,0.598809
management,3.860317,0.621349
marketing,3.687646,0.618601
product_mng,3.807095,0.619634
sales,3.776329,0.614447
support,3.803948,0.6183
technical,3.877941,0.607897


In [16]:
# transform returns a result aligned to the original rows (same index as
# hr_data) rather than one row per group. Only numeric columns are passed in:
# adding 2 to string columns such as salary raises a TypeError in
# pandas >= 2.0, where such columns are no longer dropped silently.
numeric_cols = hr_data.select_dtypes(include='number').columns.tolist()
grouped[numeric_cols].transform(lambda col: col + 2)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
0,2.38,2.53,4,159,5,2,3,2
1,2.80,2.86,7,264,8,2,3,2
2,2.11,2.88,9,274,6,2,3,2
3,2.72,2.87,7,225,7,2,3,2
4,2.37,2.52,4,161,5,2,3,2
5,2.41,2.50,4,155,5,2,3,2
6,2.10,2.77,8,249,6,2,3,2
7,2.92,2.85,7,261,7,2,3,2
8,2.89,3.00,7,226,7,2,3,2
9,2.42,2.53,4,144,5,2,3,2
