# UNIT5: Advanced Pandas


In [2]:
import pandas as pd
import numpy as np
import scipy.stats
import seaborn as sns

# Categorical Data: string-typed data whose values fall into a small set of categories, e.g. Low/Medium/High

* First/Second/Third/Fail : Class ranking
## Storing categorical values as strings consumes a lot of memory. pandas therefore provides a dedicated categorical type that optimizes memory usage and can give significant performance improvements when a column holds a large number of repeated category values.

In [63]:
# Ten class rankings held as ordinary Python strings: five labels repeated twice.
values = pd.Series(2 * ['First', 'Second', 'First', 'Second', 'Third'])
print(values)

0     First
1    Second
2     First
3    Second
4     Third
5     First
6    Second
7     First
8    Second
9     Third
dtype: object


In [147]:
# Distinct labels in order of first appearance.
# NOTE(review): the saved output below shows integer codes, i.e. it was produced after
# `values` was rebound to the integer-code Series further down; re-run top to bottom.
values.unique()

array([0, 2, 1], dtype=int64)

In [148]:
# Frequency of each distinct label.
# The Series method replaces the top-level pd.value_counts(), which is deprecated in
# recent pandas releases and emits a FutureWarning.
values.value_counts()

0    4
2    4
1    2
dtype: int64

In [156]:
# `grade` is the lookup table of distinct labels; `values` holds, for each observation,
# the integer position of its label inside `grade`.
grade = pd.Series(['First', 'Second', 'Third'])
values = pd.Series(2 * [1, 2, 0, 1, 2])
print(values)
len(grade)

0    1
1    2
2    0
3    1
4    2
5    1
6    2
7    0
8    1
9    2
dtype: int64


3

## Many data systems (for data warehousing, statistical computing, or model building for recommendations) have developed specialized approaches for representing data with repeated values, for more efficient storage and computation.
* In data warehousing, a best practice is to use so-called dimension tables that contain the distinct values, and to store the primary observations as integer keys referencing the dimension table.

### take(): restores the original series of values from the integer-encoded categories
* This is the categorical (or dictionary-encoded) representation: each observation is stored as an integer code, and the array of distinct values is called the categories, dictionary, or levels of the data.

### can also perform transformations on the categories while leaving the codes unmodified. 
### Some example transformations that can be made at relatively low cost are:
*  Renaming categories
* Appending a new category without changing the order or position of the existing categories

In [157]:
# Use every integer code in `values` as a position into `grade`, which yields the label
# for each observation (the result keeps the index labels of the rows taken from `grade`).
grade.take(values)

1    Second
2     Third
0     First
1    Second
2     Third
1    Second
2     Third
0     First
1    Second
2     Third
dtype: object

## Example of categorical type in Pandas

In [153]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [158]:
TitanicDF=sns.load_dataset("titanic")
TitanicDF.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

## Description of all columns

In [159]:
TitanicDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [160]:
TitanicDF.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [161]:
TitanicDF.describe(include='all')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
count,891.0,891.0,891,714.0,891.0,891.0,891.0,889,891,891,891,203,889,891,891
unique,,,2,,,,,3,3,3,2,7,3,2,2
top,,,male,,,,,S,Third,man,True,C,Southampton,no,True
freq,,,577,,,,,644,491,537,537,59,644,549,537
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,,,,,,,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,,,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,,,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,,,,,,,,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,,,,,,,,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,,,,,,,,


## Taking a few categorical attributes

In [162]:
Titanicnew=TitanicDF[['deck','embarked','class','embark_town']].copy()
Titanicnew

Unnamed: 0,deck,embarked,class,embark_town
0,,S,Third,Southampton
1,C,C,First,Cherbourg
2,,S,Third,Southampton
3,C,S,First,Southampton
4,,S,Third,Southampton
...,...,...,...,...
886,,S,Second,Southampton
887,B,S,First,Southampton
888,,S,Third,Southampton
889,C,C,First,Cherbourg


In [165]:
Titanicnew.embark_town.unique()

array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)

In [166]:
Titanicnew.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

## Two ways to convert a column to categorical type
1. Explicitly store the categorical Series in a variable, then add it as a new column
2. Assign the converted column directly as a categorical type

In [167]:
# Way 1: build the categorical Series first, keep it in a variable,
# then attach it to the working frame as a new column.
embarked_cat = TitanicDF.embark_town.astype('category')
Titanicnew['Embarked_cat'] = embarked_cat
Titanicnew

Unnamed: 0,deck,embarked,class,embark_town,Embarked_cat
0,,S,Third,Southampton,Southampton
1,C,C,First,Cherbourg,Cherbourg
2,,S,Third,Southampton,Southampton
3,C,S,First,Southampton,Southampton
4,,S,Third,Southampton,Southampton
...,...,...,...,...,...
886,,S,Second,Southampton,Southampton
887,B,S,First,Southampton,Southampton
888,,S,Third,Southampton,Southampton
889,C,C,First,Cherbourg,Cherbourg


In [168]:
Titanicnew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   deck          203 non-null    category
 1   embarked      889 non-null    object  
 2   class         891 non-null    category
 3   embark_town   889 non-null    object  
 4   Embarked_cat  889 non-null    category
dtypes: category(3), object(2)
memory usage: 17.3+ KB


In [169]:
len(embarked_cat)

891

In [80]:
TitanicDF.embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [170]:
categ1 = embarked_cat.values
type(categ1)

pandas.core.arrays.categorical.Categorical

In [171]:
categ1

['Southampton', 'Cherbourg', 'Southampton', 'Southampton', 'Southampton', ..., 'Southampton', 'Southampton', 'Southampton', 'Cherbourg', 'Queenstown']
Length: 891
Categories (3, object): ['Cherbourg', 'Queenstown', 'Southampton']

## Distinct values in the category object

In [172]:
categ1.categories

Index(['Cherbourg', 'Queenstown', 'Southampton'], dtype='object')

## Encoding of all values in the embarked column

In [173]:
np.unique(categ1.codes)

array([-1,  0,  1,  2], dtype=int8)

In [84]:
len(categ1.codes)

891

In [174]:
TitanicDF.embark_town.isnull().sum()

2

In [175]:
Titanicnew

Unnamed: 0,deck,embarked,class,embark_town,Embarked_cat
0,,S,Third,Southampton,Southampton
1,C,C,First,Cherbourg,Cherbourg
2,,S,Third,Southampton,Southampton
3,C,S,First,Southampton,Southampton
4,,S,Third,Southampton,Southampton
...,...,...,...,...,...
886,,S,Second,Southampton,Southampton
887,B,S,First,Southampton,Southampton
888,,S,Third,Southampton,Southampton
889,C,C,First,Cherbourg,Cherbourg


## Second way: assign the categorical column to the DataFrame directly, converting a column taken from another DataFrame

In [176]:
Titanicnew['Embarked_catnew'] = TitanicDF['embarked'].astype('category')

## create pandas.Categorical directly from other types of Python sequences say list

In [177]:
# Materialise the column as a plain Python list, then let pd.Categorical
# infer the set of categories from that sequence.
L = Titanicnew['embarked'].tolist()
new_categories = pd.Categorical(L)
new_categories

['S', 'C', 'S', 'S', 'S', ..., 'S', 'S', 'S', 'C', 'Q']
Length: 891
Categories (3, object): ['C', 'Q', 'S']

In [89]:
new_categories.categories

Index(['C', 'Q', 'S'], dtype='object')

## Note: NaN is never one of the categories; values that are NaN are instead given the code -1. If these codes are reused to generate new category labels for another column, -1 again results in NaN.

In [178]:
new_categories.codes

array([ 2,  0,  2,  2,  2,  1,  2,  2,  2,  0,  2,  2,  2,  2,  2,  2,  1,
        2,  2,  0,  2,  2,  1,  2,  2,  2,  0,  2,  1,  2,  0,  0,  1,  2,
        0,  2,  0,  2,  2,  0,  2,  2,  0,  0,  1,  2,  1,  1,  0,  2,  2,
        2,  0,  2,  0,  2,  2,  0,  2,  2,  0, -1,  2,  2,  0,  0,  2,  2,
        2,  2,  2,  2,  2,  0,  2,  2,  2,  2,  2,  2,  2,  2,  1,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  0,  0,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  1,  2,  0,  2,  2,  0,  2,  1,  2,  0,
        2,  2,  2,  0,  2,  2,  0,  1,  2,  0,  2,  0,  2,  2,  2,  2,  0,
        2,  2,  2,  0,  0,  2,  2,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  0,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  1,  2,  2,  0,  2,  2,  0,  2,  2,  2,  0,  2,  2,  2,  2,  1,
        2,  1,  2,  2,  2,  2,  2,  0,  0,  1,  2,  1,  2,  2,  2,  2,  0,
        2,  2,  2,  0,  1,  0,  2,  2,  2,  2,  1,  0,  2,  2,  0,  2,  2,
        2,  2,  2,  2,  2

## Suppose new categorical values are to be assigned to the column

In [179]:
# Reuse the existing integer codes but attach a different list of labels to them:
# code 0 -> 'A', 1 -> 'Z', 2 -> 'X', 3 -> 'Y'.
categories = ['A', 'Z', 'X', 'Y']
codes = new_categories.codes
new_categories_1 = pd.Categorical.from_codes(codes, categories=categories)
new_categories_1

['X', 'A', 'X', 'X', 'X', ..., 'X', 'X', 'X', 'A', 'Z']
Length: 891
Categories (4, object): ['A', 'Z', 'X', 'Y']

In [180]:
TitanicDF['new1']=new_categories_1

In [181]:
new_categories_1.unique()

['X', 'A', 'Z', NaN]
Categories (3, object): ['X', 'A', 'Z']

In [182]:
TitanicDF['new1'].isnull().sum()

2

In [183]:
TitanicDF['new1'].dtype

CategoricalDtype(categories=['A', 'Z', 'X', 'Y'], ordered=False)

## Ordinal attributes: by default, categorical conversions assume no specific ordering of the categories. When using from_codes or any of the other constructors, we can indicate that the categories have a meaningful order by passing ordered=True. E.g. Fail/Third/Second/First has an order.

In [184]:
# Labels are listed from lowest to highest rank; ordered=True makes that
# listing order the comparison order of the categorical.
categories = ['Fail', 'Third', 'Second', 'First']
codes = [3, 2, 1, 0, 0, 0, 1, 3]
ordered_cat = pd.Categorical.from_codes(
    codes,
    categories=categories,
    ordered=True,
)
pd.DataFrame(ordered_cat, columns=['new'])

Unnamed: 0,new
0,First
1,Second
2,Third
3,Fail
4,Fail
5,Fail
6,Third
7,First


In [105]:
ordered_cat.categories

Index(['Fail', 'Third', 'Second', 'First'], dtype='object')

In [106]:
ordered_cat

['First', 'Second', 'Third', 'Fail', 'Fail', 'Fail', 'Third', 'First']
Categories (4, object): ['Fail' < 'Third' < 'Second' < 'First']

# advanced Group by

## What is the following doing?

In [186]:
example1=TitanicDF[['embarked','fare','class']].copy()

In [187]:
example1['embarked']= example1['embarked'].fillna('C')

In [188]:
example1.isna().sum()

embarked    0
fare        0
class       0
dtype: int64

In [111]:
len(example1.fare)

891

In [112]:
fare1 = example1.groupby('embarked').fare
fare1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002E8E26AAD90>

In [189]:
fare1.mean()

embarked
C    60.189978
Q    13.276030
S    27.079812
Name: fare, dtype: float64

In [190]:
# Iterating the grouped object yields one (group key, Series of that group's fares) pair per group.
for port, fares in fare1:
    print(port, fares)

C 1      71.2833
9      30.0708
19      7.2250
26      7.2250
30     27.7208
        ...   
866    13.8583
874    24.0000
875     7.2250
879    83.1583
889    30.0000
Name: fare, Length: 170, dtype: float64
Q 5       8.4583
16     29.1250
22      8.0292
28      7.8792
32      7.7500
        ...   
790     7.7500
825     6.9500
828     7.7500
885    29.1250
890     7.7500
Name: fare, Length: 77, dtype: float64
S 0       7.2500
2       7.9250
3      53.1000
4       8.0500
6      51.8625
        ...   
883    10.5000
884     7.0500
886    13.0000
887    30.0000
888    23.4500
Name: fare, Length: 644, dtype: float64


In [191]:
fare1.value_counts().sum()

891

## replacing each value in a group by its aggregate value using transform()

In [192]:
fare1.transform(lambda x: x.mean()).head(10)

0    27.079812
1    60.189978
2    27.079812
3    27.079812
4    27.079812
5    13.276030
6    27.079812
7    27.079812
8    27.079812
9    60.189978
Name: fare, dtype: float64

## same output as above!!

In [193]:
fare1.transform('mean').head(10)

0    27.079812
1    60.189978
2    27.079812
3    27.079812
4    27.079812
5    13.276030
6    27.079812
7    27.079812
8    27.079812
9    60.189978
Name: fare, dtype: float64

In [117]:
fare1.transform(lambda x: x * 2)

0       14.5000
1      142.5666
2       15.8500
3      106.2000
4       16.1000
         ...   
886     26.0000
887     60.0000
888     46.9000
889     60.0000
890     15.5000
Name: fare, Length: 891, dtype: float64

## Computing ranks

In [194]:
# Toy frame: three keys repeated four times, each paired with a random integer in [1, 15).
# A seeded generator makes the values, and every statistic derived from them in the
# cells below, reproducible after Restart Kernel -> Run All.
rng = np.random.default_rng(42)
df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,
                   'value': rng.integers(1, 15, size=12)})
df

Unnamed: 0,key,value
0,a,7
1,b,5
2,c,6
3,a,1
4,b,11
5,c,5
6,a,1
7,b,10
8,c,13
9,a,14


In [195]:
# Group the `value` column by `key`; `g` is reused by the cells that follow.
g = df.groupby('key')['value']
g.mean()

key
a    5.75
b    8.00
c    8.00
Name: value, dtype: float64

In [196]:
g.std()

key
a    6.184658
b    2.943920
c    3.559026
Name: value, dtype: float64

In [197]:
df.value.sort_values()

3      1
6      1
1      5
5      5
2      6
10     6
0      7
11     8
7     10
4     11
8     13
9     14
Name: value, dtype: int32

In [198]:
df.value.rank()

0      7.0
1      3.5
2      5.5
3      1.5
4     10.0
5      3.5
6      1.5
7      9.0
8     11.0
9     12.0
10     5.5
11     8.0
Name: value, dtype: float64

## How to access elements in each group

In [199]:
# Each iteration gives the group key and the Series of values belonging to that group.
for group_key, group_values in g:
    print(group_key, "\n", group_values)

a 
 0     7
3     1
6     1
9    14
Name: value, dtype: int32
b 
 1      5
4     11
7     10
10     6
Name: value, dtype: int32
c 
 2      6
5      5
8     13
11     8
Name: value, dtype: int32


## getting ranking of values within each group

In [200]:
# Rank of each value within its own group (ties share the average rank).
# The built-in groupby rank keeps the original row index; routing it through
# apply(lambda ...) prepends the group key to the index on pandas >= 2.0.
g.rank()

0     3.0
1     1.0
2     2.0
3     1.5
4     4.0
5     1.0
6     1.5
7     3.0
8     4.0
9     4.0
10    2.0
11    3.0
Name: value, dtype: float64

In [201]:
# Descending rank within each group: the largest value of a group gets rank 1.
# The built-in groupby rank keeps the original row index; routing it through
# apply(lambda ...) prepends the group key to the index on pandas >= 2.0.
g.rank(ascending=False)

0     2.0
1     4.0
2     3.0
3     3.5
4     1.0
5     4.0
6     3.5
7     2.0
8     1.0
9     1.0
10    3.0
11    2.0
Name: value, dtype: float64

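## More examples of transform: normalizing values within each group
Note: `g.transform(normalize)` and `g.apply(normalize)` compute the same thing. The saved outputs below differ only because `df` holds random values and the two cells were executed at different times (see their execution counts).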

In [138]:
def normalize(x):
    """Return the z-scores of ``x``: subtract its mean, then divide by its standard deviation.

    ``x`` only needs to support ``.mean()``, ``.std()`` and element-wise arithmetic;
    in this notebook it is the Series of values of a single group.
    """
    centered = x - x.mean()
    return centered / x.std()

In [202]:
g.transform(normalize)


0     0.202113
1    -1.019049
2    -0.561951
3    -0.768029
4     1.019049
5    -0.842927
6    -0.768029
7     0.679366
8     1.404879
9     1.333946
10   -0.679366
11    0.000000
Name: value, dtype: float64

In [141]:
# apply() gives the same per-group z-scores as transform(normalize).
# group_keys=False keeps the result on the original row index; with the pandas >= 2.0
# default, apply would prepend the group key and return a MultiIndex instead.
df.groupby('key', group_keys=False).value.apply(normalize)

0     0.210042
1    -0.475293
2    -1.276444
3    -1.470294
4    -0.855528
5    -0.303915
6     0.630126
7     1.425880
8     0.668614
9     0.630126
10   -0.095059
11    0.911746
Name: value, dtype: float64

## Unwrapped group operation: combine each data element with the aggregate of its group (obtained via transform) using ordinary vectorized arithmetic

In [142]:
g.transform('mean')

0     9.00
1     9.25
2     7.25
3     9.00
4     9.25
5     7.25
6     9.00
7     9.25
8     7.25
9     9.00
10    9.25
11    7.25
Name: value, dtype: float64

In [128]:
# Same per-group z-score, assembled from two built-in group aggregates that are
# broadcast back to the row level, instead of calling a Python function per group.
group_mean = g.transform('mean')
group_std = g.transform('std')
normalized = (df['value'] - group_mean) / group_std
normalized

0     0.210042
1    -0.475293
2    -1.276444
3    -1.470294
4    -0.855528
5    -0.303915
6     0.630126
7     1.425880
8     0.668614
9     0.630126
10   -0.095059
11    0.911746
Name: value, dtype: float64

## Techniques for method chaining: when multiple operations are performed in sequence, temporary variables are normally needed for the intermediate results. Method chaining avoids that!

### Creating a copy of the data frame

In [208]:
df

Unnamed: 0,key,value
0,a,7
1,b,5
2,c,6
3,a,1
4,b,11
5,c,5
6,a,1
7,b,10
8,c,13
9,a,14


In [204]:
# Usual non-functional way
df2 = df.copy()
df2['k'] = 10
df2

Unnamed: 0,key,value,k
0,a,7,10
1,b,5,10
2,c,6,10
3,a,1,10
4,b,11,10
5,c,5,10
6,a,1,10
7,b,10,10
8,c,13,10
9,a,14,10


In [207]:
# Functional assign way
df2 = df.assign(k=100,value=30)
df2

Unnamed: 0,key,value,k
0,a,30,100
1,b,30,100
2,c,30,100
3,a,30,100
4,b,30,100
5,c,30,100
6,a,30,100
7,b,30,100
8,c,30,100
9,a,30,100


In [209]:
df2

Unnamed: 0,key,value,k
0,a,30,100
1,b,30,100
2,c,30,100
3,a,30,100
4,b,30,100
5,c,30,100
6,a,30,100
7,b,30,100
8,c,30,100
9,a,30,100


In [188]:
# One chained expression, no temporaries: add the derived column, group by key,
# then take the standard deviation of the derived column within each group.
R = (
    df2
    .assign(col1_demeaned=lambda frame: frame.k - frame.value.mean())
    .groupby('key')
    .col1_demeaned
    .std()
)

In [210]:
R=df2.assign(col1_demeaned=df2.k - df2.value.mean())

In [211]:
R

Unnamed: 0,key,value,k,col1_demeaned
0,a,30,100,70.0
1,b,30,100,70.0
2,c,30,100,70.0
3,a,30,100,70.0
4,b,30,100,70.0
5,c,30,100,70.0
6,a,30,100,70.0
7,b,30,100,70.0
8,c,30,100,70.0
9,a,30,100,70.0


### Creating dummy variables for modeling: transform categorical data into dummy variables, also known as one-hot encoding. This involves creating a DataFrame with one column for each distinct category; each column contains 1 where the given category occurs and 0 otherwise.

In [212]:
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')

In [213]:
# One indicator column per category. dtype=int pins the cells to 1/0 as described above;
# since pandas 2.0 the default dtype is bool, which would display True/False instead.
pd.get_dummies(cat_s, dtype=int)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1
