In [1]:
import pandas as pd
import random
import numpy as np
from datetime import datetime, timedelta

# Define possible values
departments = ['HR', 'IT', 'Finance', 'Marketing', 'Sales', 'Operations', 'Admin']
names = ["Alice Johnson", "Bob Smith", "Charlie Davis", "David White", "Emma Wilson",
         "Frank Harris", "Grace Lee", "Henry Walker", "Isabella Scott", "Jack Hall",
         "Karen Allen", "Liam Wright", "Mia King", "Noah Baker", "Olivia Adams",
         "Paul Nelson", "Quinn Carter", "Ryan Mitchell", "Sophia Perez", "Thomas Roberts"]
num_rows = 100

np.random.seed(20)
# Generate employee dataset
data = {
    'Employee_ID': [f'EMP{1000+i}' for i in range(num_rows)],
    'Name': [random.choice(names) for _ in range(num_rows)],
    'Age': [random.randint(22, 60) for _ in range(num_rows)],
    'Department': [random.choice(departments) for _ in range(num_rows)],
    'Joining_Date': [(datetime.today() - timedelta(days=random.randint(0, 3650))).date() for _ in range(num_rows)],
    'Salary': [random.randint(30000, 150000) for _ in range(num_rows)],
    'Experience': [random.randint(0, 40) for _ in range(num_rows)],  # Years of experience
    'Performance_Score': [random.randint(1, 5) for _ in range(num_rows)],  # Rating from 1 to 5
    'Remote_Work_Eligible': [random.choice([True, False]) for _ in range(num_rows)]
}

# Create DataFrame
df = pd.DataFrame(data)


In [2]:
# Functions of DataFrames
# Str and Date in DataFrames
# Apply functions with DataFrames - Small
# Iterrows - Small
# Groupby 
# Groupby with apply and transform
# Pivot tables

In [4]:
df.head(20)

Unnamed: 0,Employee_ID,Name,Age,Department,Joining_Date,Salary,Experience,Performance_Score,Remote_Work_Eligible
0,EMP1000,Thomas Roberts,53,HR,2018-12-14,54880,30,2,True
1,EMP1001,David White,42,HR,2022-12-03,90141,5,1,False
2,EMP1002,Liam Wright,53,HR,2015-04-02,141395,19,1,False
3,EMP1003,Jack Hall,33,Finance,2018-03-22,57498,0,5,True
4,EMP1004,Thomas Roberts,40,Admin,2018-09-28,57287,7,4,True
5,EMP1005,Alice Johnson,23,HR,2016-01-21,110064,37,5,True
6,EMP1006,Noah Baker,44,Admin,2024-02-23,82946,0,5,True
7,EMP1007,Bob Smith,56,HR,2021-03-14,86464,25,2,True
8,EMP1008,Emma Wilson,35,Admin,2022-08-28,52893,22,4,False
9,EMP1009,Grace Lee,52,Operations,2019-12-07,88626,24,3,False


In [5]:
# Use apply function on Salary and classify the salary as high medium and low
# if <50K - Low, if <80 - Medium else High
def func(x):
    if(x<50000):
        return 'Low'
    elif(x<80000):
        return 'Medium'
    else:
        return 'High'

In [9]:
df['salary_classification'] = df['Salary'].apply(func)

In [15]:
lst = df.loc[0].tolist()

In [16]:
# Use this list to reduce the salary by 10% if performance score <=2 -- it's okay to use index numbers, write a function
# write a function that takes list and returns updated salary basis above condition
def adjust_salary(lst):
    if(lst[7]<=2):
        return lst[5]*0.9
    else:
        return lst[5]

In [17]:
adjust_salary(lst)

49392.0

In [18]:
ds = df.loc[0]
ds

Employee_ID                     EMP1000
Name                     Thomas Roberts
Age                                  53
Department                           HR
Joining_Date                 2018-12-14
Salary                            54880
Experience                           30
Performance_Score                     2
Remote_Work_Eligible               True
salary_classification            Medium
Name: 0, dtype: object

In [25]:
# Write the same function for Data Series - THe function takes Data Series as input and returns salary corrected
def adj_sal(S):
    if(S['Performance_Score']<=2):
        return S['Salary']*0.9
    else:
        return S['Salary']

In [26]:
adj_sal(ds)

49392.0

In [29]:
df

Unnamed: 0,Employee_ID,Name,Age,Department,Joining_Date,Salary,Experience,Performance_Score,Remote_Work_Eligible,salary_classification
0,EMP1000,Thomas Roberts,53,HR,2018-12-14,54880,30,2,True,Medium
1,EMP1001,David White,42,HR,2022-12-03,90141,5,1,False,High
2,EMP1002,Liam Wright,53,HR,2015-04-02,141395,19,1,False,High
3,EMP1003,Jack Hall,33,Finance,2018-03-22,57498,0,5,True,Medium
4,EMP1004,Thomas Roberts,40,Admin,2018-09-28,57287,7,4,True,Medium
...,...,...,...,...,...,...,...,...,...,...
95,EMP1095,Charlie Davis,54,HR,2022-09-04,145789,3,4,False,High
96,EMP1096,Grace Lee,36,Marketing,2022-09-16,43754,10,5,False,Low
97,EMP1097,Noah Baker,58,Finance,2021-01-19,102547,34,1,True,High
98,EMP1098,Mia King,26,Marketing,2024-05-18,139971,26,2,False,High


In [28]:
df.apply(adj_sal, axis = 1)

0      49392.0
1      81126.9
2     127255.5
3      57498.0
4      57287.0
        ...   
95    145789.0
96     43754.0
97     92292.3
98    125973.9
99     55574.0
Length: 100, dtype: float64

In [39]:
# Retire by department - HR age >50 ->retire, Marketing age >40 -> retire, Department admin age>45 -> retire, don't return anything otherwise.
ds = df.loc[1]
ds

Employee_ID                  EMP1001
Name                     David White
Age                               42
Department                        HR
Joining_Date              2022-12-03
Salary                         90141
Experience                         5
Performance_Score                  1
Remote_Work_Eligible           False
salary_classification           High
Name: 1, dtype: object

In [47]:
def decide_retirement(ds):
    if((ds['Department'] == 'HR') & (ds['Age']>50)):
        return 'retire'
    elif((ds['Department'] == 'Marketing') & (ds['Age']>40)):
        return 'retire'
    elif((ds['Department'] == 'Admin') & (ds['Age']>45)):
        return 'retire'
    else:
        return 'Do not retire'

In [48]:
df.apply(decide_retirement, axis = 1)

0            retire
1     Do not retire
2            retire
3     Do not retire
4     Do not retire
          ...      
95           retire
96    Do not retire
97    Do not retire
98    Do not retire
99    Do not retire
Length: 100, dtype: object

In [None]:
# Question is cap the values of salary between 60000 and 100000 but you can't use 60000 and 10000 as numbers inside the function
#  use the bounds as variables

In [51]:
def func(x, lb, ub):
    if(x<lb):
        return lb
    elif(x>ub):
        return ub
    else:
        return x

In [59]:

new_func = lambda x: func(x, 60000, 100000)

In [63]:
# df['Salary'].apply(func(x, 50000, 90000))
lb = 50000
ub = 90000
df['Salary'].apply(lambda x: func(x, 50000, 90000))

NameError: name 'x' is not defined

In [65]:
lb = 50000
ub = 90000
df['Salary'].apply(func, lb = lb, ub = ub)

0     54880
1     90000
2     90000
3     57498
4     57287
      ...  
95    90000
96    50000
97    90000
98    90000
99    55574
Name: Salary, Length: 100, dtype: int64

In [67]:
import pandas as pd
import numpy as np

# Creating sample data
data = {
    "Customer_ID": np.random.randint(1001, 1050, 50),
    "City": np.random.choice(["New York", "Los Angeles", "Chicago", "Houston", "San Francisco"], 50),
    "Product_Category": np.random.choice(["Electronics", "Clothing", "Furniture", "Beauty", "Groceries"], 50),
    "Product": np.random.choice(["Laptop", "Shoes", "Table", "Face Cream", "Vegetables", "Headphones", "Sofa", "Jacket", "Smartphone"], 50),
    "Revenue": np.random.randint(50, 1000, 50),
    "Discount": np.random.randint(5, 30, 50),
    "Customer_Rating": np.round(np.random.uniform(1, 5, 50), 1),
    "Purchase_Date": pd.date_range(start="2024-01-01", periods=50, freq="D"),
}

# Creating the DataFrame
df = pd.DataFrame(data)



In [69]:
df = df.head(10)

In [73]:
df.to_clipboard()

In [74]:
grouped = df.groupby('City')

In [76]:
grouped.indices

{'Chicago': array([2, 3, 5, 9], dtype=int64),
 'Houston': array([1], dtype=int64),
 'Los Angeles': array([0, 7, 8], dtype=int64),
 'New York': array([6], dtype=int64),
 'San Francisco': array([4], dtype=int64)}

In [77]:
grouped.groups

{'Chicago': [2, 3, 5, 9], 'Houston': [1], 'Los Angeles': [0, 7, 8], 'New York': [6], 'San Francisco': [4]}

In [78]:
grouped.ngroups

5

In [80]:
lst = [1,2,3]

In [81]:
dir(lst)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [75]:
dir(grouped)

['City',
 'Customer_ID',
 'Customer_Rating',
 'Discount',
 'Product',
 'Product_Category',
 'Purchase_Date',
 'Revenue',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_accessors',
 '_agg_examples_doc',
 '_agg_general',
 '_agg_py_fallback',
 '_aggregate_frame',
 '_aggregate_item_by_item',
 '_aggregate_with_numba',
 '_apply_allowlist',
 '_apply_filter',
 '_apply_to_column_groupbys',
 '_bool_agg',
 '_can_use_transform_fast',
 '_choose_path',
 '_concat_objects',
 '_constructor',
 '_cumcount_array',
 '_cython_agg_general',

In [92]:
df.groupby('City')['Discount'].max()
df.groupby('City')[['Discount']].max().reset_index()
df.groupby('City')[['Discount']].max().reset_index(drop = True)

df.groupby('City')['Discount'].max().reset_index()

Unnamed: 0,City,Discount
0,Chicago,27
1,Houston,16
2,Los Angeles,20
3,New York,28
4,San Francisco,22


In [94]:
# For every product find min customer rating and min revenue
df.groupby('Product_Category')[['Customer_Rating', 'Revenue']].min()
# I don't want index - 
df.groupby('Product_Category')[['Customer_Rating', 'Revenue']].min().reset_index()


Unnamed: 0,Product_Category,Customer_Rating,Revenue
0,Beauty,2.1,353
1,Clothing,1.1,397
2,Electronics,1.0,179
3,Furniture,1.1,132
4,Groceries,3.5,310


In [95]:
# Slight problem - I'm getting the aggregated value column name same as actual column name
# What if I want a custom name like customer_rating_min
df.groupby('Product_Category').agg(customer_rating_min = ('Customer_Rating', 'min'), revenue_min = ('Revenue', 'min'))

Unnamed: 0_level_0,customer_rating_min,revenue_min
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Beauty,2.1,353
Clothing,1.1,397
Electronics,1.0,179
Furniture,1.1,132
Groceries,3.5,310


In [96]:
# I want for each product category min of customer rating and max of revenue
# Can't be done normally
df.groupby('Product_Category')[['Customer_Rating', 'Revenue']].min()
# Can be done using agg
df.groupby('Product_Category').agg(customer_rating_min = ('Customer_Rating', 'min'), max_revenue = ('Revenue', 'max'))

Unnamed: 0_level_0,customer_rating_min,max_revenue
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Beauty,2.1,353
Clothing,1.1,414
Electronics,1.0,974
Furniture,1.1,216
Groceries,3.5,310


In [97]:
# 1. Total Revenue and Average Discount per Product Category
df.groupby('Product_Category').agg(total_revenue = ('Revenue', 'sum'), avg_discount = ('Discount', 'mean'))

Unnamed: 0_level_0,total_revenue,avg_discount
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Beauty,353,18.0
Clothing,811,9.0
Electronics,2823,21.5
Furniture,348,21.0
Groceries,310,22.0


In [None]:
# Number of Purchases and Average Customer Rating per City


In [None]:
# Next session 
# Group by apply Group by trasform 
# String operations and date operations
# Pivots

# Merge - on your own

In [1]:
import pandas as pd

data_small = {
    'Department': ['Sales', 'Sales', 'Sales', 'HR', 'HR', 'HR', 'IT', 'IT'],
    'Employee': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'],
    'Salary': [50000, 55000, 60000, 48000, 50000, 52000, 70000, 72000],
    'Experience': [2, 3, 5, 1, 2, 4, 6, 8]
}

df = pd.DataFrame(data_small)


In [2]:
df

Unnamed: 0,Department,Employee,Salary,Experience
0,Sales,A,50000,2
1,Sales,B,55000,3
2,Sales,C,60000,5
3,HR,D,48000,1
4,HR,E,50000,2
5,HR,F,52000,4
6,IT,G,70000,6
7,IT,H,72000,8


In [4]:
# Find out how much each employees salary is more as compared to employee with minimum salary
df['Salary'] - df['Salary'].min()

0     2000
1     7000
2    12000
3        0
4     2000
5     4000
6    22000
7    24000
Name: Salary, dtype: int64

In [22]:
def diff_salary(x, min_salary):
    return x-min_salary
min_sal = df['Salary'].min()
df['Salary'].apply(diff_salary, min_salary = min_sal)

0     2000
1     7000
2    12000
3        0
4     2000
5     4000
6    22000
7    24000
Name: Salary, dtype: int64

In [9]:
def diff_salary_group(ds):
    return ds - ds.min()

In [10]:
# Find how much each employees salary is more than the person with minimum salary in the same department
df.groupby('Department')['Salary'].apply(diff_salary_group)

0        0
1     5000
2    10000
3        0
4     2000
5     4000
6        0
7     2000
Name: Salary, dtype: int64

In [15]:
def sample_apply(ds):
    print(ds)
    return pd.Series([0,1,2,3,4,5,6])

In [16]:
df.groupby('Department')['Salary'].apply(sample_apply)

3    48000
4    50000
5    52000
Name: HR, dtype: int64
6    70000
7    72000
Name: IT, dtype: int64
0    50000
1    55000
2    60000
Name: Sales, dtype: int64


Department   
HR          0    0
            1    1
            2    2
            3    3
            4    4
            5    5
            6    6
IT          0    0
            1    1
            2    2
            3    3
            4    4
            5    5
            6    6
Sales       0    0
            1    1
            2    2
            3    3
            4    4
            5    5
            6    6
Name: Salary, dtype: int64

In [18]:
def diff_salary_group(ds):
    return (ds - ds.min()).mean()

In [20]:
# We get back a dataseries but with new indexes
df.groupby('Department')['Salary'].apply(diff_salary_group)
# We get back a dataframe with new indexes
df.groupby('Department')['Salary'].apply(diff_salary_group)

Unnamed: 0_level_0,Salary
Department,Unnamed: 1_level_1
HR,2000.0
IT,1000.0
Sales,5000.0


In [None]:
# 1. If the number of values going = no of values coming back the original indexes will become indexes of new values coming back
# 2. If the number of values going != no of values coming back then new indexes will be created by group names

In [None]:
def random_func(x):
    return x**2

df['Experience'].apply(random_func)
# Is there any group? - int will go and int will come back (dataseries will come back accumulated)

df.groupby('Department')['Experience'].apply(random_func)

# Is there a group? - Each group's data series will go, each group's data series will come back - indexes - Original index
df.groupby('Department')[['Experience']].apply(random_func)

# Tell me in all three what is the data type going in the function and coming from the function



In [None]:
def random_fun(x):
    return x.mean()**2


df['Experience'].apply(random_func) 
# Error

df.groupby('Department')['Experience'].apply(random_func)
# Data Series will go, One value for each data series will come back - INdex - Group name - Combined to make a data series

df.groupby('Department')[['Experience']].apply(random_func)


In [23]:
df

Unnamed: 0,Department,Employee,Salary,Experience
0,Sales,A,50000,2
1,Sales,B,55000,3
2,Sales,C,60000,5
3,HR,D,48000,1
4,HR,E,50000,2
5,HR,F,52000,4
6,IT,G,70000,6
7,IT,H,72000,8


In [None]:
def random_func(x):
    return x['Experience']**2
    
df.apply(random_func, axis = 1)

In [24]:
import numpy as np

np.random.seed(42)
data_large = {
    'Department': np.random.choice(['Sales', 'HR', 'IT', 'Finance'], 500),
    'Employee_ID': np.arange(1, 501),
    'Salary': np.random.randint(40000, 100000, 500),
    'Experience': np.random.randint(1, 15, 500),
    'Performance_Score': np.random.uniform(1, 5, 500)
}

df_large = pd.DataFrame(data_large)


In [25]:
df_large

Unnamed: 0,Department,Employee_ID,Salary,Experience,Performance_Score
0,IT,1,40190,1,3.305153
1,Finance,2,50492,11,3.426860
2,Sales,3,90132,5,2.696523
3,IT,4,75743,10,3.945777
4,IT,5,46102,9,4.737468
...,...,...,...,...,...
495,IT,496,77892,8,1.418271
496,IT,497,41015,5,3.545721
497,Finance,498,99168,8,3.825903
498,Sales,499,67712,10,1.126345


In [28]:
def adjust_salary(ds):
    if(ds.mean()<60000):
        return ds*1.1
    else:
        return ds

In [31]:
#  increase the salary of each employee by 10% if the avg salary of a department is less than 60000
df_large.loc[df_large['Department'] == 'IT']['Salary']
df_large.groupby('Department')['Salary'].apply(adjust_salary)

# When number of values going in and coming back are same, you can use transform instead of apply

0      40190
1      50492
2      90132
3      75743
4      46102
       ...  
495    77892
496    41015
497    99168
498    67712
499    74961
Name: Salary, Length: 500, dtype: int32

In [None]:
#  increase the salary of each employee by 10% if the avg Performance_Score of a department is greater than 4

In [33]:
df.loc[df['Department'] == 'Sales']

Unnamed: 0,Department,Employee,Salary,Experience
0,Sales,A,50000,2
1,Sales,B,55000,3
2,Sales,C,60000,5


In [34]:
def increase_salary(dff):
    if(dff['Performance_Score'].mean()>4):
        return dff['Salary'] * 1.1
    else:
        return dff['Salary']

In [36]:
df_large.groupby('Department').apply(increase_salary)

Department     
Finance     1      50492
            5      90336
            14     40569
            16     74663
            17     91885
                   ...  
Sales       489    40009
            491    98141
            492    63793
            494    69548
            498    67712
Name: Salary, Length: 500, dtype: int32

In [None]:
# apply - 
# 1. With Data series without grouping
# 2. With Data frame with axis without grouping
# 3. With grouping and Data series
# 4. With grouping and dataframe

In [38]:
# Question for each Department sample the data keeping only top 2 rows of each department
df[df['Department'] == 'Sales']

Unnamed: 0,Department,Employee,Salary,Experience
0,Sales,A,50000,2
1,Sales,B,55000,3
2,Sales,C,60000,5


In [39]:
def sample_custom(df):
    return df.head(2)

In [42]:
df_large.groupby('Department').apply(sample_custom).reset_index(drop = True)

Unnamed: 0,Department,Employee_ID,Salary,Experience,Performance_Score
0,Finance,2,50492,11,3.42686
1,Finance,6,90336,6,4.702274
2,HR,10,66641,13,4.355592
3,HR,21,47455,5,2.333997
4,IT,1,40190,1,3.305153
5,IT,4,75743,10,3.945777
6,Sales,3,90132,5,2.696523
7,Sales,7,92479,11,2.803357


In [43]:
# Random sampling by department
def random_grp_sampling(dff):
    return dff.sample(2)


In [46]:
df_large.groupby('Department').apply(random_grp_sampling)

Unnamed: 0_level_0,Unnamed: 1_level_0,Department,Employee_ID,Salary,Experience,Performance_Score
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Finance,88,Finance,89,59508,9,3.889068
Finance,55,Finance,56,80941,3,2.888268
HR,421,HR,422,67509,7,4.774464
HR,319,HR,320,92788,1,4.822606
IT,39,IT,40,59190,13,3.654149
IT,3,IT,4,75743,10,3.945777
Sales,137,Sales,138,60056,10,3.306066
Sales,247,Sales,248,43267,11,1.085078


In [48]:
# iqr = q3-q1

# Cap outliers group wise - i.e the values which are above 1.5*IQR, then cap it to 1.5*IQR
df['Salary'].quantile(0.25)

50000.0

In [58]:
def cap_groupwise(ds):
    IQR = ds.quantile(0.75) - ds.quantile(0.25)
    ds[ds>1.5*IQR] = 1.5*IQR 
    return ds

In [60]:
df_large.groupby('Department')['Salary'].apply(cap_groupwise)

0      40190.000
1      49908.375
2      35517.375
3      44052.375
4      44052.375
         ...    
495    44052.375
496    41015.000
497    49908.375
498    35517.375
499    49908.375
Name: Salary, Length: 500, dtype: float64

In [50]:
df[df['Department'] == 'Sales']

Unnamed: 0,Department,Employee,Salary,Experience
0,Sales,A,50000,2
1,Sales,B,55000,3
2,Sales,C,60000,5


In [63]:
def cap_outliers(gr):
    Q1 = gr.quantile(0.25)
    Q3 = gr.quantile(0.75)
    IQR = Q3 - Q1
    cap = 1.5 * IQR
    gr[gr > cap] = cap 
    return gr

In [67]:
df_large.groupby(['Department'])['Salary'].apply(cap_outliers)

0      40190.000
1      49908.375
2      35517.375
3      44052.375
4      44052.375
         ...    
495    44052.375
496    41015.000
497    49908.375
498    35517.375
499    49908.375
Name: Salary, Length: 500, dtype: float64

In [70]:
def cap_outliers(dff):
    capiqr = (dff['Salary'].quantile(0.75) - dff['Salary'].quantile(0.25))*1.5
    dff.loc[dff['Salary']>capiqr, 'Salary'] =  capiqr
    return dff['Salary']

In [71]:
df_large.groupby('Department').apply(cap_outliers)

Department     
Finance     1      49908.375
            5      49908.375
            14     40569.000
            16     49908.375
            17     49908.375
                     ...    
Sales       489    35517.375
            491    35517.375
            492    35517.375
            494    35517.375
            498    35517.375
Name: Salary, Length: 500, dtype: float64

In [73]:
def cap_num(ds):
    IQR=ds.quantile(0.75) - ds.quantile(0.25)
    ds[ds>1.5*IQR] = 1.5*IQR
    return ds

In [77]:
ds = df.loc[df['Department'] == 'Sales', 'Salary']

In [78]:
ds

0    50000
1    55000
2    60000
Name: Salary, dtype: int64

In [None]:
# Wrie a function to return the IQR of each group's Salary

In [82]:
def iqr(ds):
    return 1.5*(ds.quantile(0.75) - ds.quantile(0.25))

In [83]:
df_large.groupby('Department')['Salary'].apply(iqr)

Department
Finance    49908.375
HR         40079.250
IT         44052.375
Sales      35517.375
Name: Salary, dtype: float64

In [85]:
df_large.groupby('Department').agg(salary_mean = ('Salary', 'mean'), salary_iqr = ('Salary', iqr))

Unnamed: 0_level_0,salary_mean,salary_iqr
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,71376.040541,49908.375
HR,71423.009259,40079.25
IT,66972.557377,44052.375
Sales,68935.590164,35517.375


In [None]:
## Date time functions

In [87]:
import pandas as pd
import numpy as np

# Creating a DataFrame with random dates
np.random.seed(42)
date_rng = pd.date_range(start='2022-01-01', end='2024-12-31', freq='W')

df = pd.DataFrame({
    'Employee': np.random.choice(['A', 'B', 'C', 'D', 'E'], size=len(date_rng)),
    'Department': np.random.choice(['Sales', 'HR', 'IT'], size=len(date_rng)),
    'Join_Date': np.random.choice(date_rng, size=len(date_rng)),
    'Last_Promotion': np.random.choice(date_rng, size=len(date_rng)),
    'Salary': np.random.randint(40000, 120000, size=len(date_rng))
})

# Converting date columns to datetime
df['Join_Date'] = pd.to_datetime(df['Join_Date'])
df['Last_Promotion'] = pd.to_datetime(df['Last_Promotion'])

df.head()


Unnamed: 0,Employee,Department,Join_Date,Last_Promotion,Salary
0,D,Sales,2022-02-06,2022-07-17,78088
1,E,Sales,2023-11-19,2024-09-08,118752
2,C,IT,2024-05-12,2022-03-20,95284
3,E,IT,2022-09-11,2023-10-22,97043
4,E,HR,2022-06-12,2022-11-13,75547


In [88]:
df.dtypes

Employee                  object
Department                object
Join_Date         datetime64[ns]
Last_Promotion    datetime64[ns]
Salary                     int32
dtype: object

In [92]:
df['Join_Date'].dt.year

0      2022
1      2023
2      2024
3      2022
4      2022
       ... 
152    2023
153    2023
154    2023
155    2022
156    2022
Name: Join_Date, Length: 157, dtype: int64

In [93]:
df['Join_Date'].dt.month

0       2
1      11
2       5
3       9
4       6
       ..
152     4
153     3
154     5
155    10
156    10
Name: Join_Date, Length: 157, dtype: int64

In [94]:
df['Join_Date'].dt.day

0       6
1      19
2      12
3      11
4      12
       ..
152     9
153    26
154    28
155    23
156    30
Name: Join_Date, Length: 157, dtype: int64

In [95]:
# If you want Name of the day
df['Join_Date'].dt.day_name()

0      Sunday
1      Sunday
2      Sunday
3      Sunday
4      Sunday
        ...  
152    Sunday
153    Sunday
154    Sunday
155    Sunday
156    Sunday
Name: Join_Date, Length: 157, dtype: object

In [103]:
# Arithematic operations between dates
# Difference between Last promotion date and join date
(df['Last_Promotion'] - df['Join_Date'])
# THe data type of above is not integer - type is time delta
# To convert it to integer
(df['Last_Promotion'] - df['Join_Date']).dt.days

dtype('int64')

In [109]:
# Finding the gap in months year and days
(df['Last_Promotion'].dt.to_period('M') - df['Join_Date'].dt.to_period('M')).apply(lambda x: x.n)

0       5
1      10
2     -26
3      13
4       5
       ..
152    -9
153     9
154    -7
155    26
156    24
Length: 157, dtype: int64

In [110]:
df['Last_Promotion'].dt.month_name()

0           July
1      September
2          March
3        October
4       November
         ...    
152         July
153     December
154      October
155     December
156      October
Name: Last_Promotion, Length: 157, dtype: object

In [120]:
# Changing the format of Date
df['Join_Date'].dt.strftime('%A,%B,%Y')

0       Sunday,February,2022
1       Sunday,November,2023
2            Sunday,May,2024
3      Sunday,September,2022
4           Sunday,June,2022
               ...          
152        Sunday,April,2023
153        Sunday,March,2023
154          Sunday,May,2023
155      Sunday,October,2022
156      Sunday,October,2022
Name: Join_Date, Length: 157, dtype: object

In [122]:
# FInd the employees where last promotion date was on a weekend
lolo = df['Last_Promotion'].dt.day_name().isin(['Sunday', 'Saturday'])
df[lolo]

Unnamed: 0,Employee,Department,Join_Date,Last_Promotion,Salary
0,D,Sales,2022-02-06,2022-07-17,78088
1,E,Sales,2023-11-19,2024-09-08,118752
2,C,IT,2024-05-12,2022-03-20,95284
3,E,IT,2022-09-11,2023-10-22,97043
4,E,HR,2022-06-12,2022-11-13,75547
...,...,...,...,...,...
152,A,Sales,2023-04-09,2022-07-10,104291
153,A,HR,2023-03-26,2023-12-03,119104
154,C,Sales,2023-05-28,2022-10-09,99399
155,A,HR,2022-10-23,2024-12-15,119714


In [128]:
# pd.to_datetime
# 2022-02-06 -- this is the right format for pandas

x = '23-06-2024' 
# This needs to be converted

current_format = '%d-%m-%Y'
pd.to_datetime(x, format = current_format).to_pydatetime()


AttributeError: 'datetime.datetime' object has no attribute 'dt'

In [129]:
df = pd.DataFrame({'Date1': ['23-03-2024', '23-04-2025', '24-05-2025']})

In [133]:
pd.to_datetime(df['Date1'], format = current_format).dt.day_name()

0     Saturday
1    Wednesday
2     Saturday
Name: Date1, dtype: object