In [1]:
# importing pandas and numpy

import pandas as pd

import numpy as np

In [2]:
# Load the employee sales CSV into a DataFrame.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will only run where this exact folder exists.
csv_path = r'C:\Users\CharlesYi\Jupyter Notebook\Alteryx Challenges\Challenge 381_Average Monthly Sales with a Twist\Employee_sales_data.csv'

df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the loaded data (one row per employee, one column per month).
df.head()

Unnamed: 0,Employee ID,Employee Name,January,February,March,April,May,June,July,August,September,October,November,December
0,1,Barbara Somers,0,8,3,3,0,5,3,6,1,7,0,6
1,2,Benjamin Portal,2,6,2,3,4,4,3,3,1,5,4,5
2,3,Beth Silverstein,0,7,7,2,0,8,6,0,9,0,3,0
3,4,Carinne Cardona,2,4,1,0,3,3,5,1,9,7,1,6
4,5,Chahait Singh,0,0,12,0,0,0,10,9,0,3,0,2


In [4]:
# (number of rows, number of columns) of the loaded data.
df.shape

(50, 14)

In [5]:
# Reshape from wide to long: the two employee columns stay as identifiers and
# every remaining (month) column becomes a ('month', 'sales') row.
id_columns = ['Employee ID', 'Employee Name']

df_pivoted = df.melt(id_vars = id_columns, var_name = 'month', value_name = 'sales')

df_pivoted.head()

Unnamed: 0,Employee ID,Employee Name,month,sales
0,1,Barbara Somers,January,0
1,2,Benjamin Portal,January,2
2,3,Beth Silverstein,January,0
3,4,Carinne Cardona,January,2
4,5,Chahait Singh,January,0


In [6]:
# Parse the full month name (e.g. 'January') into a datetime; with only '%B' in
# the format, the parsed values fall on the first day of that month in 1900.
month_as_datetime = pd.to_datetime(df_pivoted['month'], format = '%B')

df_pivoted['month_num'] = month_as_datetime

df_pivoted.head()

Unnamed: 0,Employee ID,Employee Name,month,sales,month_num
0,1,Barbara Somers,January,0,1900-01-01
1,2,Benjamin Portal,January,2,1900-01-01
2,3,Beth Silverstein,January,0,1900-01-01
3,4,Carinne Cardona,January,2,1900-01-01
4,5,Chahait Singh,January,0,1900-01-01


In [7]:
# changing month_num to integer

# .dt.month gives the calendar month as a real integer (1-12). Formatting with
# '%m' would instead give zero-padded strings such as '01', which only sort
# correctly because of the padding and cannot be used in arithmetic.
df_pivoted['month_num'] = df_pivoted['month_num'].dt.month

df_pivoted.head()

Unnamed: 0,Employee ID,Employee Name,month,sales,month_num
0,1,Barbara Somers,January,0,1
1,2,Benjamin Portal,January,2,1
2,3,Beth Silverstein,January,0,1
3,4,Carinne Cardona,January,2,1
4,5,Chahait Singh,January,0,1


In [8]:
# sorting df by employee id, month num and resetting index

# Build the sorted frame as a new object instead of sorting df_pivoted in place
# and then aliasing it: df_sorted is now a distinct DataFrame, and re-running
# this cell always starts from the same unsorted input.
df_sorted = (
    df_pivoted
    .sort_values(by = ['Employee ID', 'month_num'], ascending = True)
    .reset_index(drop = True)
)

In [9]:
# identifying first sales for each employee

def first_sales_func(group):
    """Label the first row with a non-zero sale within one employee's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single employee, in the order they should be
        scanned (earliest month first), with a numeric 'sales' column.

    Returns
    -------
    pandas.Series
        Named 'sequence' and indexed like ``group``: 'first' on the earliest
        row whose sales are non-zero, 'not_first' on every other row. If the
        group has no non-zero sales, every row is 'not_first'.

    Notes
    -----
    ``group`` is not modified; the labels are returned as a new Series.
    """
    # Treat missing sales as "no sale" so a NaN can never be chosen as the first sale.
    has_sale = group['sales'].fillna(0).ne(0)

    # The running count of sale rows is 1 from the first sale up to (not including)
    # the second; and-ing with has_sale keeps only the first sale row itself.
    is_first = has_sale & has_sale.cumsum().eq(1)

    labels = np.where(is_first.to_numpy(), 'first', 'not_first')

    return pd.Series(labels, index = group.index, name = 'sequence')
        

In [10]:
# Take a real copy: plain assignment would only give df_sorted a second name,
# so adding columns to df_first would silently change df_sorted as well.
df_first = df_sorted.copy()

In [11]:
# applying function

# group_keys=False keeps each row's original index label on the result, so the
# column assignment aligns by index rather than depending on df_sorted already
# being ordered by 'Employee ID'. Only the 'sales' column is handed to the
# function, since that is the only column it reads.
df_first['sequence'] = (
    df_sorted
    .groupby('Employee ID', group_keys = False)[['sales']]
    .apply(first_sales_func)
)


In [12]:
# Inspect the first 36 rows to check the 'sequence' labels
# (presumably three employees x 12 months).
df_first.head(36)

Unnamed: 0,Employee ID,Employee Name,month,sales,month_num,sequence
0,1,Barbara Somers,January,0,1,not_first
1,1,Barbara Somers,February,8,2,first
2,1,Barbara Somers,March,3,3,not_first
3,1,Barbara Somers,April,3,4,not_first
4,1,Barbara Somers,May,0,5,not_first
5,1,Barbara Somers,June,5,6,not_first
6,1,Barbara Somers,July,3,7,not_first
7,1,Barbara Somers,August,6,8,not_first
8,1,Barbara Somers,September,1,9,not_first
9,1,Barbara Somers,October,7,10,not_first


In [31]:
# defining function to determine second and third sales

def subsequent_sales(group):
    """Label the 'first' row and the two rows that directly follow it.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single employee, in scan order, with a 'sequence'
        column in which the first-sale row holds the value 'first'.

    Returns
    -------
    pandas.Series
        Named 'sequence2' and indexed like ``group``: 'first' where
        ``sequence == 'first'``, 'second' on the row immediately after a
        'first', 'third' on the row immediately after a 'second', and
        'disregard' everywhere else. The following rows are labelled by
        position only; their sales values are not inspected.

    Notes
    -----
    ``group`` is not modified; the labels are returned as a new Series.
    """
    is_first = group['sequence'].eq('first')

    # Flags telling whether the row one / two positions earlier was the 'first' row.
    one_after_first = is_first.shift(1, fill_value = False)
    two_after_first = is_first.shift(2, fill_value = False)

    # np.select takes the first condition that holds for each row, so a row is
    # 'third' only when it is neither 'first' nor directly after a 'first'.
    labels = np.select(
        [is_first.to_numpy(), one_after_first.to_numpy(), two_after_first.to_numpy()],
        ['first', 'second', 'third'],
        default = 'disregard',
    )

    return pd.Series(labels, index = group.index, name = 'sequence2')

In [32]:
# Take a real copy: plain assignment would only give df_first a second name,
# so adding columns to df_subsequent would silently change df_first as well.
df_subsequent = df_first.copy()

In [38]:
# applying function

# group_keys=False keeps each row's original index label on the result, so the
# column assignment aligns by index rather than depending on the rows already
# being ordered by 'Employee ID'. Only the 'sequence' column is handed to the
# function, since that is the only column it reads.
df_subsequent['sequence2'] = (
    df_first
    .groupby('Employee ID', group_keys = False)[['sequence']]
    .apply(subsequent_sales)
)

In [40]:
# Inspect the first 36 rows to check the 'sequence2' labels
# (presumably three employees x 12 months).
df_subsequent.head(36)

Unnamed: 0,Employee ID,Employee Name,month,sales,month_num,sequence,sequence2
0,1,Barbara Somers,January,0,1,not_first,disregard
1,1,Barbara Somers,February,8,2,first,first
2,1,Barbara Somers,March,3,3,not_first,second
3,1,Barbara Somers,April,3,4,not_first,third
4,1,Barbara Somers,May,0,5,not_first,disregard
5,1,Barbara Somers,June,5,6,not_first,disregard
6,1,Barbara Somers,July,3,7,not_first,disregard
7,1,Barbara Somers,August,6,8,not_first,disregard
8,1,Barbara Somers,September,1,9,not_first,disregard
9,1,Barbara Somers,October,7,10,not_first,disregard


In [42]:
# Keep only the rows whose 'sequence2' label is something other than 'disregard'.
keep_mask = df_subsequent['sequence2'].ne('disregard')

df_filtered = df_subsequent.loc[keep_mask]

df_filtered.shape

(150, 7)

In [58]:
# Mean of the retained 'sales' rows per employee, rounded to two decimals and
# returned as a flat table with an 'avg_sales' column.
(
    df_filtered
    .groupby(['Employee ID', 'Employee Name'])['sales']
    .mean()
    .round(2)
    .rename('avg_sales')
    .reset_index()
)

Unnamed: 0,Employee ID,Employee Name,avg_sales
0,1,Barbara Somers,4.67
1,2,Benjamin Portal,3.33
2,3,Beth Silverstein,5.33
3,4,Carinne Cardona,2.33
4,5,Chahait Singh,4.0
5,6,Christina Decosta,3.33
6,7,Christopher Eten,2.0
7,8,Craig Coleman,5.0
8,9,Daniel Kohane,3.67
9,10,Dylan O'Rourke,11.0
