In [37]:
"""
Solution approach 

1. Group the csv data by ai, newhire and control groups
2. For each team within a group, calculate the per-round difference in collected ingredients (phase 1 minus phase 2), then take the mean of those differences. A negative mean means the team collected more ingredients in phase 2.
3. Now for each group, you have the mean difference for all the teams within the group. 
4. Compare the difference across groups mathematically, using ANOVA Test.

"""

'\nSolution approach \n\n1. Group the csv data by ai, newhire and control groups\n2. For each group, calculate the phase1-phase2-difference for each team within the group. Then calculate the mean of the difference for each team.\n3. Now for each group, you have the mean difference for all the teams within the group. \n4. Compare the difference across groups mathematically, using ANOVA Test.\n\n'

In [38]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway

In [39]:
# Load the experiment log; the cells below expect one row per observation with
# the team, the experimental group, the phase and the ingredient count.
df = pd.read_csv("mario.csv")

# Fail early, with a readable message, if the file lacks a column that the
# analysis below indexes by name (otherwise the error surfaces later as a KeyError).
required_columns = {"team_id", "phase", "group", "totalingred"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"mario.csv is missing required columns: {sorted(missing_columns)}"
    )

In [40]:

# Partition the full table by experimental condition.
grouped_data = df.groupby('group')

# One sub-frame per condition, reused by the per-team cells further down.
ai_group = grouped_data.get_group('ai')
newhire_group = grouped_data.get_group('newhire')
control_group = grouped_data.get_group('control')

# Optional sanity check: show the first rows of every sub-frame under its heading.
group_previews = (
    ("AI Group:", ai_group),
    ("\nNewhire Group:", newhire_group),
    ("\nControl Group:", control_group),
)
for heading, group_frame in group_previews:
    print(heading)
    print(group_frame.head())


AI Group:
     team_id  phase group  round  totalingred
180   101602      1    ai      1           20
181   101602      1    ai      2           19
182   101602      1    ai      3           23
183   101602      1    ai      4           19
184   101602      1    ai      5           22

Newhire Group:
   team_id  phase    group  round  totalingred
0    21301      1  newhire      1           19
1    21301      1  newhire      2           23
2    21301      1  newhire      3           23
3    21301      1  newhire      4           25
4    21301      1  newhire      5           24

Control Group:
     team_id  phase    group  round  totalingred
252   102901      1  control      1           24
253   102901      1  control      2           27
254   102901      1  control      3           28
255   102901      1  control      4           31
256   102901      1  control      5           26


In [41]:
def calculate_mean_of_difference_of_two_phases(group_df):
    """Return the mean of the phase 1 minus phase 2 differences in ``totalingred``.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Rows for a single team. Must contain the columns ``phase`` (values 1
        and 2 are used) and ``totalingred``. If a ``round`` column is present
        it is used to pair the observations of the two phases.

    Returns
    -------
    float
        Mean of ``phase 1 - phase 2``; negative when phase 2 values are higher.
        ``nan`` when no round has a value in both phases.

    Raises
    ------
    ValueError
        Only in the positional fallback (no ``round`` column), when the two
        phases do not have the same number of rows.

    Notes
    -----
    With a ``round`` column, values are paired by round instead of by row
    position, so row order does not matter and a round that is missing from
    one phase is skipped instead of being paired with the wrong round. For
    data where both phases contain the same rounds exactly once, the result is
    identical to a plain positional subtraction.
    """
    phase1_rows = group_df[group_df['phase'] == 1]
    phase2_rows = group_df[group_df['phase'] == 2]

    if 'round' in group_df.columns:
        # Index each phase by round; averaging collapses accidental duplicate
        # rows for the same round into a single value.
        phase1 = phase1_rows.groupby('round')['totalingred'].mean()
        phase2 = phase2_rows.groupby('round')['totalingred'].mean()
        # Series subtraction aligns on the round index; rounds present in only
        # one phase become NaN and are dropped.
        difference = (phase1 - phase2).dropna()
        return difference.mean()

    # Fallback without a round column: pair by row position, but refuse to let
    # NumPy silently broadcast a single-row phase against a longer one.
    phase1 = phase1_rows['totalingred'].values
    phase2 = phase2_rows['totalingred'].values
    if len(phase1) != len(phase2):
        raise ValueError(
            "phase 1 and phase 2 have a different number of rows "
            f"({len(phase1)} vs {len(phase2)}); cannot pair them by position"
        )
    if len(phase1) == 0:
        return float('nan')
    difference = phase1 - phase2
    return difference.mean()

In [42]:
# One record per team in the 'ai' group: the team id and that team's mean
# phase 1 - phase 2 difference.
ai_datas = [
    {
        "team_id": team_id,
        "mean_of_difference_of_two_phases": calculate_mean_of_difference_of_two_phases(team_rows),
    }
    for team_id, team_rows in ai_group.groupby("team_id")
]
    

In [43]:
# Tabulate the per-team records of the 'ai' group and show the result.
ai_datadf = pd.DataFrame(data=ai_datas)
ai_datadf

Unnamed: 0,team_id,mean_of_difference_of_two_phases
0,101602,-7.166667
1,101603,-0.833333
2,102401,-3.5
3,102410,-2.166667
4,102411,-2.166667
5,102412,-2.333333
6,110510,-4.833333
7,110511,-3.5
8,110811,-2.666667
9,110812,-3.833333


In [44]:
# One record per team in the 'newhire' group: the team id and that team's mean
# phase 1 - phase 2 difference.
newhire_datas = [
    {
        "team_id": team_id,
        "mean_of_difference_of_two_phases": calculate_mean_of_difference_of_two_phases(team_rows),
    }
    for team_id, team_rows in newhire_group.groupby("team_id")
]
    

In [45]:
# Tabulate the per-team records of the 'newhire' group and show the result.
newhire_datadf = pd.DataFrame(data=newhire_datas)
newhire_datadf

Unnamed: 0,team_id,mean_of_difference_of_two_phases
0,21301,-5.5
1,21312,-7.166667
2,21410,1.666667
3,21411,-4.333333
4,21412,-6.0
5,22010,-2.833333
6,22011,-4.166667
7,22012,-3.166667
8,22412,-5.666667
9,22512,-6.333333


In [46]:
# One record per team in the 'control' group: the team id and that team's mean
# phase 1 - phase 2 difference.
control_datas = [
    {
        "team_id": team_id,
        "mean_of_difference_of_two_phases": calculate_mean_of_difference_of_two_phases(team_rows),
    }
    for team_id, team_rows in control_group.groupby("team_id")
]
    

In [47]:
# Tabulate the per-team records of the 'control' group and show the result.
control_datadf = pd.DataFrame(data=control_datas)
control_datadf

Unnamed: 0,team_id,mean_of_difference_of_two_phases
0,102901,-6.333333
1,102910,-4.666667
2,103110,-3.666667
3,103111,-5.333333
4,103112,-4.333333
5,110801,-1.333333
6,111401,-3.833333
7,111501,-5.0
8,111512,-4.833333
9,111901,-3.833333


In [48]:
# Pull the per-team mean differences out of each summary table as plain NumPy
# arrays, one array per experimental group, ready to be passed to the ANOVA.
(
    ai_mean_of_differences_of_two_phases,
    newhire_mean_of_differences_of_two_phases,
    control_mean_of_differences_of_two_phases,
) = (
    summary_frame["mean_of_difference_of_two_phases"].to_numpy()
    for summary_frame in (ai_datadf, newhire_datadf, control_datadf)
)

In [49]:
# Inspection only: per-team mean phase 1 - phase 2 differences for the 'ai' group.
ai_mean_of_differences_of_two_phases

array([-7.16666667, -0.83333333, -3.5       , -2.16666667, -2.16666667,
       -2.33333333, -4.83333333, -3.5       , -2.66666667, -3.83333333,
       -1.        , -4.33333333, -2.5       , -2.66666667, -3.        ,
       -3.        , -2.5       , -2.16666667, -5.5       , -2.5       ])

In [50]:
# Inspection only: per-team mean phase 1 - phase 2 differences for the 'newhire' group.
newhire_mean_of_differences_of_two_phases

array([-5.5       , -7.16666667,  1.66666667, -4.33333333, -6.        ,
       -2.83333333, -4.16666667, -3.16666667, -5.66666667, -6.33333333,
       -4.33333333, -4.        , -5.83333333,  1.16666667,  1.16666667])

In [51]:
# Inspection only: per-team mean phase 1 - phase 2 differences for the 'control' group.
control_mean_of_differences_of_two_phases

array([-6.33333333, -4.66666667, -3.66666667, -5.33333333, -4.33333333,
       -1.33333333, -3.83333333, -5.        , -4.83333333, -3.83333333,
       -4.5       , -4.66666667, -4.16666667, -5.16666667, -5.66666667,
       -2.83333333, -4.33333333, -2.16666667, -3.16666667, -6.33333333])

In [52]:

# One-way ANOVA on the per-team mean differences: each experimental group
# contributes one sample, and the result (F statistic, p-value) is displayed.
anova_samples = (
    ai_mean_of_differences_of_two_phases,
    newhire_mean_of_differences_of_two_phases,
    control_mean_of_differences_of_two_phases,
)
f_oneway(*anova_samples)

F_onewayResult(statistic=2.0048468025567834, pvalue=0.14495854284025797)

In [53]:
"""
Null Hypothesis H0: The mean phase 1 minus phase 2 difference in collected ingredients is the same for all three groups.
Alternative Hypothesis H1: At least one group's mean difference differs from the others.

The result of the ANOVA test is as follows:

F-statistic: 2.0048468025567834
p-value: 0.14495854284025797

Interpretation:
The p-value of approximately 0.145 is greater than the common significance level of 0.05.
Therefore, based on this result, there is not enough evidence to reject the null hypothesis.

Conclusion:
The ANOVA test suggests that, at a significance level of 0.05,
there is no significant difference among the three groups in the mean change in the number of
collected ingredients between phase 1 and phase 2 of the experiment."""

'\nNull Hypothesis H0: There is no significant difference in mean performance among the three groups.\nAlternative Hypothesis H1: There is a significant difference.\n\nThe result of the ANOVA test is as follows:\n\nF-statistic: 2.0048468025567834\np-value: 0.14495854284025797\n\nInterpretation:\nThe p-value of 0.11 is greater than the common significance level of 0.05. \nTherefore, based on this result, there is not enough evidence to reject the null hypothesis.\n\nConclusion:\nThe ANOVA test suggests that, at a significance level of 0.05, \nthere is no significant difference in the mean number of collected ingredients among the \nthree groups in phase 2 of the experiment.'