In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm, t
import matplotlib.pyplot as plt
import pingouin

### Hypotheses
Are users who first programmed as a child compensated higher than those that started as adults?


- H0: The mean compensation (in USD) is the same for those that coded first as a child and those that coded first as an adult.

    H0: mu_child = mu_adult (OR mu_child - mu_adult = 0)

- HA: The mean compensation (in USD) is greater for those that coded first as a child than for those that coded first as an adult.

    HA: mu_child > mu_adult (OR mu_child - mu_adult > 0)

Alpha = 0.1

### Read data

In [2]:
# Load the survey data from the sibling data directory.
# Forward slashes are accepted as path separators on Windows, macOS and Linux,
# whereas the backslash-separated form only resolves on Windows.
df = pd.read_feather("../data/stack_overflow.feather")

In [3]:
# Preview the first three rows of the loaded frame
df.iloc[:3]

Unnamed: 0,respondent,main_branch,hobbyist,age,age_1st_code,age_first_code_cut,comp_freq,comp_total,converted_comp,country,...,survey_length,trans,undergrad_major,webframe_desire_next_year,webframe_worked_with,welcome_change,work_week_hrs,years_code,years_code_pro,age_cat
0,36.0,"I am not primarily a developer, but I write co...",Yes,34.0,30.0,adult,Yearly,60000.0,77556.0,United Kingdom,...,Appropriate in length,No,"Computer science, computer engineering, or sof...",Express;React.js,Express;React.js,Just as welcome now as I felt last year,40.0,4.0,3.0,At least 30
1,47.0,I am a developer by profession,Yes,53.0,10.0,child,Yearly,58000.0,74970.0,United Kingdom,...,Appropriate in length,No,"A natural science (such as biology, chemistry,...",Flask;Spring,Flask;Spring,Just as welcome now as I felt last year,40.0,43.0,28.0,At least 30
2,69.0,I am a developer by profession,Yes,25.0,12.0,child,Yearly,550000.0,594539.0,France,...,Too short,No,"Computer science, computer engineering, or sof...",Django;Flask,Django;Flask,Just as welcome now as I felt last year,40.0,13.0,3.0,Under 30


In [4]:
# Significance level for the hypothesis test (matches "Alpha = 0.1" in the Hypotheses section)
alpha = 0.10

In [5]:
# Groupwise summary statistic: mean of converted_comp per age_first_code_cut group
(
    df
    .groupby("age_first_code_cut")["converted_comp"]
    .mean()
)

age_first_code_cut
adult    111313.311047
child    132419.570621
Name: converted_comp, dtype: float64

<img src="../data/standardising_test_statistic.jpg" width="500" height="600">

In [6]:
# Group the compensation column once and derive every per-group statistic from it:
# sample mean (xbar), sample standard deviation (s) and non-null count (n).
comp_by_group = df.groupby("age_first_code_cut")["converted_comp"]
xbar, s, n = comp_by_group.mean(), comp_by_group.std(), comp_by_group.count()

# Difference in sample means, child minus adult, to match HA: mu_child > mu_adult
numerator = xbar["child"] - xbar["adult"]

# Standard error of that difference, built from each group's own variance and size
denominator = np.sqrt(s["child"] ** 2 / n["child"] + s["adult"] ** 2 / n["adult"])

# Standardised test statistic
t_stat = numerator / denominator
print("The t_stat is:{}".format(t_stat))

The t_stat is:1.8699313316221844


In [7]:
# Degrees of freedom: total number of observations minus the two estimated group means.
# NOTE(review): the t_stat computed earlier uses unpooled (per-group) variances, for which
# the Welch-Satterthwaite approximation is the matching degrees of freedom; n1 + n2 - 2
# is the pooled-test value. Confirm this approximation is intended.
degrees_of_freedom = n["child"] + n["adult"] - 2

# Right-tailed test: the p-value is the upper-tail probability P(T >= t_stat) under the
# t-distribution (not the normal). The survival function sf computes 1 - cdf directly
# and avoids the floating-point cancellation of the explicit subtraction when the tail is small.
p_value = t.sf(t_stat, df=degrees_of_freedom)

# Print the p-value
print("The p-value is:{:.2f}".format(p_value))

# Decision rule: reject H0 when the p-value does not exceed the significance level
if p_value <= alpha:
    print("Reject H0 in favour of HA")
else:
    print("Failed to reject H0")

The p-value is:0.03
Reject H0 in favour of HA


### Using pingouin

In [8]:
# Run the unpaired two-sample t-test with pingouin on the same data.
# x is the "child" group and y the "adult" group, so alternative="greater"
# is the right-tailed test of HA: mu_child > mu_adult.
is_child = df["age_first_code_cut"] == "child"
is_adult = df["age_first_code_cut"] == "adult"

stats_df = pingouin.ttest(
    x=df.loc[is_child, "converted_comp"],
    y=df.loc[is_adult, "converted_comp"],
    alternative="greater",
)

In [9]:
# p-value column of the pingouin result (kept as a Series, as before)
p_val = stats_df["p-val"]

# Scalar p-value from the single result row. This is the value that must be reported and
# tested here: the cell previously printed and tested `p_value` from the manual
# calculation, so the pingouin result was never actually used.
pingouin_p_value = p_val.iloc[0]

# Print the p-value
print("The p-value is:{:.2f}".format(pingouin_p_value))

# Decision rule: reject H0 when the p-value does not exceed the significance level
if pingouin_p_value <= alpha:
    print("Reject H0 in favour of HA")
else:
    print("Failed to reject H0")

The p-value is:0.03
Reject H0 in favour of HA
