In [69]:
import pandas as pd
import numpy as np
import random
import datetime

Creating the cells and questions

In [70]:
cells = []
for x in range(1,7):
    cells.append('Cell %d' % x)

In [71]:
pairs = {}
for x in cells:
    questions = []
    for i in range((int(x[-1])*4)-3,(int(x[-1])*4)+1):
        questions.append('Question %d' % i)
    pairs[x] = questions
    


In [72]:
cell_df = pd.DataFrame(pairs).T.reset_index().rename(columns={'index': 'Cell.name'})
cell_df = cell_df.melt(id_vars=['Cell.name'], value_vars=cell_df.columns[1:], value_name='Question').drop(columns=['variable']).sort_values(by=['Cell.name','Question']).reset_index(drop=True)

In [73]:
cell_df.head(10)

Unnamed: 0,Cell.name,Question
0,Cell 1,Question 1
1,Cell 1,Question 2
2,Cell 1,Question 3
3,Cell 1,Question 4
4,Cell 2,Question 5
5,Cell 2,Question 6
6,Cell 2,Question 7
7,Cell 2,Question 8
8,Cell 3,Question 10
9,Cell 3,Question 11


In [74]:
users = list(range(1,1801))
random.shuffle(users)

In [75]:
demographics = {
    'reviewer_gender': {'categories': ['Man','Woman','Non-binary','Prefer not to say'],
                'weights': [0.48,0.48,0.02,0.02]},
    'reviewer_age': {'categories': ['18-24','25-34','35-44','45-54','55-64','65+','Prefer not to say'],
            'weights': [0.15,0.25,0.2,0.2,0.1,0.05,0.05]},
    'reviewer_ethnicity': {'categories': ['White','African/Black/Black British','S.Asian','SE.Asian','Mixed/Multiple','Other','Prefer not to say'],
                  'weights': [0.7,0.1,0.05,0.05,0.05,0.03,0.02]},
    'reviewer_religion': {'categories': ['Christian','Muslim','Hindu','Buddhist','Jewish','Sikh','None','Other','Prefer not to say'],
                 'weights': [0.4,0.2,0.1,0.05,0.02,0.02,0.1,0.05,0.06]},
    'reviewer_disability': {'categories': ['Yes','No','Prefer not to say'],
                   'weights': [0.1,0.85,0.05]},
    'reviewer_sexuality': {'categories': ['Heterosexual or straight','Gay','Lesbian','Bisexual','Asexual','Prefer not to say'],
                  'weights': [0.8,0.04,0.04,0.03,0.01,0.08]},
    'reviewer_mental_illness': {'categories': ['Yes','No','Prefer not to say'],
                        'weights': [0.1,0.85,0.05]},
}

In [76]:
def generate_demographics(user_list):
    user_demographics = {}
    for x in user_list:
        user_demographics[x] = {}
        for demographic, values in demographics.items():
            user_demographics[x][demographic] = np.random.choice(values['categories'], p=values['weights'])
    return user_demographics


In [77]:
user_demo = generate_demographics(users)

In [78]:
user_df = pd.DataFrame(user_demo).T.reset_index().rename(columns={'index': 'reviewer_id'})

In [79]:
def choose_reviewers(user_id,user_list):
    reviewers = np.random.choice([x for x in user_list if x != user_id],5,replace=False).tolist()
    reviewers.append(user_id)
    return reviewers

In [80]:
def create_surveys(n_surveys, users):
    surveys = {}
    random.shuffle(cells)
    for x in range(n_surveys):
        survey = {}
        cells_selected = np.array_split(cells, n_surveys)[x].tolist()
        survey['cells'] = cells_selected
        #survey['users'] = np.array_split(users, n_surveys)[x].tolist()
        survey['users'] = users
        surveys[x] = survey
    return surveys

In [81]:
def get_reviews(survey):
    reviews = {}
    for x in survey['users']:
        reviewers = choose_reviewers(x,survey['users'])
        reviews[x] = reviewers
    return reviews
    

In [82]:
def create_populations(n_pops,n_surveys,user_list):
    population_users = {}
    for x in range(n_pops):
        population_users[x] = np.array_split(user_list, n_pops)[x].tolist()
    population_surveys = {}
    for x in range(n_pops):
        population_surveys[x] = create_surveys(n_surveys, population_users[x])
    for x in population_surveys:
        for y in population_surveys[x]:
            population_surveys[x][y]['reviews'] = get_reviews(population_surveys[x][y])
            population_surveys[x][y].pop('users')
            survey_id = x*n_surveys + y
            population_surveys[x][survey_id] = population_surveys[x].pop(y)
            
        
        
    return population_surveys
    

In [83]:
def random_date(start,end):
    delta = end - start
    delta_in_second = delta.total_seconds()
    random_second = random.randrange(int(delta_in_second))
    return start + datetime.timedelta(seconds=random_second)

In [84]:
start = datetime.datetime(2020,1,1)
end = datetime.datetime(2024,12,31)

In [85]:
def relationship(user,reviewer):
    if user == reviewer:
        return 'self'
    else:
        return np.random.choice(['other','manager'],p=[0.75,0.25])

In [86]:
def org_to_dataframe(org):
    row = []
    report_review_id = 0
    review_id = 0
    for x in org:
        pop_id = x
        for y in org[x]:
            survey_id = y
            created_at = random_date(start,end)
            for z in org[x][y]['cells']:
                cells = z
                for a in org[x][y]['reviews']:
                    user_id = a
                    for b in org[x][y]['reviews'][a]:
                        reviewers = b
                        relationshiptouser = relationship(a,b)
                        row.append([report_review_id,review_id,created_at,pop_id,
                                    survey_id,user_id,reviewers,relationshiptouser,cells])
                        report_review_id += 1
                    review_id += 1
    df = pd.DataFrame(row, columns=['report_review.id','review.id','created_at','PopID',
                                    'survey_id','user_id','reviewer_id','Relationshiptouser',
                                    'Cell.name'])
    return df

In [87]:
users[:600]

[870,
 1295,
 506,
 490,
 1792,
 1736,
 894,
 350,
 820,
 743,
 730,
 614,
 902,
 1280,
 545,
 1331,
 90,
 756,
 412,
 118,
 379,
 1615,
 586,
 1686,
 1248,
 313,
 702,
 563,
 148,
 121,
 6,
 767,
 1443,
 1227,
 361,
 512,
 1368,
 1094,
 513,
 888,
 793,
 523,
 1678,
 255,
 1014,
 76,
 1545,
 634,
 1531,
 1726,
 1600,
 1049,
 1543,
 314,
 1041,
 229,
 623,
 1597,
 1722,
 1081,
 326,
 1312,
 1743,
 850,
 761,
 847,
 413,
 1463,
 1399,
 136,
 464,
 963,
 985,
 194,
 359,
 105,
 1430,
 893,
 618,
 588,
 1159,
 1727,
 912,
 632,
 1029,
 402,
 1055,
 1657,
 907,
 1473,
 489,
 1721,
 615,
 407,
 1481,
 992,
 549,
 347,
 493,
 322,
 1195,
 1763,
 1759,
 1025,
 786,
 957,
 1523,
 1788,
 773,
 271,
 1256,
 378,
 364,
 1775,
 1270,
 1717,
 425,
 622,
 991,
 510,
 1098,
 897,
 380,
 1277,
 1037,
 499,
 517,
 311,
 1445,
 908,
 566,
 95,
 15,
 66,
 788,
 247,
 824,
 257,
 1689,
 1791,
 746,
 1254,
 641,
 13,
 683,
 1755,
 1382,
 418,
 1327,
 243,
 529,
 713,
 878,
 1235,
 1487,
 204,
 1169,
 1112,

In [88]:
def get_score(relationship):
    if relationship == 'manager':
        return np.random.choice(range(11), p=[0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.11, 0.12, 0.2, 0.24, 0.1])
    elif relationship == 'other':
        return np.random.choice(range(11), p=[0.01, 0.02, 0.03, 0.05, 0.07, 0.1, 0.13, 0.15, 0.25, 0.14, 0.05])
    else:
        return np.random.choice(range(11))

In [89]:
id_list = ['PopID','survey_id','report_review.id','review.id']

In [90]:
start_dict ={
    'PopID': 0,
    'survey_id': 0,
    'report_review.id': 0,
    'review.id': 0
}

In [91]:
def create_organisation(n_pops,n_surveys,users,org_id):
    org = create_populations(n_pops,n_surveys,users)
    org_df = org_to_dataframe(org)
    org_df = pd.merge(org_df,cell_df,on=['Cell.name'])
    org_df['original_value'] = org_df['Relationshiptouser'].apply(lambda x: get_score(x))
    org_df = pd.merge(org_df,user_df,on='reviewer_id')
    org_df.sort_values(by='report_review.id', inplace=True)
    org_df['OrgID'] = org_id
    for x in id_list:
        org_df[x] = org_df[x] + start_dict[x]
        start_dict[x] = org_df[x].max() + 1

    return org_df

In [92]:
org1 = create_organisation(3,3,users[:600],1)
start_dict

{'PopID': 3, 'survey_id': 9, 'report_review.id': 21600, 'review.id': 3600}

In [93]:
org2 = create_organisation(3,3,users[601:1200],2)
start_dict

{'PopID': 6, 'survey_id': 18, 'report_review.id': 43164, 'review.id': 7194}

In [94]:
org3 = create_organisation(3,3,users[1201:],3)
start_dict

{'PopID': 9, 'survey_id': 27, 'report_review.id': 64728, 'review.id': 10788}

In [95]:
df = pd.concat([org1,org2,org3],ignore_index=True)

In [96]:
df['created_at'] = df['created_at'].apply(lambda x: random_date(x-datetime.timedelta(days=14),x))
df['deadline'] = df['created_at'].apply(lambda x: x + datetime.timedelta(days=14))

In [97]:
df.head()

Unnamed: 0,report_review.id,review.id,created_at,PopID,survey_id,user_id,reviewer_id,Relationshiptouser,Cell.name,Question,original_value,reviewer_gender,reviewer_age,reviewer_ethnicity,reviewer_religion,reviewer_disability,reviewer_sexuality,reviewer_mental_illness,OrgID,deadline
0,0,0,2020-08-31 11:59:14,0,0,870,1722,other,Cell 5,Question 17,10,Man,55-64,White,Christian,No,Heterosexual or straight,No,1,2020-09-14 11:59:14
1,0,0,2020-09-03 13:11:26,0,0,870,1722,other,Cell 5,Question 18,9,Man,55-64,White,Christian,No,Heterosexual or straight,No,1,2020-09-17 13:11:26
2,0,0,2020-08-27 08:44:11,0,0,870,1722,other,Cell 5,Question 19,8,Man,55-64,White,Christian,No,Heterosexual or straight,No,1,2020-09-10 08:44:11
3,0,0,2020-08-26 18:46:01,0,0,870,1722,other,Cell 5,Question 20,8,Man,55-64,White,Christian,No,Heterosexual or straight,No,1,2020-09-09 18:46:01
4,1,0,2020-08-23 20:02:10,0,0,870,1759,other,Cell 5,Question 19,4,Man,25-34,White,Christian,Prefer not to say,Prefer not to say,No,1,2020-09-06 20:02:10


In [98]:
# Add patterns to the synthetic dataset

# 1. PopID/OrgID trend: Each PopID and OrgID gets a base adjustment
popid_trend = {pop: np.random.uniform(-1, 1) for pop in df['PopID'].unique()}
orgid_trend = {org: np.random.uniform(-1, 1) for org in df['OrgID'].unique()}

# 2. Cell.name and Question effect: Each cell and question gets a small adjustment
cell_effect = {cell: np.random.uniform(-1, 1) for cell in df['Cell.name'].unique()}

question_effect = {question: np.random.normal(0,0.5) for question in df['Question'].unique()}

# 3. Demographic effects (example: gender, age, ethnicity)
gender_effect = {
    'Man': 1.0,
    'Woman': 0.0,
    'Non-binary': -1.0,
    'Prefer not to say': 0.5
}
age_effect = {
    '18-24': -0.5,
    '25-34': 0.0,
    '35-44': 0.5,
    '45-54': 1.0,
    '55-64': 0.5,
    '65+': -0.5,
    'Prefer not to say': 0.0
}
ethnicity_effect = {
    'White': 0.5,
    'African/Black/Black British': -1.0,
    'S.Asian': 0.2,
    'SE.Asian': 0.2,
    'Mixed/Multiple': -0.2,
    'Other': -0.5,
    'Prefer not to say': 0.0
}

# 4. Apply all effects to create a new score column
def add_pattern(row):
    score = row['original_value']
    score += popid_trend[row['PopID']]
    score += orgid_trend[row['OrgID']]
    score += cell_effect[row['Cell.name']]
    score += question_effect[row['Question']]
    score += gender_effect.get(row['reviewer_gender'], 0)
    score += age_effect.get(row['reviewer_age'], 0)
    score += ethnicity_effect.get(row['reviewer_ethnicity'], 0)
    # Add some noise
    score += np.random.normal(0, 0.5)
    # Clamp to 0-10 and round
    return int(np.clip(round(score), 0, 10))


df['original_value'] = df.apply(add_pattern, axis=1)

In [99]:
df.head()

Unnamed: 0,report_review.id,review.id,created_at,PopID,survey_id,user_id,reviewer_id,Relationshiptouser,Cell.name,Question,original_value,reviewer_gender,reviewer_age,reviewer_ethnicity,reviewer_religion,reviewer_disability,reviewer_sexuality,reviewer_mental_illness,OrgID,deadline
0,0,0,2020-08-31 11:59:14,0,0,870,1722,other,Cell 5,Question 17,10,Man,55-64,White,Christian,No,Heterosexual or straight,No,1,2020-09-14 11:59:14
1,0,0,2020-09-03 13:11:26,0,0,870,1722,other,Cell 5,Question 18,10,Man,55-64,White,Christian,No,Heterosexual or straight,No,1,2020-09-17 13:11:26
2,0,0,2020-08-27 08:44:11,0,0,870,1722,other,Cell 5,Question 19,10,Man,55-64,White,Christian,No,Heterosexual or straight,No,1,2020-09-10 08:44:11
3,0,0,2020-08-26 18:46:01,0,0,870,1722,other,Cell 5,Question 20,10,Man,55-64,White,Christian,No,Heterosexual or straight,No,1,2020-09-09 18:46:01
4,1,0,2020-08-23 20:02:10,0,0,870,1759,other,Cell 5,Question 19,9,Man,25-34,White,Christian,Prefer not to say,Prefer not to say,No,1,2020-09-06 20:02:10


In [101]:
df.groupby(['OrgID','PopID','Cell.name','Question']).agg({'original_value': ['mean','std']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,original_value,original_value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std
OrgID,PopID,Cell.name,Question,Unnamed: 4_level_2,Unnamed: 5_level_2
1,0,Cell 1,Question 1,8.535833,2.077815
1,0,Cell 1,Question 2,8.806667,1.874980
1,0,Cell 1,Question 3,8.450000,2.123187
1,0,Cell 1,Question 4,8.403333,2.114091
1,0,Cell 2,Question 5,7.435000,2.490169
...,...,...,...,...,...
3,8,Cell 5,Question 20,7.975712,2.313323
3,8,Cell 6,Question 21,7.575377,2.511619
3,8,Cell 6,Question 22,7.443886,2.457133
3,8,Cell 6,Question 23,6.781407,2.684897


In [102]:
df.to_csv('data/sample.csv',index=False)