#### Import modules

In [1]:
import numpy as np  # Matrix  manipulation
import os  # Filesystem support
import pandas as pd  # Dataframe handling
import Levenshtein  # Edit distance calculations

#### Set project folders

In [2]:
PROJECT_FOLDER = os.path.dirname(os.path.dirname(os.getcwd()))
PROCESSED_DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data', 'processed')
FINAL_DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data', 'final')

#### Pandas settings 

In [3]:
pd.set_option("display.precision", 3)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.max_rows", 50)

#### Load dataframe

In [4]:
DATA = os.path.join(
    PROCESSED_DATA_FOLDER, 
    'experiment_2',
    'data_processed.feather'
)

df = pd.read_feather(DATA)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1799 entries, 0 to 1798
Columns: 8322 entries, Start Date to Id
dtypes: float64(7372), int64(18), object(932)
memory usage: 114.2+ MB


#### Drop timing variables and rename column headers

In [5]:
to_drop = [
    'First Click',
    'Last Click',
    'Click Count',
]

columns_to_drop = [x for x in df.columns for y in to_drop if y in x]
df = df.drop(columns_to_drop, axis=1)\
    .rename(lambda x: x.split(' - ')[0], axis=1)\
    .rename(lambda x: x.replace('_Page Submit', '_timer'), axis=1)

#### Merge colums with consent answer for each group & drop columns

In [6]:
consent_cols = [f'G{x}Q1' for x in range(1, 16, 1)]
df['consent'] = df[consent_cols].sum(axis=1)
df = df.drop(consent_cols, axis=1)

#### Merge columns with submitted fragments for each group into one column & drop columns  

In [7]:
cols = {
    f'Q{x}_answer': [f'G{y}Q{z}' for y in range(1, 16, 1)]
    for x, z in zip(range(1, 111, 1), range(5, 442, 4))
}

for quest, columns in cols.items():
    df.loc[:, quest] = df.loc[:, columns].fillna('').sum(axis=1)
    df = df.drop(columns, axis=1)

#### Replace empty strings with nan values

In [8]:
df.loc[:, cols.keys()] = df.loc[:, cols.keys()].replace({'': np.nan})

#### Display dataframe section with submitted fragments (first five questions only)

In [9]:
df.loc[0:10, list(cols.keys())[:5]]

Unnamed: 0,Q1_answer,Q2_answer,Q3_answer,Q4_answer,Q5_answer
0,"July 20th, 1910.",in answer to cable in regard to option on Lane...,Replying to your favor of the 17th:,one would not interest him. He thanks you for...,"London, S, W., England."
1,"15 Old Bond Street,","Rembrandt purchased for me shipped Paris, see ...",have taken liberties. Have written.,"volume 2, 13, April 1914, edited by Wilhelm R....","New York, June 10, 1909."
2,"""Purchase Brower"".",then advise you definitely. Please reply to me...,ing his attention to the objecte.,They were purchased by the Duke of Bridgewater...,"Mrs. Frances W. Marshall,"
3,"December 19, 1906.",Replying to your favor of the 18th. It gives m...,"per voucher enclosed, and oblige,","He, however, would be glad to hear from you re...","Fry care Kneedler, Paris."
4,"23 Place Vendome,",Express and have it delivered to me at Trinity...,"With the Compliments of the Season,",seventy-three hundred and twenty (7320) shares...,"Prides Crossing, Mass."
5,"23 Place Vendome,",Express and have it delivered to me at Trinity...,"With the Compliments of the Season,",seventy-three hundred and twenty (7320) shares...,"Prides Crossing, Mass."
6,"Mr. John W. Beatty,","purchased., some time ago, the portrait of Sir...",high in fact for me to entertain.,"information came to me very confidentially, an...",francs. Rest soon. Write.
7,"645 Fifth Avenue,",to purchase anything unless it ranks with the ...,difficulty in finding at La Sarras.,"dining room, and would be willing to pay you $...",ed from time to time.
8,"25 Cld Bond Street,","of the 18th, that the titles of the pictures a...",Thanks not in market at present.,I enclose herewith. I have replied to the gent...,"H. Silva White. Esq.,"
9,"August 30, 1910.","As to their value, I am without any data to go...","Mesers. C. D. Barney & Company ,","for it. However, it is not a very important ma...","Always your sincerely,"


#### Count number of submitted fragments (nb: the last submission is not counted)) 

In [10]:
df['Counter'] = df['fragments'] - 1

#### Count non-empty submissions 

In [11]:
def count_non_empty(row: pd.Series) -> int:
    """
    Count actual number of submitted fragments.
    
    This function counts only non-empty submissions. 
    It takes account of the fact that workers
    skip fragments.
    """
    raw_submits = row['Counter']
    answer_columns = [f'Q{x}_answer' for x in range(1, raw_submits + 1)]
    return row[answer_columns].count()

df['Counter_real'] = df.apply(count_non_empty, axis=1)

#### Generate columns with counter for emty submissions and share of empty submissions

In [12]:
df['Counter_empty'] = df['Counter'] - df['Counter_real']
df['Counter_quota'] = df['Counter_empty'] / df['Counter']

#### Merge columns with intervention timer and drop columns  

In [13]:
intervent_cols = [
    'Q4_timer',
    'Q7_timer',
    'Q10_timer',
    'Q13_timer',
]

df['Intervention_timer'] = df[intervent_cols].sum(axis=1)
df = df.drop(intervent_cols, axis=1)

#### Merge columns with submission timer for each group

In [14]:
timer_cols = {
    f'Q{x}_timer': [f'G{y}Q{z}_timer' for y in range(1, 16, 1)]
    for x, z in zip(range(1, 111, 1), range(2, 439, 4))
}

for quest, columns in timer_cols.items():
    df.loc[:, quest] = df.loc[:, columns].sum(axis=1)
    df = df.drop(columns, axis=1)

#### Replace null timer with nan

In [15]:
df.loc[:, timer_cols.keys()] = df.loc[:, timer_cols.keys()].replace({0.0:np.nan})

#### Typecast columns with timing info to datetime objects

In [16]:
timseries = ['Start Date', 'End Date']
for series in timseries:
    df[series] = pd.to_datetime(df[series], infer_datetime_format=True)

df['duration_survey'] = (df['End Date'] - df['Start Date']).dt.seconds
df['duration_work'] = df['end_time'] - df['start_time']

#### Identify subjects for which the timer did not work (i.e. 10 min. +/- 5%)

In [17]:
low_limit = (df['duration_work'] < 600 * 0.95)
up_limit = (df['duration_work'] > 600 * 1.05)
broken_timer = (low_limit| up_limit)

broken_columns = ['treatment', 'duration_work', 'fragments']
df_broken = df.loc[broken_timer, broken_columns]\
    .sort_values(by='duration_work')\
    .rename(lambda x: x.capitalize().replace('_', ' '), axis=1)

df_broken

Unnamed: 0,Treatment,Duration work,Fragments
475,4,632,58
81,3,633,39
1559,3,635,19
891,1,636,6
1348,2,638,34
290,4,641,54
184,4,642,79
1015,2,655,54
743,4,660,55
782,3,679,26


#### Remove subjects for which the timer did not work

In [18]:
df = df[~broken_timer]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1768 entries, 0 to 1798
Columns: 264 entries, Start Date to duration_work
dtypes: datetime64[ns](2), float64(152), int64(21), object(89)
memory usage: 3.6+ MB


#### Make column with treatment names

In [19]:
treatment_dict = {
    1: 'Neutral',
    2: 'Charisma without goal',
    3: 'Goal',
    4: 'Full charisma'
}

df['Treatment_str'] = df['treatment'].replace(treatment_dict)

#### Make columns with binary variable for main treatment dimensions

In [20]:
df['Charisma'] = df['treatment'].isin([2, 4]).astype(int)
df['Goal'] = df['treatment'].isin([3, 4]).astype(int)

#### Rename columns with background questions

In [21]:
var_dict = {
    'Q17': 'Enjoyment',
    'Q18': 'Motivation',
    'Q19': 'Future_motivation',
    'Q20': 'Frick_knowledge',
    'Q21': 'Identify_mission',
    'Q22': 'Shedule_change',
    'Q23': 'Time_mturk_increase',
    'Q24': 'Hours_mturk',
    'Q25': 'Concentration',
    'Q26': 'Mobile_device',
    'Q27': 'Touch_typing',
    'Q28': 'Hours_keyboard',
    'Q29': 'Hours_keyboard_total',
    'Q30': 'Age',
    'Q31': 'Gender',
    'Q32': 'Education',
    'Q33': 'Comments',
    'Q1_browser': 'Browser',
    'Q1_operating system': 'Operating_system',
    'Q1_resolution': 'Resolution',
    'Q1_version': 'Version',
}

df = df\
    .rename(var_dict, axis=1)\
    .rename(lambda x: x.capitalize(), axis=1)

#### Generate columns with indicator variable for different genders

In [22]:
df[['Male', 'Female', 'Diverse']] = pd.get_dummies(df['Gender'])

#### Drop uneccessary columns

In [23]:
df = df.drop(df.columns[7:11], axis=1)

#### Remove leading/trailing whitespace and linebreaks

In [24]:
def remove_whitespace_and_breaks(x):
    if isinstance(x, str):
        return x.strip().replace('\n', ' ').replace('\r', '')
    else:
        return x

df = df.applymap(remove_whitespace_and_breaks)

#### Make dataframe with empty columns for edit distance and error_share, then join with main dataframe

In [25]:
edit_cols = [
    col for x in range(1, 111)
    for col in (f'Q{x}_solution', f'Q{x}_distance', f'Q{x}_editratio')
]

edit_frame = pd.DataFrame(index=df.index, columns=edit_cols)
df = df.merge(edit_frame, left_index=True, right_index=True)

#### Load dataframe with correct solutions

In [26]:
path = os.path.join(PROCESSED_DATA_FOLDER, 'experiment_2', 'fragments.xlsx')
df_solutions = pd.read_excel(path, index_col=0, engine='openpyxl')
df_solutions.head()

Unnamed: 0,Solutions_group_1,Solutions_group_2,Solutions_group_3,Solutions_group_4,Solutions_group_5,Solutions_group_6,Solutions_group_7,Solutions_group_8,Solutions_group_9,Solutions_group_10,Solutions_group_11,Solutions_group_12,Solutions_group_13,Solutions_group_14,Solutions_group_15
Q1_solution,"September 12, 1905.","15 Old Bond Street,","December 19, 1906.","23 Place Vendome,","(about June 3, 1909.","645 Fifth Avenue,","January 10, 1910.","January 10, 1910.","Mr. John W. Beatty,","January 24, 1910.","""Purchase Brower"".","July 20th, 1910.","25 Old Bond Street,","July 21st, 1910.","August 30, 1910."
Q2_solution,"You did not enclose bill for premium, but as w...","Rembrandt purchased for me shipped Paris, see ...",Replying to your favor of the 18th. It gives m...,Express and have it delivered to me at Trinity...,"an introduction to Mademoiselle Ogiz, whom you...",to purchase anything unless it ranks with the ...,has no objections to the reproduction of the R...,Your note of the 17th is a surprise. I do not ...,"purchased, some time ago, the portrait of Sir ...",tioned is certainly very high. I probably will...,then advise you definitely. Please reply to me...,in answer to cable in ragard to option on Lane...,"of the 18th, that the titles of the pictures a...",I enclose you a memorandum of the transaction ...,"As to their value, I am without any data to go..."
Q3_solution,"""Mussel Gatherer"" would seem fair.",have taken liberties. Have written.,"per voucher enclosed, and oblige,","With the Compliments of the Season,","Mr. T. Jefferson Coolidge, Jr.,",difficulty in finding at La Sarras.,matter to him when he sees him.,"Thanking you for your trouble, I am",high in fact for me to entertain.,I am in receipt of your favor of,ing his attention to the objects.,Replying to your favor of the 17th:,"""Thanks not in market at present.",Senora Dona Maria Martinez da Puga.,"Messrs. C. D. Barney & Company,"
Q4_solution,interest us but you did not name a price; I pr...,"volume 2, 13, April 1914, edited by Wilhelm R....","He, however, would be glad to hear from you re...",seventy-three hundred and twenty (7320) shares...,"22d ultimo, that he is always in the market fo...","dining room, and would be willing to pay you $...","on the third floor, and would be glad if you w...","sent him, and acknowledge with thanks the book...","information came to me very confidentially, an...",him that if he ever did think of selling any o...,They were purchased by the Duke of Bridgewater...,one would not interest him. He thanks you for ...,I enclose herewith. I have replied to the gent...,three hundred was not legitimate but made spoi...,"for it. However, it is not a very important ma..."
Q5_solution,"To Mr. H. H. Farjeon,","New York, June 10, 1909.","Fry care Knoedler, Paris.","Prides Crossing, Mass.","Prides Crossing, Mass.",ad from time to time.,will bear good fruit.,My dear Mr. Burroughs:,francs. Rest soon. Write.,(Signed) R.E.Griswold.,"Mrs. Frances W. Marshall,","London, S. W., England.","H. Silva White, Esq.,",we know what he thinks.,"Always yours sincerely,"


#### Make columns with correct solutions

In [27]:
for group in range(1, 16, 1):
    for index, row in df_solutions.iterrows():
        df.loc[df['Group'] == group, index] = row[f'Solutions_group_{group}']

#### Calculate Levenshtein edit distance 

In [28]:
column_pairs = [(f'Q{x}_answer', f'Q{x}_solution') for x in range(1, 111)]

for index, row in df.iterrows():
    number_answers_submitted = row.at['Counter']
    columns_to_check = column_pairs[:number_answers_submitted]

    for number, (answer_col, solution_col) in enumerate(columns_to_check):
        answer = row[answer_col]
        solution = row[solution_col]

        if isinstance(answer, str):
            edit_distance = Levenshtein.distance(answer, solution)
            edit_ratio = edit_distance / max(len(answer), len(solution))

        else:
            edit_distance = np.NAN
            edit_ratio = np.NAN

        dist_column = f'Q{number + 1}_distance'
        ratio_column = f'Q{number + 1}_editratio'

        df.at[index, dist_column] = edit_distance
        df.at[index, ratio_column] = edit_ratio

#### Manually inspecting answers and evaluations for submitted fragments

In [29]:
x = 33
cols = [
    f'Q{x}_answer', 
    f'Q{x}_solution', 
    f'Q{x}_distance',
    f'Q{x}_editratio', 
    f'Q{x}_timer',
    'Counter'
]

df.loc[(df['Counter'] >= x), cols].head(15)

Unnamed: 0,Q33_answer,Q33_solution,Q33_distance,Q33_editratio,Q33_timer,Counter
0,"251 West 81st Street, N. Y.","251 West 81st Street, N. Y.",0,0.0,8.196,59
1,Messrs. M. Knoedler & Company.,Messrs. M. Knoedler & Company.,0,0.0,33.114,42
2,Messers. N. Knoedler & Company.,"Mesers. N. Knoedler & Company,",2,0.0645,12.926,53
3,I will be greatly obliged.,I will be greatly obliged.,0,0.0,7.955,35
4,great faith in your judgement.,great faith in your judgment.,1,0.0333,5.399,41
5,great faith in your judgement.,great faith in your judgment.,1,0.0333,9.133,35
6,"29, Inks Street, 3t. James's,","29, Duke Street, St. James's,",4,0.138,15.729,38
7,"636 Fifth Avenue, New York.","636 Fifth Avenue, New York.",0,0.0,15.09,48
9,referred to in your favor.,referred to in your favor.,0,0.0,7.07,41
10,"Metropolitan Museum of Art,","Metropolitan Museum of Art,",0,0.0,10.574,35


#### Order dataframe columns

In [30]:
cols = [
    'Q{0}_answer',
    'Q{0}_solution',
    'Q{0}_distance',
    'Q{0}_editratio',
    'Q{0}_timer',
]

ques_cols = [elem.format(x) for x in range(1, 111) for elem in cols]
main = list(df.columns.difference(ques_cols))
df = df.reindex(main + ques_cols, axis=1)

#### Save dataframe

In [31]:
folder = os.path.join(FINAL_DATA_FOLDER, 'experiment_2')
df.reset_index(drop=True).to_feather(os.path.join(folder, 'data_final.feather'))
df.to_csv(os.path.join(folder, 'data_final.csv'))

#### Convert to html

In [32]:
!jupyter nbconvert --output-dir='./docs' --to html 1_make_dataset.ipynb

[NbConvertApp] Converting notebook 1_make_dataset.ipynb to html
[NbConvertApp] Writing 656391 bytes to docs/1_make_dataset.html
