In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import sys

sys.path.insert(0, os.path.abspath('../'))

from config import SRC, MOCK_DATA
from src.conjoint.utilities import read_yaml

In [142]:
conjoint = pd.read_csv(MOCK_DATA / "raw_data_mock.csv", encoding='cp1252')

specs = read_yaml(MOCK_DATA / "specs.yaml")
renaming_specs = read_yaml(MOCK_DATA / "renaming_replacing.yaml")

In [10]:
RAW_FILES

({'raw_data': WindowsPath('C:/Users/sjurl/OneDrive/Desktop/MasterThesis/Analysis/conjoint/src/mock_data/raw_data.csv'),
  'mock_data': WindowsPath('C:/Users/sjurl/OneDrive/Desktop/MasterThesis/Analysis/conjoint/src/conjoint/data/mock_data/raw_data_mock.csv'),
  'specs': WindowsPath('C:/Users/sjurl/OneDrive/Desktop/MasterThesis/Analysis/conjoint/src/data_management/specs.yaml'),
  'renaming_replacing': WindowsPath('C:/Users/sjurl/OneDrive/Desktop/MasterThesis/Analysis/conjoint/src/data_management/renaming_replacing.yaml')},)

In [7]:
RAW_FILES ={
        "raw_data": SRC / "mock_data" / "raw_data.csv",
        "mock_data" : MOCK_DATA / "raw_data_mock.csv", 
        "specs": SRC / "data_management" / "specs.yaml",
        "renaming_replacing" : SRC / "data_management" / "renaming_replacing.yaml",
    },

#conjoint = pd.read_csv(RAW_FILES["mock_data"], encoding='cp1252')

In [143]:
def clean_data(df, specs, renaming_specs):

    # Initial Cleaning
    df = df.drop([0, 1])
    
    df = df.replace(renaming_specs['utility'])
    for category in list(renaming_specs['attributes'].keys()):
        df = df.replace(renaming_specs['attributes'][category])

    # Keep Variables
    variable_specs = specs["variables"]
    groups_of_vars = variable_specs.keys()
    vars_to_keep=[]
    for group in groups_of_vars:
        vars_to_keep += variable_specs[group]["names"]

    df = df[vars_to_keep]

    # Transform types
    for group in groups_of_vars:
        if variable_specs[group]["type"] == 'categorical':
            df[variable_specs[group]["names"]] = df[variable_specs[group]["names"]].astype('category')

        elif variable_specs[group]["type"] == 'numerical':
            df[variable_specs[group]["names"]] = df[variable_specs[group]["names"]].astype('int')

        else:
            continue

    
    df['ID'] = range(1, len(df) + 1)
    #df = df.set_index("ID")

    # Add inconsistency indicator
    df = _inconsistency(df)

    return df

def _inconsistency(df):
    """Looks for inconsistency between preferred package and  choices and likert rating"""
    
    for round in range(1,7):
        df[f'likert_choice_{round}_A'] = df[f'likert_{round}_1'] >= df[f'likert_{round}_2']
        df[f'likert_choice_{round}_B'] = df[f'likert_{round}_1'] <= df[f'likert_{round}_2']
    
        df[f'inconsistency_{round}'] = ((df[f'likert_choice_{round}_A'] == 0) & (df[f'choice_set_{round}'] == 'A')) | ((df[f'likert_choice_{round}_B'] == 0) & (df[f'choice_set_{round}'] == 'B'))
    
    return df

def make_long(df):
    """Transforms the wide-format survey daya into a long-format DataFrame with repeated measures.

    This function takes the wide format from the raw survey data, where each row represents a participant
    and each column corresponds different settings for the different choice sets across multiple rounds. 
    It converts the DataFrame into a long format, with each row representing an individual choice round, 
    associated with the participant and the round.

    Parameters:
        df (pandas.DataFrame): The input DataFrame in wide format with participant data and multiple rounds.

    Returns:
        pandas.DataFrame: A long-format DataFrame.

    """

    long_df = pd.DataFrame()
    for round in range(1,7):
        df_temp = df[['ID',
                      f'round_{round}_att_1_a', 
                      f'round_{round}_att_1_b',
                      f'round_{round}_att_2_a', 
                      f'round_{round}_att_2_b', 
                      f'round_{round}_att_3_a', 
                      f'round_{round}_att_3_b', 
                      f'round_{round}_att_4_a', 
                      f'round_{round}_att_4_b', 
                      f'round_{round}_att_5_a', 
                      f'round_{round}_att_5_b', 
#                      f'round_{round}_att_6_a', 
#                      f'round_{round}_att_6_b',  
                      f'choice_set_{round}', 
                      f'likert_{round}_1', 
                      f'likert_{round}_2',
                      f'inconsistency_{round}']]
        df_temp['round'] = round
        df_temp = df_temp.rename(columns={
            f'round_{round}_att_1_a' : 'att_1_A', 
            f'round_{round}_att_1_b' : 'att_1_B',
            f'round_{round}_att_2_a' : 'att_2_A', 
            f'round_{round}_att_2_b' : 'att_2_B', 
            f'round_{round}_att_3_a' : 'att_3_A', 
            f'round_{round}_att_3_b' : 'att_3_B', 
            f'round_{round}_att_4_a' : 'att_4_A', 
            f'round_{round}_att_4_b' : 'att_4_B', 
            f'round_{round}_att_5_a' : 'att_5_A', 
            f'round_{round}_att_5_b' : 'att_5_B', 
#            f'round_{round}_att_6_a' : 'att_6_A', 
#            f'round_{round}_att_6_b' : 'att_6_B', 
            f'choice_set_{round}' : 'choice', 
            f'likert_{round}_1' : 'utility_A',
            f'likert_{round}_2' : 'utility_B',
            f'inconsistency_{round}' :  'inconsistent'
        })
        long_df = pd.concat([long_df, df_temp])

    first_columns = ['ID', 'round']
    all_cols = long_df.columns

    new_order = first_columns + [c for c in all_cols if c not in first_columns]

    long_df = long_df[new_order]
    long_df = long_df.set_index(['ID', 'round'])
    long_df = long_df.sort_index()
    
    return long_df

In [144]:
conjoint = clean_data(conjoint, specs, renaming_specs)

#### Make long format

In [145]:
conjoint_long = make_long(conjoint)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['round'] = round
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['round'] = round
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['round'] = round
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [146]:
conjoint_long

Unnamed: 0_level_0,Unnamed: 1_level_0,att_1_A,att_1_B,att_2_A,att_2_B,att_3_A,att_3_B,att_4_A,att_4_B,att_5_A,att_5_B,choice,utility_A,utility_B,inconsistent
ID,round,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1,Stop&Reduce,StatusQuo,LowInvestment,HighInvestment&Int&Consideration,LowPrices,EnergyAccess,EarlyPension,CreateJobs,CivilNGO,LabourUnion,B,4,5,False
1,2,PhaseOut,StatusQuo,HighInvestment&Int,LowInvestment&LowConsideration,Nothing,EnergyAccess,JobGuarantee,JobGuarantee,EnergySector,Media,A,5,3,False
1,3,StatusQuo,Stop&Maintain,LowInvestment,LowInvestment&LowConsideration,Transfers,Nothing,Nothing,Retrain,CivilNGO,CentralGov,A,6,4,False
1,4,Stop&Maintain,Stop&Reduce,HighInvestment&Int&Consideration,HighInvestment&Int&Consideration,Nothing,Nothing,CreateJobs,Retrain,Media,CentralGov,A,5,2,False
1,5,StatusQuo,Stop&Reduce,LowInvestment,HighInvestment&Int,Transfers,EnergyAccess,EarlyPension,EarlyPension,CivilNGO,LabourUnion,B,4,6,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,2,Stop&Reduce,Stop&Reduce,HighInvestment&Int&Consideration,HighInvestment&Int&Consideration,EnergyAccess,Nothing,Nothing,Nothing,Researchers,LabourUnion,B,3,5,False
11,3,Stop&Reduce,Stop&Maintain,LowInvestment,HighInvestment&Int&Consideration,Transfers,LowPrices,CreateJobs,CreateJobs,LabourUnion,CentralGov,A,6,4,False
11,4,PhaseOut,Stop&Reduce,HighInvestment&Int&Consideration,HighInvestment&Int&Consideration,Transfers,Transfers,Nothing,JobGuarantee,LocalGov,Researchers,B,7,7,False
11,5,Stop&Maintain,Stop&Reduce,HighInvestment&Int&Consideration,HighInvestment&Int&Consideration,Transfers,EnergyAccess,Nothing,Retrain,CivilNGO,LabourUnion,A,7,6,False
