In [85]:
import pandas as pd
import requests
import json
import time
from pprint import pprint
import os
from qualtrics_utils import QualtricsHook

In [86]:
## Instantiate class
# Read the API key from the first `NAME=value` line of the local .env file.
with open('.env') as env_file:
    # Split on the first "=" only, so a key that itself contains "=" stays
    # whole, and strip the trailing newline / surrounding whitespace so it is
    # not sent as part of the token.
    QUALTRICS_KEY = env_file.readline().split("=", 1)[1].strip()
qualtrics_hook = QualtricsHook(QUALTRICS_KEY)


# List all surveys

In [87]:
# Fetch the survey list through the Qualtrics hook and wrap it in a DataFrame
# (one row per survey; see the .info() output below for the columns).
surveys = qualtrics_hook.get_survey_list()
surveys_df = pd.DataFrame(surveys)

### Survey list overview

In [103]:
# Column names, non-null counts and dtypes of the survey list.
surveys_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257 entries, 0 to 256
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            257 non-null    object
 1   name          257 non-null    object
 2   ownerId       257 non-null    object
 3   lastModified  257 non-null    object
 4   creationDate  257 non-null    object
 5   isActive      257 non-null    bool  
dtypes: bool(1), object(5)
memory usage: 10.4+ KB


In [6]:
# Keep only the surveys whose name contains "VW" and index them by survey id.
# (A `creationDate >= '2021-01-01'` filter was applied here at some point but
# is currently disabled.)
is_vw_survey = surveys_df["name"].str.contains("VW")
survey_df_2022 = surveys_df[is_vw_survey].set_index("id", drop=True)

# Survey ids in the same order as the filtered frame.
survey_id_list = list(survey_df_2022.index)

# Fetch Survey Data

In [13]:
# Call get_survey_data for every selected survey, printing a progress counter.
total_surveys = len(survey_id_list)
for position, survey_id in enumerate(survey_id_list, start=1):
    print(f"{position} out of {total_surveys}")
    qualtrics_hook.get_survey_data(survey_id)

1 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
2 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
3 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
4 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
5 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
6 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
7 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
8 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
9 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
10 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
11 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
12 out of 228
In Progress, check again in 30...
In Progress, check again in 30...
13 out of 228
In Progress

# Load to BigQuery (adapted from the GCS operator linked below)

https://github.com/deliveryhero/datahub-airflow/blob/main/dags/gcc_service/operators/qualtrics/qualtrics_to_gcs.py

In [31]:
import os
from google.cloud import bigquery
from dateutil.parser import parse as parse_datetime
import json
from collections import OrderedDict
import ijson
import decimal

In [32]:
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``decimal.Decimal`` values as strings.

    Every other unsupported type is delegated to the base class, which raises
    ``TypeError``.
    """

    def default(self, o):
        """Return ``str(o)`` for Decimals; defer to the base encoder otherwise."""
        if not isinstance(o, decimal.Decimal):
            return super().default(o)
        return str(o)

In [71]:
class QualtricsToBigQuery:
    """Convert downloaded Qualtrics survey exports to NDJSON and load them into BigQuery.

    For each survey, ``<qualtrics_dump_dir>/<filename>.json`` is streamed with
    ``ijson``; every item under ``responses`` is wrapped in a flat envelope
    record and written as one line of ``<qualtrics_to_bq>/<filename>.json``.
    A file with at least one record is appended to the table
    ``project_table_id.dataset_id.table_id``; a file without records is moved
    to the ``qualtrics_bad_file`` directory instead.
    """

    # Directory holding the raw survey exports, one ``<filename>.json`` each.
    # NOTE(review): "qualtric_dump" lacks the "s" used by the other directory
    # names — confirm it matches the directory the download step writes to.
    qualtrics_dump_dir: str = "qualtric_dump"
    # Directory receiving the newline-delimited JSON files that get loaded.
    qualtrics_to_bq: str = "qualtrics_to_bq"
    # Directory receiving converted files that turned out to hold no records.
    qualtrics_bad_file:str = "qualtrics_bad_files"
    # Constant written to the ``survey_type`` field of every record.
    survey_type:str = "global_pricing"
    # Written to the ``file_id`` field of every record; None unless overridden.
    file_id:str = None
    # Project / dataset / table making up the destination table reference.
    project_table_id:str = "dh-logistics-product-ops"
    dataset_id:str = "pricing"
    table_id:str = "_dl_pricing_qualtrics_survey_export"

    def __init__(self, json_credentials, project_id):
        """Create the BigQuery client and the load-job configuration.

        Args:
            json_credentials: Path to a credentials JSON file; exported as
                ``GOOGLE_APPLICATION_CREDENTIALS`` for the whole process.
            project_id: Project passed to ``bigquery.Client``. The destination
                table is addressed separately through ``project_table_id``.
        """
        self.json_credentials = json_credentials
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.json_credentials
        self.bq_client = bigquery.Client(project=project_id)
        self._load_job_config()

    def _process_responses(self, survey_id:str, filename:str):
        """Convert one survey export to NDJSON and load it when it has records.

        Args:
            survey_id: Written to the ``survey_id`` field of every record.
            filename: Base name (without ``.json``) of both the input file in
                ``qualtrics_dump_dir`` and the output file in ``qualtrics_to_bq``.

        Raises:
            Exception: Whatever ``json.dumps`` raises for a response that cannot
                be serialized (the offending row is printed first), plus any
                file or BigQuery error from the underlying calls.
        """
        file_path = os.path.join(self.qualtrics_dump_dir, f"{filename}.json")
        save_file = os.path.join(self.qualtrics_to_bq, f"{filename}.json")

        os.makedirs(self.qualtrics_to_bq, exist_ok=True)

        counter = 0
        # Open the input first: if the dump is missing, no empty output file is
        # left behind to be mistaken for an already-converted survey. Both
        # handles are closed when the block exits.
        with open(file_path, "rb") as dump_file:
            with open(save_file, 'w', encoding="utf8") as current_file:
                for row in ijson.items(dump_file, 'responses.item'):
                    created_at = parse_datetime(row["values"]["startDate"])
                    try:
                        payload = json.dumps(row, ensure_ascii=False, cls=JSONEncoder)
                    except Exception:
                        print(f"Failed to parse: {row}")
                        raise

                    # NOTE(review): the literal "Z" suffix assumes startDate is
                    # already in UTC — confirm against the export format.
                    json_line = OrderedDict({
                        "survey_id": survey_id,
                        "survey_type": self.survey_type,
                        "response_id": row["responseId"],
                        "file_id": self.file_id,
                        "created_date": created_at.strftime("%Y-%m-%d"),
                        "created_at": created_at.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                        "payload": payload,
                    })
                    current_file.write(json.dumps(json_line, ensure_ascii=False) + "\n")
                    counter += 1

        # load file to bq
        # A single response is still data worth loading; only a file with no
        # records at all is set aside.
        if counter > 0:
            print(f"Received {counter} records.")
            self._load_to_bq(save_file)
        else:
            os.makedirs(self.qualtrics_bad_file, exist_ok=True)
            bad_file_path = os.path.join(self.qualtrics_bad_file, f"{filename}.json")
            print("Skipping downstream tasks because the survey does not contain records.")
            # os.replace overwrites a leftover file from an earlier run on every platform.
            os.replace(save_file, bad_file_path)

    def _load_job_config(self):
        """Build the load-job config: NDJSON source, schema autodetection,
        table created if needed, rows appended."""
        self.job_config = bigquery.LoadJobConfig()
        self.job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        self.job_config.autodetect = True
        self.job_config.create_disposition = "CREATE_IF_NEEDED"
        self.job_config.write_disposition = "WRITE_APPEND"

    def _load_to_bq(self, file_path):
        """Load the NDJSON file at ``file_path`` into the destination table and
        wait for the job to finish."""
        with open(file_path, "rb") as source_file:
            job = self.bq_client.load_table_from_file(
                source_file,
                f"{self.project_table_id}.{self.dataset_id}.{self.table_id}",
                job_config=self.job_config,
            )
        job.result()
        print("Loaded {} rows into {}:{}.".format(job.output_rows, self.dataset_id, self.table_id))

    def execute(self, survey_id, survey_filename):
        """Convert the export named ``survey_filename`` and load it for ``survey_id``."""
        self._process_responses(survey_id, survey_filename)


In [72]:
# Map each selected survey id to its name; the name is later passed to
# execute() as the export filename.
survey_map = survey_df_2022["name"].to_dict()

test_survey_id = 'SV_0Gmk2jVLdnKGKZ7'

qualtrics_operator = QualtricsToBigQuery(
    project_id="logistics-data-staging-flat",
    # Resolve the gcloud application-default credentials relative to the
    # current user's home directory rather than one developer's machine.
    json_credentials=os.path.expanduser(
        "~/.config/gcloud/application_default_credentials.json"
    ),
)

# qualtrics_operator.execute(test_survey_id, survey_map[test_survey_id])



In [76]:
# remove files already loaded
# List the output directory once (as a set) instead of once per survey.
already_converted = set(os.listdir("qualtrics_to_bq"))
survey_map_filtered = {
    survey_id: name
    for survey_id, name in survey_map.items()
    if f"{name}.json" not in already_converted
}
len_surveys = len(survey_map_filtered)
len_surveys

34

In [77]:
# Convert and load every survey that has not been processed yet, reporting
# a 1-based progress counter.
for i, (survey_id, filename) in enumerate(survey_map_filtered.items(), start=1):
    print(f"loading {survey_id}, {filename}, {i} of {len_surveys}")
    qualtrics_operator.execute(survey_id, filename)

loading SV_0TfeDHfvx25AtkG, DF VW UAE Talabat Shops Feb'22, 1 of 34
Skipping downstream tasks because the survey does not contain records.
qualtrics_to_bq/DF VW UAE Talabat Shops Feb'22.json
loading SV_2cpx6wZn3G6lSzs, VW Pricing Template Q2 2022, 2 of 34
Skipping downstream tasks because the survey does not contain records.
qualtrics_to_bq/VW Pricing Template Q2 2022.json
loading SV_3IeqSWazFc7LZxc, DF VW Restaurants Saudi Arabia Template, 3 of 34
Skipping downstream tasks because the survey does not contain records.
qualtrics_to_bq/DF VW Restaurants Saudi Arabia Template.json
loading SV_3jDT8odisPxSUlw, DF VW South Korea Food Panda Aug'21_Input Box, 4 of 34
Skipping downstream tasks because the survey does not contain records.
qualtrics_to_bq/DF VW South Korea Food Panda Aug'21_Input Box.json
loading SV_4YfYMhsXBZbqI17, DF VW Panama Dec'20, 5 of 34
Skipping downstream tasks because the survey does not contain records.
qualtrics_to_bq/DF VW Panama Dec'20.json
loading SV_5oLIGRKhUfVgvj

# Qualtrics DL to CL cleaning

In [104]:
# Survey names that already have a converted file in qualtrics_to_bq.
# splitext drops only the final extension, so a survey name that itself
# contains "." is kept whole and still matches surveys_df.name.
survey_names_in_bq = [os.path.splitext(x)[0] for x in os.listdir("qualtrics_to_bq")]
(
    surveys_df
    [surveys_df.name.isin(survey_names_in_bq)]
    .sort_values(by="creationDate")
    .get(["id", "name"])
    .to_csv("survey_ids_in_qualtrics.csv", index=False)
)


# surveys_df