# Atlas data pipeline

In [None]:
# Switch for the Drive bootstrap cell: when True, that cell authenticates against
# Google Drive and downloads the data-source files to the local filesystem.
from_drive = True

In [None]:
## GIT ON COLAB ONLY
# Bootstrap for Google Colab: clone the Atlas repository, create a local branch,
# pull `branch_to_pull` into it, install the requirements and change into
# PipelinesConsumo. The `!` and `%` lines are IPython magics, so this cell only
# runs inside a notebook kernel.
try:
    # Outside Colab this import fails, which routes execution to the except branch.
    from google.colab import userdata
    # Credentials are read from the Colab secrets named 'gitToken' and 'gitUser'.
    git_token = userdata.get('gitToken')
    # NOTE(review): git_user is not used anywhere in the visible cells.
    git_user = userdata.get('gitUser')
    # NOTE(review): the token is embedded in the clone URL; git stores the remote
    # URL in the clone's .git/config, so the token ends up on disk - confirm this
    # is acceptable for the runtime in use.
    git_url = f'https://{git_token}@github.com/rene-aum/Atlas.git'
    branch_to_pull = 'dev'
    !git clone {git_url}
    %cd Atlas
    # Work on a throwaway local branch, then merge the remote branch into it.
    !git checkout -b colab_branch
    !git pull origin {branch_to_pull}
    !pip install -r PipelinesConsumo/src/requirements.txt
    %cd PipelinesConsumo
except Exception as e:
    # Best-effort: any failure above is only reported, never re-raised.
    # NOTE(review): `!` shell commands presumably do not raise on a non-zero exit
    # status, so a failed clone/pull/install would not reach this handler - verify.
    print(e)
    print('Running in other environment not colab probably!')

In [None]:
import sys
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import pytz
# from matplotlib.ticker import FuncFormatter
from datetime import datetime, timedelta
import warnings
import sys
# Add the parent and grandparent of the working directory to the module search
# path; presumably this is what lets the project imports below (`utils`,
# `PipelinesConsumo`, `src`) resolve - TODO confirm against the repo layout.
sys.path.append('..')
sys.path.append('../..')
from utils.utils import (get_dates_dataframe,
                       add_year_week,
                       custom_read,
                       process_columns)
from PipelinesConsumo.src.rawAtlas import RawAtlas
from PipelinesConsumo.src.processedAtlas import ProcessedAtlas
from src.transformed import Transformed
from utils.drive_toolbox import(from_drive_to_local,
                             get_last_modification_date_drive,
                             create_sheets_in_drive_folder,
                             update_sheets_in_drive_folder,
                             read_from_google_sheets,
                             list_file_ids_for_drive_folder,
                             create_csv_file_in_drive_folder,
                             write_csv_to_drive,
                             read_csv_from_drive)
from src.constants import (atlas_raw_output_folder_id,
                           data_source_folder_id,
                           raw_output_ids,
                           folder_id_bauto_gabo,
                           id_reporte_ventas,
                           )


# Suppress every warning for the rest of the session to keep notebook output clean.
warnings.filterwarnings('ignore')



In [None]:
# Drive bootstrap (Colab only): authenticate the user, download every file of the
# data-source folder to the local filesystem and build the gspread client.
# Defines `drive`, `file_id_dict`, `id_dict`, `data_date`, `creds` and `gc`.
if from_drive:
    from pydrive2.auth import GoogleAuth
    from pydrive2.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials
    import gspread
    from google.auth import default
    from gspread_dataframe import set_with_dataframe
    import gdown

    # Authenticate the Colab user and give PyDrive the application-default credentials.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

    # Drive file name -> Drive file id for everything in the data-source folder.
    file_id_dict = list_file_ids_for_drive_folder(drive, data_source_folder_id)

    # For each Drive file keep its id and a normalised local file name, e.g.
    # 'Data Warehouse - Productivo.xlsx' -> 'data_warehouse_productivo.xlsx'.
    id_dict = {
        drive_name: {
            'id': file_id,
            'local_name': (drive_name.strip()
                           .lower()
                           .replace(" - ", "_")
                           .replace(" ", "_")
                           .replace("-", "_")),
        }
        for drive_name, file_id in file_id_dict.items()
    }

    # Join outside the f-string: a backslash or a reused quote character inside
    # an f-string replacement field is a SyntaxError before Python 3.12.
    local_names = "\n".join(entry['local_name'] for entry in id_dict.values())
    print(f'Nombres archivos locales:\n\n{local_names}')

    # Copy every Drive file to the local filesystem under its normalised name.
    for k, entry in id_dict.items():
        from_drive_to_local(drive, entry['id'], entry['local_name'])

    # Last-modification date of the warehouse workbook, kept as the data date.
    data_date = get_last_modification_date_drive(
        drive, file_id_dict['Data Warehouse - Productivo.xlsx'])

    # gspread client authorised with the default Google credentials.
    creds, _ = default()
    gc = gspread.authorize(creds)

## Raw Pipeline

In [None]:
# Raw-layer pipeline object; its t*_raw_* loader methods are called in the cells below.
ra = RawAtlas()

In [None]:
# Names of every callable attribute of `ra` that is not a dunder, printed one per line.
custom_methods = [
    attr
    for attr in dir(ra)
    if not attr.startswith('__') and callable(getattr(ra, attr))
]
# Join outside the f-string: a backslash inside an f-string replacement field
# is a SyntaxError before Python 3.12.
methods_listing = "\n".join(custom_methods)
print(f'Metodos:\n{methods_listing}')

In [None]:
# Build the nine raw tables, in order t1..t9.

# t1-t3: tabs of the warehouse workbook.
t1 = ra.t1_raw_vehicle_status(
    excel_path='data_warehouse_productivo.xlsx',
    excel_tab_name='Vehicle Status',
)
t2 = ra.t2_raw_pedidos(
    excel_path='data_warehouse_productivo.xlsx',
    excel_tab_name='Pedidos',
)
t3 = ra.t3_raw_clientes(
    excel_path='data_warehouse_productivo.xlsx',
    excel_tab_name='Clientes',
)

# t4-t8: one CSV file each.
t4 = ra.t4_raw_appstep(csv_path='channelappstep.csv')
t5 = ra.t5_raw_unique_visitors(csv_path='uniquevisitorsadobe.csv')
t6 = ra.t6_raw_product_views(csv_path='pdp_hist.csv')
t7 = ra.t7_raw_cancelaciones(csv_path='vs_cancellation_log.csv')
t8 = ra.t8_raw_cta_adobe(csv_path="ctaadobe.csv")

# t9: built from a Drive folder, using the Drive and gspread clients.
t9 = ra.t9_raw_consolidado_bauto(drive, gc, folder_id_bauto_gabo)

In [None]:
# Write each raw table to Drive, in order t1..t9; the target id for every table
# is looked up in `raw_output_ids` by its output key.
_raw_tables = (
    ('t1_RawVehicleStatus', t1),
    ('t2_RawPedidos', t2),
    ('t3_RawClientes', t3),
    ('t4_RawAppstep', t4),
    ('t5_RawUniqueVisitors', t5),
    ('t6_RawProductViews', t6),
    ('t7_RawCancelaciones', t7),
    ('t8_RawCtaAdobe', t8),
    ('t9_RawConsolidadBaseTotalAutomarket', t9),
)
for _output_key, _raw_table in _raw_tables:
    write_csv_to_drive(drive, raw_output_ids.get(_output_key), _raw_table)

## Processed

In [None]:
# Processed-layer pipeline object; its proc_* methods are applied to the raw tables below.
pa = ProcessedAtlas()

In [None]:
# Read back from Drive the four raw tables that feed the processed layer,
# using the same `raw_output_ids` keys they were written under.
RawVehicleStatus = read_csv_from_drive(
    drive,
    raw_output_ids.get('t1_RawVehicleStatus'),
)
RawPedidos = read_csv_from_drive(
    drive,
    raw_output_ids.get('t2_RawPedidos'),
)
RawClientes = read_csv_from_drive(
    drive,
    raw_output_ids.get('t3_RawClientes'),
)
RawAppstep = read_csv_from_drive(
    drive,
    raw_output_ids.get('t4_RawAppstep'),
)

In [None]:
# Processed tables derived one-to-one from the vehicle-status, orders and clients raw tables.
tp1 = pa.proc_publicaciones(
    rawdf=RawVehicleStatus,
)
tp2 = pa.proc_pedidos(
    rawdf=RawPedidos,
)
tp3 = pa.proc_clientes(
    rawdf=RawClientes,
)

# Adobe funnels, all built from the app-step raw table: the "comprador" and
# "vendedor" funnels, each computed once with tipo='total' and once with tipo='usuario'.
tp4 = pa.proc_adobe_funnel_comprador(
    rawdf=RawAppstep,
    tipo='total',
)
tp5 = pa.proc_adobe_funnel_comprador(
    rawdf=RawAppstep,
    tipo='usuario',
)
tp6 = pa.proc_adobe_funnel_vendedor(
    rawdf=RawAppstep,
    tipo='total',
)
tp7 = pa.proc_adobe_funnel_vendedor(
    rawdf=RawAppstep,
    tipo='usuario',
)