### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import numpy as np
import pandas as pd
import json
import codecs
import os
import sys
import datetime as dt
sys.path.append('../')
import oci
import ocifs

In [34]:
# NOTE(review): wildcard imports hide where names come from. Helpers called later
# in this notebook (init_top_lvl_df, emotional_rational_toplvl_features) are not
# defined here, so they presumably come from one of these modules — TODO confirm
# and replace the wildcards with explicit imports.
from generate_data_extraction.features_functions import *
from generate_data_extraction.reduce_feature_list import *
from generate_data_extraction.common_columns_lists import *

In [3]:
def read2df(path):
    """Load the CSV at ``path`` into a DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The parsed DataFrame; ``low_memory=False`` is passed through to pandas.
    """
    return pd.read_csv(path, low_memory=False)

In [4]:
# Object Storage bucket name; interpolated into the oci:// path built further down.
bucket_name = "tenen-raw-data"
# Load the "DEFAULT" profile from the API-key config file and build a client from it.
config = oci.config.from_file(
        "~/.apikey/config",
        "DEFAULT")
object_storage_client = oci.object_storage.ObjectStorageClient(config)
# The namespace is fetched from the service and is also interpolated into the oci:// path.
namespace = object_storage_client.get_namespace().data
namespace

'lrhwvlagph9z'

In [5]:
# CSV file names of the monthly exports, keyed by month abbreviation.
hl_file = {
    'oct': 'tenen_high_lvl_oct.csv',
    'nov': 'tenen_high_lvl_nov.csv',
    'dec': 'tenen_high_lvl_dec.csv',
}
hits_file = {
    'oct': 'tenen_hits_extend_oct.csv',
    'nov': 'tenen_hits_extend_nov.csv',
    'dec': 'tenen_hits_extend_dec.csv',
}

In [16]:
# Selects which month's high-level file the `target` path below points at.
fname = hl_file['dec']

In [17]:
directory = 'products'
# NOTE(review): this rebinds `config`, which earlier held the result of
# oci.config.from_file(...), to a plain dict. The read_csv calls below pass their
# own literal storage_options rather than this variable — confirm whether any
# later cell still expects the original OCI config object.
config = {"config": "~/.apikey/config"}
# oci:// URL of the object selected by `fname`.
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

'oci://tenen-raw-data@lrhwvlagph9z/products/tenen_high_lvl_dec.csv'

In [11]:
# Build the October path explicitly instead of reusing `target`: `target` follows
# `fname`, so on a fresh top-to-bottom run it would point at a different month.
target_oct = f"oci://{bucket_name}@{namespace}/{directory}/{hl_file['oct']}"
df_tl_oct = pd.read_csv(target_oct, storage_options={"config": "~/.apikey/config"}, low_memory=False)

In [14]:
# Build the November path explicitly instead of reusing `target`: `target` follows
# `fname`, so on a fresh top-to-bottom run it would point at a different month.
target_nov = f"oci://{bucket_name}@{namespace}/{directory}/{hl_file['nov']}"
df_tl_nov = pd.read_csv(target_nov, storage_options={"config": "~/.apikey/config"}, low_memory=False)

In [18]:
# Build the December path explicitly so this load does not depend on whatever
# value `fname` (and therefore `target`) happens to hold when the cell runs.
target_dec = f"oci://{bucket_name}@{namespace}/{directory}/{hl_file['dec']}"
df_tl_dec = pd.read_csv(target_dec, storage_options={"config": "~/.apikey/config"}, low_memory=False)

In [54]:
# Store the visitor id as a string in each of the three monthly frames.
for monthly_frame in (df_tl_oct, df_tl_nov, df_tl_dec):
    monthly_frame['fullVisitorId'] = monthly_frame['fullVisitorId'].astype(str)

In [20]:
# The three monthly frames (oct, nov, dec), collected for concatenation.
df_list = [df_tl_oct, df_tl_nov, df_tl_dec]

In [55]:
# Stack the monthly frames row-wise; ignore_index=True discards the per-file
# indexes and renumbers the rows of the combined frame.
df = pd.concat(df_list, ignore_index=True)

In [56]:
df.shape

(241188, 65)

In [60]:
# Column names handed to init_top_lvl_df as its second argument.
numeric_cols = ['totals.hits', 'totals.pageviews', 'totals.timeOnSite']

In [61]:
# NOTE(review): init_top_lvl_df is not defined in this notebook; presumably it is
# provided by one of the generate_data_extraction wildcard imports — TODO confirm.
df_tl = init_top_lvl_df(df, numeric_cols)

In [62]:
df_tl.head(5)

Unnamed: 0,visitNumber,visitId,visitStartTime,fullVisitorId,clientId,channelGrouping,socialEngagementType,totals.hits,totals.pageviews,totals.timeOnSite,...,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.customerId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.targetingCriteria.boomUserlistId,totals.visits,totals.newVisits
0,2,1634813802,2021-10-21 10:56:42,6680643535448783441,1555458534.163468,(Other),Not Socially Engaged,122,14,694,...,0,(not set),(not set),(not set),(not set),(not set),True,(not set),0,0.0
1,1,1634763823,2021-10-20 21:03:43,4057339981351388207,944673079.163476,Social,Not Socially Engaged,144,17,1186,...,0,(not set),(not set),(not set),(not set),(not set),True,(not set),0,0.0
2,8,1635020464,2021-10-23 20:21:04,8103621635756043684,1886771441.163425,Generic Paid Search,Not Socially Engaged,42,7,172,...,1,Google search: Top,Myka,CjwKCAjw5c6LBhBdEiwAP9ejG-OtN9mub4lecy1rngwuzP...,5521908895,Google Search,True,(not set),0,0.0
3,1,1634984433,2021-10-23 10:20:33,3937357235259433457,916737419.163498,Generic Paid Search,Not Socially Engaged,46,6,123,...,1,Google search: Top,initial necklace,CjwKCAjw5c6LBhBdEiwAP9ejG1IieN4XHti_sns0oPK1TL...,5521908895,Google Search,True,(not set),0,0.0
4,1,1635007938,2021-10-23 16:52:18,6938942722601663938,1615598500.163501,Generic Paid Search,Not Socially Engaged,58,7,60,...,1,Google search: Top,name gold necklace,EAIaIQobChMIo8Gol__g8wIVFiCtBh3KhAjdEAAYASAAEg...,5521908895,Google Search,True,(not set),0,0.0


In [36]:
def filter_in_1(df, col, func, lst):
    """Apply ``func`` to the entries of ``df[col]`` whose value is in ``lst``.

    Args:
        df: DataFrame to read from.
        col: Label of the column to filter.
        func: Callable that receives the filtered Series.
        lst: Values to keep (membership is tested with ``Series.isin``).

    Returns:
        Whatever ``func`` returns for the filtered Series.
    """
    keep = df[col].isin(lst)
    selected = df[col].loc[keep]
    return func(selected)

In [63]:
def column_z(sr):
    """Return the z-score of every value in the Series ``sr``.

    The centre is ``np.mean(sr)`` and the spread is ``np.std(sr)``. When the
    spread is not strictly positive (for example a constant column, or a spread
    that evaluates to NaN) every entry of the result is ``0.0``.

    Args:
        sr: pandas Series of numeric values.

    Returns:
        A Series with the same index and name as ``sr`` holding the z-scores.
    """
    s = np.std(sr)
    m = np.mean(sr)
    # The spread check is identical for every element, so decide it once here
    # instead of re-evaluating it inside a per-row Python lambda.
    if not s > 0:
        return pd.Series(0.0, index=sr.index, name=sr.name)
    # Vectorized arithmetic replaces the element-wise apply.
    return (sr - m) / s

In [76]:
def filter_str_1(df, col, func, reg):
    """Apply ``func`` to the entries of ``df[col]`` that match ``reg``.

    Matching is case-insensitive and uses ``Series.str.contains``, so ``reg`` is
    treated as a regular expression. Missing values never match.

    Args:
        df: DataFrame to read from.
        col: Label of a string column to filter.
        func: Callable that receives the filtered Series.
        reg: Pattern to search for.

    Returns:
        Whatever ``func`` returns for the filtered Series.
    """
    column = df[col]
    # na=False: without it, missing values produce NaN in the mask and boolean
    # indexing raises "Cannot mask with non-boolean array containing NA / NaN values".
    mask = column.str.contains(reg, case=False, na=False)
    return func(column.loc[mask])

In [None]:
# NOTE(review): emotional_rational_toplvl_features is not defined in this notebook;
# presumably it is provided by one of the generate_data_extraction wildcard
# imports — TODO confirm. This cell has no execution count, i.e. no recorded run.
df_features_tl = emotional_rational_toplvl_features(df_tl)