### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import numpy as np
import pandas as pd
import json
import codecs
import os
import sys
import datetime as dt
sys.path.append('../')
import oci
import ocifs

In [2]:
from generate_data_extraction.features_functions import *
from generate_data_extraction.reduce_feature_list import *
from generate_data_extraction.common_columns_lists import *

In [26]:
bucket_name = "tenen-raw-data"
config = oci.config.from_file(
        "~/.apikey/config",
        "DEFAULT")
object_storage_client = oci.object_storage.ObjectStorageClient(config)
namespace = object_storage_client.get_namespace().data
namespace

'lrhwvlagph9z'

In [4]:
hl_file = {'oct' : 'tenen_high_lvl_oct.csv', 'nov': 'tenen_high_lvl_nov.csv', 'dec': 'tenen_high_lvl_dec.csv' }
hits_file = {'oct' : 'tenen_hits_extend_oct.csv', 'nov': 'tenen_hits_extend_nov.csv', 'dec': 'tenen_hits_extend_dec.csv'}

In [11]:
fname = hl_file['dec']

In [12]:
directory = 'products'
config = {"config": "~/.apikey/config"}
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

'oci://tenen-raw-data@lrhwvlagph9z/products/tenen_high_lvl_dec.csv'

In [7]:
#namespace = "lrhwvlagph9z"
df_tl_oct = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_tl_oct.shape

(46241, 65)

In [10]:
df_tl_nov = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_tl_nov.shape

(101306, 60)

In [13]:
df_tl_dec = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_tl_dec.shape

(93641, 61)

In [14]:
df_tl_oct['fullVisitorId'] = df_tl_oct['fullVisitorId'].astype(str)
df_tl_nov['fullVisitorId'] = df_tl_nov['fullVisitorId'].astype(str)
df_tl_dec['fullVisitorId'] = df_tl_dec['fullVisitorId'].astype(str)

In [15]:
df_list = [df_tl_oct, df_tl_nov, df_tl_dec]

In [16]:
df = pd.concat(df_list, ignore_index=True)
df.shape

(241188, 65)

In [17]:
tmp = df.groupby('fullVisitorId')['fullVisitorId'].agg('count')

In [18]:
len(tmp)

150549

In [19]:
numeric_cols = ['totals.hits', 'totals.pageviews', 'totals.timeOnSite']

In [20]:
df_tl = init_top_lvl_df(df, numeric_cols)
df_tl.shape

(108488, 65)

In [21]:
tmp = df_tl.groupby('fullVisitorId')['fullVisitorId'].agg('count')
len(tmp)

79651

In [36]:
X = emotional_rational_toplvl_features(df_tl)
X.shape

(79651, 10)

In [None]:
X1 = X.apply(lambda x: replace_nan_values(x))

In [38]:
X1.isnull().values.any()

False

In [45]:
X1.head(5)

Unnamed: 0_level_0,trafficSourceRatio,trafficSourceSocialRatio,trafficSourceSeRatio,mediumSourceFbRatio,mediumSourceCpcRatio,mediumSourceOrganicRatio,avg_hits,avg_page_views,avg_time_on_site,end_of_month_ratio
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000008988196085322,-0.172879,0.76479,-0.795198,1.378717,-0.714434,-0.318384,-0.675084,-0.726455,-0.597329,-0.221048
1000295312192619854,-0.172879,-0.55586,0.20133,-0.527781,-0.714434,0.617153,-0.334614,-0.281369,-0.418359,-0.221048
100032195141197061,-0.172879,0.76479,-0.795198,1.378717,-0.714434,-0.318384,-0.602126,-0.726455,-0.574958,-0.221048
100067031625967175,-0.172879,-0.55586,0.20133,-0.527781,-0.714434,0.617153,-0.213018,-0.281369,-0.477084,-0.221048
1000838440870549672,-0.172879,-0.55586,0.20133,-0.527781,0.342344,-0.318384,-0.699403,-0.503912,0.661055,-0.221048


In [5]:
directory = 'features'
fname = 'top_level_features.csv'
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

'oci://tenen-raw-data@lrhwvlagph9z/features/top_level_features.csv'

In [26]:
X1.to_csv(target, index=False, encoding='utf-8',storage_options = config)

In [10]:
directory = 'products'
fname = hits_file['dec']
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

'oci://tenen-raw-data@lrhwvlagph9z/products/tenen_hits_extend_dec.csv'

In [7]:
df_hits_oct = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_hits_oct.shape

(918915, 67)

In [9]:
df_hits_nov = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_hits_nov.shape

(2348379, 67)

In [11]:
df_hits_dec = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_hits_dec.shape

(1831008, 67)

In [12]:
df_hits_oct['fullVisitorId'] = df_hits_oct['fullVisitorId'].astype(str)
df_hits_nov['fullVisitorId'] = df_hits_nov['fullVisitorId'].astype(str) 
df_hits_dec['fullVisitorId'] = df_hits_dec['fullVisitorId'].astype(str) 

In [13]:
df_list = [df_hits_oct, df_hits_nov, df_hits_dec]
dfh = pd.concat(df_list, ignore_index=True)
dfh.shape

(5098302, 67)

In [14]:
dfh['fullVisitorId'].head(5)

0    6680643535448783441
1    6680643535448783441
2    6680643535448783441
3    6680643535448783441
4    6680643535448783441
Name: fullVisitorId, dtype: object

In [15]:
numeric_cols = ['numOfProducts', 'hits.eCommerceAction.action_type', 'hits.time']

In [16]:
df_hits = init_hits_df(dfh, numeric_cols, 'hits.time')
df_hits.shape

(4356126, 68)

In [17]:
tmp = df_hits.groupby('fullVisitorId')['fullVisitorId'].agg('count')
len(tmp)

79651

In [18]:
Y = emotional_rational_hits_features(df_hits)
Y.shape

(79651, 10)

In [19]:
Y1 = Y.apply(lambda x: replace_nan_values(x))

In [20]:
Y1.isnull().values.any()

False

In [29]:
Y1.head(5)

Unnamed: 0_level_0,hits.products_per_session,hits.avg_viewd_product,hits.avg_time2hit,hits.avg_time_add_prod,hits.avg_time_rmv_prod,hits.avg_payment_page,hits.avg_support_page,hits.avg_article_page,hits.avg_shopping_cart,hits.avg_search_keyword
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000008988196085322,-0.555664,-0.645755,-0.387993,-10.851024,-10.911458,-11.717352,-11.208384,-10.735552,-10.813007,-10.813007
1000295312192619854,-0.163786,-0.645755,-0.25967,-10.851024,-10.911458,-11.717352,-11.208384,-10.735552,-10.813007,-10.813007
100032195141197061,-0.555664,-0.645755,-0.36968,-10.851024,-10.911458,-11.717352,-11.208384,-10.735552,-10.813007,-10.813007
100067031625967175,-0.546334,-0.253261,-0.34873,-10.851024,-10.911458,-11.717352,-11.208384,-0.537366,-10.813007,-10.813007
1000838440870549672,-0.546334,-0.253261,3.2784,-10.851024,-10.911458,-11.717352,-11.208384,-10.735552,-10.813007,-10.813007


In [24]:
directory = 'features'
fname = 'hits_features.csv'
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

'oci://tenen-raw-data@lrhwvlagph9z/features/hits_features.csv'

In [25]:
Y1.to_csv(target, index=False, encoding='utf-8',storage_options = config)

ServiceError: {'opc-request-id': '/EB55FE4BDBA571BE08CF90B9174D37A6/1B80E924A09782437F8C22E4F74EEDA2', 'code': 'NotAuthenticated', 'message': 'The required information to complete authentication was not provided or was incorrect.', 'status': 401}

In [28]:
Z = pd.concat([X,Y1], axis=1)

NameError: name 'X' is not defined

In [None]:
fname = 'full_features.csv'
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

In [None]:
Z.to_csv(target, index=False, encoding='utf-8',storage_options = config)