### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
import numpy as np
import pandas as pd
import json
import codecs
import os
import sys
import datetime as dt
sys.path.append('../')
import oci
import ocifs

In [None]:
from generate_data_extraction.features_functions import *
from generate_data_extraction.emotional_rational_features import *
from generate_data_extraction.common_columns_lists import *
from generate_data_extraction.init_dataframe import *

In [None]:
def column_minmax_score(sr):
    """Rescale ``sr`` linearly onto the range spanned by its min and max.

    Every element is mapped to ``(value - min) / (max - min)``, so the
    smallest element becomes 0 and the largest becomes 1.

    Args:
        sr: Array-like of numbers that supports element-wise arithmetic
            (it is used here via ``DataFrame.apply``, one column at a time).

    Returns:
        An object of the same kind as ``sr`` holding the rescaled values.
        The division is not guarded: when all values are equal the range
        is zero and the result is whatever ``0 / 0`` gives for that type.
    """
    highest = np.max(sr)
    lowest = np.min(sr)
    return (sr - lowest) / (highest - lowest)

In [None]:
# Bucket name interpolated into every oci:// URL built further down.
bucket_name = "tenen-raw-data"

# Parse the DEFAULT profile of the API-key config file and build an
# Object Storage client from it.
config = oci.config.from_file("~/.apikey/config", "DEFAULT")
object_storage_client = oci.object_storage.ObjectStorageClient(config)

# The tenancy's Object Storage namespace is the second component of the
# oci://<bucket>@<namespace>/... URLs; echo it as the cell output.
namespace = object_storage_client.get_namespace().data
namespace

In [None]:
# Object names of the monthly exports, keyed by month abbreviation.
# hl_file: "high level" exports; hits_file: extended hit exports.
hl_file = dict(
    oct='tenen_high_lvl_oct.csv',
    nov='tenen_high_lvl_nov.csv',
    dec='tenen_high_lvl_dec.csv',
)
hits_file = dict(
    oct='tenen_hits_extend_oct.csv',
    nov='tenen_hits_extend_nov.csv',
    dec='tenen_hits_extend_dec.csv',
)

In [None]:
fname = hl_file['dec']
directory = 'products'
config = {"config": "~/.apikey/config"}
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

In [None]:
#namespace = "lrhwvlagph9z"
df_tl_oct = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_tl_oct.shape

In [None]:
df_tl_nov = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_tl_nov.shape

In [None]:
df_tl_dec = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_tl_dec.shape

In [None]:
# Store fullVisitorId as text in each monthly frame (modified in place),
# so the ids are plain strings before the frames are concatenated.
for monthly_frame in (df_tl_oct, df_tl_nov, df_tl_dec):
    monthly_frame['fullVisitorId'] = monthly_frame['fullVisitorId'].astype(str)

In [None]:
df_list = [df_tl_oct, df_tl_nov, df_tl_dec]

In [None]:
df = pd.concat(df_list, ignore_index=True)
df.shape

In [None]:
# Column names handed to init_top_lvl_df as its second argument.
# presumably these are the columns it coerces to numbers — confirm in init_dataframe
numeric_cols = ['totals.hits', 'totals.pageviews', 'totals.timeOnSite']

In [None]:
# init_top_lvl_df is not defined in this notebook; it is expected to come
# from the star imports of generate_data_extraction at the top.
df_tl = init_top_lvl_df(df, numeric_cols)
df_tl.shape

In [107]:
df_tl.shape

(108488, 65)

In [105]:
# One group per distinct fullVisitorId, so len(tmp) is the number of
# distinct visitors in df_tl.
tmp = df_tl.groupby('fullVisitorId')['fullVisitorId'].agg('count')
len(tmp)

79651

In [None]:
# Build the top-level feature frame X from df_tl.
# emotional_rational_toplvl_features is also expected to come from the
# star imports; its output layout is not visible here.
X = emotional_rational_toplvl_features(df_tl)
X.shape

In [None]:
# True if any cell of X is missing.
X.isnull().values.any()

In [None]:
X.head(5)

In [None]:
# Apply column_z_score to every column of X.
# column_z_score is not defined in the visible cells — presumably provided
# by the star imports; confirm.
X1 = X.apply(lambda x : column_z_score(x))
X1.shape

In [None]:
# True if the scoring step produced any missing value.
X1.isnull().values.any()

In [None]:
X1.head(5)

In [142]:
# Destination object for the scored top-level features (X1).
# NOTE(review): the recorded output below shows a 'products/' path although
# directory is 'features' here — the output looks like it predates the
# current value; confirm where the file is meant to live.
directory = 'features'
fname = 'top_level_full_data_with_label.csv'
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

'oci://tenen-raw-data@lrhwvlagph9z/products/top_level_full_data_with_label.csv'

In [143]:
# index=True writes X1's index as the first CSV column.
# `config` is the storage-options dict assigned earlier in the notebook.
X1.to_csv(target, index=True, encoding='utf-8',storage_options = config)

In [None]:
# Min-max variant of the same features: column_minmax_score is applied to
# every column of X.
X2 = X.apply(lambda x : column_minmax_score(x))
X2.shape

In [None]:
# if reading from bucket
# The features file was written with index=True, so its first column is the
# saved index. Restore it with index_col=0; otherwise X1 comes back with a
# fresh RangeIndex plus an extra data column, and the later axis=1 concat
# with Y1 no longer lines rows up.
# NOTE(review): index labels are re-parsed with an inferred dtype; cast them
# if they must match the string ids used elsewhere.
X1 = pd.read_csv(target, storage_options={"config": "~/.apikey/config"}, index_col=0, low_memory=False)

In [None]:
directory = 'products'
fname = hits_file['dec']
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

In [None]:
df_hits_oct = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_hits_oct.shape

In [None]:
df_hits_nov = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_hits_nov.shape

In [None]:
df_hits_dec = pd.read_csv(target, storage_options={"config": "~/.apikey/config"},low_memory=False)
df_hits_dec.shape

In [None]:
# Store fullVisitorId as text in each monthly hits frame (modified in
# place), so the ids are plain strings before the frames are concatenated.
for monthly_hits in (df_hits_oct, df_hits_nov, df_hits_dec):
    monthly_hits['fullVisitorId'] = monthly_hits['fullVisitorId'].astype(str)

In [None]:
# Stack the three monthly hits frames; ignore_index=True renumbers the rows.
# Note that df_list is reused here and no longer holds the top-level frames.
df_list = [df_hits_oct, df_hits_nov, df_hits_dec]
dfh = pd.concat(df_list, ignore_index=True)
dfh.shape

In [None]:
dfh['fullVisitorId'].head(5)

In [None]:
# Column names handed to init_hits_df as its second argument
# (numeric_cols is rebound here; the top-level list is no longer needed).
numeric_cols = ['numOfProducts', 'hits.eCommerceAction.action_type', 'hits.time']

In [None]:
# init_hits_df is not defined in this notebook; it is expected to come from
# the star imports at the top. The meaning of the third argument
# ('hits.time') is not visible here.
df_hits = init_hits_df(dfh, numeric_cols, 'hits.time')
df_hits.shape

In [None]:
# One group per distinct fullVisitorId, so len(tmp) is the number of
# distinct visitors in df_hits.
tmp = df_hits.groupby('fullVisitorId')['fullVisitorId'].agg('count')
len(tmp)

In [145]:
# NOTE(review): customer_type is assigned to df_hits only in a later cell
# (the C0 label mapping). Run top-to-bottom, this line fails unless
# init_hits_df already provides the column — confirm.
df_hits.customer_type.isnull().values.any()

False

In [None]:
# Build the hit-level feature frame Y from df_hits.
# emotional_rational_hits_features is expected to come from the star imports.
Y = emotional_rational_hits_features(df_hits)
Y.shape

In [None]:
# True if any cell of Y is missing.
Y.isnull().values.any()

In [None]:
# Replace missing feature values with 0, then show the per-column count of
# remaining missing values.
Y1 = Y.fillna(0)
Y1.isna().sum()

In [None]:
# Apply column_z_score to every column.
# column_z_score is not defined in the visible cells — presumably provided
# by the star imports; confirm.
Y1 = Y1.apply(lambda x : column_z_score(x))
Y1.shape

In [None]:
# True if the scoring step produced any missing value.
Y1.isnull().values.any()

In [None]:
Y1.head()

In [None]:
directory = 'features'
fname = 'hits_features_minmax.csv'
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

In [None]:
Y2.to_csv(target, index=True, encoding='utf-8',storage_options = config)

In [None]:
Y2 = Y.fillna(0)
Y2 = Y2.apply(lambda x : column_minmax_score(x))
Y2.shape

In [None]:
Y2.head(5)

In [None]:
# Combined frame of the z-scored features: top-level (X1) next to hit-level
# (Y1). axis=1 places the columns side by side and aligns rows on the index.
Z = pd.concat([X1,Y1], axis=1)
Z.shape

In [None]:
# Same combination for the min-max scaled features (X2, Y2).
Z0 = pd.concat([X2,Y2], axis=1)
Z0.shape

In [None]:
Z0.head(5)

In [None]:
# Destination object for the combined min-max feature frame.
fname = 'full_features_minmax.csv'
directory = 'features'
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

In [None]:
# index=True writes Z0's index as the first CSV column.
Z0.to_csv(target, index=True, encoding='utf-8',storage_options = config)

In [None]:
# Feature columns for which convert_by_minmax picks the column maximum
# (every column not listed here gets its minimum instead).
emotional_high = [
    'trafficSourceRatio',
    'trafficSourceSocialRatio',
    'mediumSourceFbRatio',
    'mediumSourceCpcRatio',
    'hits.products_per_session',
    'avg_page_views',
    'hits.avg_viewd_product',
    'hits.product_page',
]

# Counterpart list; it is not referenced by any of the visible cells.
emotional_low = [
    'trafficSourceSeRatio',
    'mediumSourceOrganicRatio',
    'avg_hits',
    'avg_time_on_site',
    'end_of_month_ratio',
    'hits.avg_time2hit',
    'hits.avg_time_add_prod',
    'hits.avg_time_rmv_prod',
    'hits.support_page',
    'hits.article_page',
    'hits.search_keyword',
    'hits.category_page',
]

In [None]:
def convert_by_minmax(df, col_list):
    """Reduce each column of ``df`` to a single extreme value.

    Args:
        df: DataFrame whose columns are reduced one at a time.
        col_list: Column names for which the maximum is taken; every
            other column contributes its minimum.

    Returns:
        A Series indexed by the column names of ``df`` holding the chosen
        extreme of each column.
    """
    def pick_extreme(column):
        # The column's own name decides which end of its range is kept.
        if column.name in col_list:
            return np.max(column)
        return np.min(column)

    return df.apply(pick_extreme)

In [None]:
# Reference vector for the z-scored features.
# convert_by_z is not defined in the visible cells — presumably provided by
# the star imports at the top; confirm.
Z_lbl = convert_by_z(Z, emotional_high)

In [None]:
Z_lbl

In [None]:
# Reference vector for the min-max features: per column of Z0, the maximum
# when the column is listed in emotional_high, otherwise the minimum.
Z_MM = convert_by_minmax(Z0, emotional_high)
Z_MM.head(5)

In [None]:
# Destination object for the label frame derived from the min-max features.
directory = 'features'
fname = 'corr_features_tag_vector_minmax.csv'
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

In [None]:
def corr_vec_to_cols(df, vec):
    """Label every row of ``df`` by the sign of its correlation with ``vec``.

    Each row of ``df`` is correlated (``Series.corr``) against ``vec``; the
    two are matched on the column labels of ``df`` and the index of ``vec``.

    Args:
        df: DataFrame with one row per item to label.
        vec: Series indexed by the column names of ``df``.

    Returns:
        A Series indexed like ``df`` containing ``'emotio'`` for a positive
        coefficient, ``'ratio'`` for a negative one and ``'nocorr'``
        otherwise (a zero or NaN coefficient).
    """
    def label_for(coefficient):
        if coefficient > 0:
            return 'emotio'
        if coefficient < 0:
            return 'ratio'
        # Zero and NaN both fail the two comparisons above.
        return 'nocorr'

    # Transpose so that each original row becomes a column to iterate over.
    rows_as_columns = df.T
    coefficients = rows_as_columns.apply(lambda row: row.corr(vec))
    return coefficients.map(label_for)

In [None]:
# Label each row of the z-scored frame Z by the sign of its correlation with
# Z_lbl, and keep the labels in a one-column frame named 'corr'.
C = corr_vec_to_cols(Z, Z_lbl).to_frame('corr')

In [None]:
# True if any label is missing.
C.isnull().values.any()

In [None]:
C0.to_csv(target, index=True, encoding='utf-8',storage_options = config)

In [None]:
C0 = corr_vec_to_cols(Z0, Z_MM).to_frame('corr')
C0.head(10)

In [111]:
# Number of rows per label. In the recorded run the three counts add up to
# 79651 (0 + 62987 + 16664), the same figure as the distinct-visitor count
# recorded for df_tl earlier in the notebook.
len(C0.loc[C0['corr'] == 'nocorr'])

0

In [112]:
len(C0.loc[C0['corr'] == 'emotio'])

62987

In [113]:
len(C0.loc[C0['corr'] == 'ratio'])

16664

In [None]:
# Attach each visitor's label with a single index lookup; ids that are not in
# C0 get 'none'. The previous per-row lambda rebuilt C0.index.to_list() and
# scanned it for every row, and C0.loc[x] returned a one-element Series
# rather than the label itself.
# Requires C0's index to be unique (Series.map raises otherwise).
df_tl['customer_type'] = df_tl['fullVisitorId'].map(C0['corr']).fillna('none')

In [None]:
# Attach each visitor's label with a single index lookup; ids that are not in
# C0 get 'none'. The previous per-row lambda rebuilt C0.index.to_list() and
# scanned it for every row, and C0.loc[x] returned a one-element Series
# rather than the label itself.
# Requires C0's index to be unique (Series.map raises otherwise).
df_hits['customer_type'] = df_hits['fullVisitorId'].map(C0['corr']).fillna('none')

In [146]:
# Destination object for the hits frame with its customer_type column.
directory = 'products'
fname = 'hits_full_data_with_label.csv'
target = f'oci://{bucket_name}@{namespace}/{directory}/{fname}'
target

'oci://tenen-raw-data@lrhwvlagph9z/products/hits_full_data_with_label.csv'

In [147]:
# index=True writes df_hits's index as the first CSV column.
# `config` is the storage-options dict assigned earlier in the notebook.
df_hits.to_csv(target, index=True, encoding='utf-8',storage_options = config)