#### 1. Installing the required python packages

In [0]:
# !python -m pip install --upgrade pip

In [0]:
# !pip install -r requirements38.txt

In [0]:
# !pip install azure-storage-file-datalake
# !pip install adlfs
# !pip install fsspec

#### 2. Importing the required python packages

In [0]:
import multiprocessing as mp
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ast import literal_eval
from pyarrow.parquet import ParquetFile

from sherlock import helpers
from sherlock.features.paragraph_vectors import (
    initialise_nltk,
    tagcol_paragraph_embeddings_features,
    train_paragraph_embeddings_features
)
from sherlock.features.preprocessing import convert_string_lists_to_lists
from sherlock.functional import extract_features_to_csv



In [0]:
print(f'Started at {datetime.now()}')

Started at 2022-12-15 07:23:54.443349


#### 3. Connect to Azure Data Storage

In [0]:
import os, uuid, sys
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
from azure.storage.blob import BlobServiceClient

In [0]:
def initialize_storage_account(storage_account_name, storage_account_key):
    
    try:  
        global service_client

        service_client = DataLakeServiceClient(account_url="{}://{}.dfs.core.windows.net".format(
            "https", storage_account_name), credential=storage_account_key)
    
    except Exception as e:
        print(e)

In [0]:
storage_account = "legoaistorage"
storage_account_key = "vOHAjE9vOHaxqmTRxIYETQbYlPvvFpJQ7xfky8tuWBRE9E6IbfM87ERkGcqqiHfMHs+WnEt907r6+AStjIYXlA=="
initialize_storage_account(storage_account,storage_account_key)

In [0]:
### Spark Configuration
spark.conf.set("fs.azure.account.key."+ storage_account +".dfs.core.windows.net", storage_account_key)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [0]:
prefix_path = "abfss://datascience-dataset@legoaistorage.dfs.core.windows.net/"

#### 4. Getting the required dataset from the Azure data storage

In [0]:
filepath = 'Sherlock_Raw_Data/data/raw/train_values.parquet'
filepath_upd = prefix_path + filepath
train_samples = spark.read.format("parquet").load(filepath_upd)
train_samples = train_samples.toPandas()

In [0]:
filepath = 'Sherlock_Raw_Data/data/raw/train_labels.parquet'
filepath_upd = prefix_path + filepath
train_labels = spark.read.format("parquet").load(filepath_upd)
train_labels = train_labels.toPandas()

In [0]:
# train_samples = train_samples[:1000]
# train_labels = train_labels[:1000]

In [0]:
train_samples_converted, y_train = convert_string_lists_to_lists(train_samples, train_labels, "values", "type")

  0%|          | 0/412059 [00:00<?, ?it/s]  0%|          | 510/412059 [00:00<02:47, 2458.26it/s]  0%|          | 1017/412059 [00:00<02:43, 2508.49it/s]  1%|          | 2786/412059 [00:00<02:03, 3311.66it/s]  1%|▏         | 5850/412059 [00:00<01:29, 4514.92it/s]  2%|▏         | 7079/412059 [00:00<01:13, 5509.31it/s]  2%|▏         | 8280/412059 [00:00<01:02, 6427.04it/s]  3%|▎         | 10449/412059 [00:01<00:55, 7207.94it/s]  3%|▎         | 12512/412059 [00:01<00:44, 8955.39it/s]  3%|▎         | 13909/412059 [00:01<00:41, 9606.74it/s]  4%|▎         | 15234/412059 [00:01<00:42, 9308.41it/s]  4%|▍         | 17037/412059 [00:01<00:36, 10888.45it/s]  4%|▍         | 18410/412059 [00:01<00:39, 9856.59it/s]   5%|▌         | 20964/412059 [00:01<00:32, 12082.34it/s]  5%|▌         | 22585/412059 [00:02<00:36, 10562.04it/s]  6%|▌         | 23956/412059 [00:02<00:38, 10105.72it/s]  6%|▌         | 25261/412059 [00:02<00:35, 10832.44it/s]  6%|▋         | 26516/412059 [00:02<00:40, 9

#### 5. Train Doc2Vec

In [0]:
initialise_nltk()

Initialised NLTK, process took 0:00:00.309226 seconds.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
samples = train_samples_converted.dropna()
print(f'Samples: {type(samples)}, length={len(samples)}')

labels = train_labels.values.flatten()
print(f'Labels:  {type(labels)}, length={len(labels)}')

Samples: <class 'pandas.core.series.Series'>, length=412059
Labels:  <class 'numpy.ndarray'>, length=824118


In [0]:
# ### Converting the data to string type
# samples = samples.apply(lambda x: [str(val) for val in x])

In [0]:
import random
import nltk
from nltk.corpus import stopwords
import gensim.models.doc2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing

In [0]:
def tokenise(values):
    joined = " ".join(s for s in values if len(s) >= 2)

    # stopwords need apostrophe
    filtered = "".join(
        e for e in joined if e.isalnum() or e.isspace() or e == "'"
    ).lower()

    return [
        word
        for word in nltk.word_tokenize(filtered)
        if len(word) >= 2 and word not in STOPWORDS_ENGLISH
    ]

In [0]:
def tagcol_paragraph_embeddings_features(train_data: pd.Series, train_labels: list):
    random.seed(13)

    columns = []

    for i, col in enumerate(train_data):
        label = train_labels[i]
        values = random.sample(col, min(1000, len(col)))

        if len(values) > 0:
            values = list(map(lambda s: "" if s is None else str(s), values))

        tokens = tokenise(values)

        columns.append(TaggedDocument(tokens, [label]))

    return columns

In [0]:
start = datetime.now()
STOPWORDS_ENGLISH = stopwords.words("english")

print('Tagging columns')
cols = tagcol_paragraph_embeddings_features(samples, labels)

print(f'Tagged Columns Doc2Vec Model, process took {datetime.now() - start} seconds.')

Tagging columns
Tagged Columns Doc2Vec Model, process took 0:02:56.589617 seconds.


In [0]:
start = datetime.now()

vec_dim = 400
print(f'Training Doc2Vec model in {vec_dim} dimensions')

train_model = Doc2Vec(cols,dm=0,negative=3,workers=multiprocessing.cpu_count(),vector_size=vec_dim,epochs=20,min_count=2,seed=13)

print(datetime.now())

Training Doc2Vec model in 400 dimensions


In [0]:
pwd

Out[39]: '/Workspace/Repos/santhosh.kumar@legoaiaccel.com/sherlock-project'

In [0]:
train_model.save('/par_vec_trained_model_400.pkl')

# with open('sherlock/features/par_vec_trained_model_400.pkl', 'wb') as pkl:
#     pickle.dump(train_model,pkl)

# import pickle
# pickle.dump(train_model, open('sherlock/features/par_vec_trained_model_400.pkl', 'wb'))
# pickled_model = pickle.load(open('model.pkl', 'rb'))


In [0]:
# sample = pd.DataFrame({'id':[1,2,3,4],'name':['s','s','d','f']})
# sample.to_csv('sherlock/features/sample.csv',index=False)

#### 6. Prediction using Doc2Vec Model

In [0]:
doc2vec_model = Doc2Vec.load('/par_vec_trained_model_400.pkl')

In [0]:
doc2vec_model.random.seed(13)
doc2vec_model.infer_vector(['sample','text'])