# Required packages:
 * `openpyxl` (for reading Excel files)
 * `sentence-transformers` (for generating embeddings with HuggingFace models)
 * `synapseml` (for calling the embedding API)

 These should be installed on your cluster under 'Compute' > 'Libraries' > 'Install New' (Library Source = PyPI).

In [0]:
# Connection details for the Azure Blob Storage container that holds the demo data.
# 'sas_token' ships as a placeholder and must be replaced before the mount cell is run.
secrets = dict(
    storage_account_name='sentenceembeddingdemos',
    container_name='demo-data',
    data_path='/',
    sas_token='PUT_YOUR_SAS_TOKEN_HERE',
)

# secrets['sas_token'] = ""



In [0]:
# Show a prominent warning when the SAS token is still the shipped placeholder value.
sas_token_is_placeholder = (secrets['sas_token'] == 'PUT_YOUR_SAS_TOKEN_HERE')
if sas_token_is_placeholder:
    displayHTML('''<blink><font color="red"><h1>You need to enter your connection info in the 'secrets' dict!</h1></font></blink>''')

In [0]:
# Database name; it also determines where the container is mounted in DBFS.
DB_NAME = 'se_demo'
MOUNT_POINT = f"/mnt/{DB_NAME}"

# wasbs:// URL of the container, plus the path inside it that gets mounted.
DATA_SOURCE = f"wasbs://{secrets['container_name']}@{secrets['storage_account_name']}.blob.core.windows.net"
DATA_PATH = secrets['data_path']

# The SAS token is passed to the mount under a config key specific to this container/account.
sas_config_key = f"fs.azure.sas.{secrets['container_name']}.{secrets['storage_account_name']}.blob.core.windows.net"
EXTRA_CONFIGS = {sas_config_key: secrets['sas_token']}

# Unmount first if the mount point is already in use, so the mount below always starts fresh.
CURRENTLY_MOUNTED = set(mount.mountPoint for mount in dbutils.fs.mounts())
if MOUNT_POINT in CURRENTLY_MOUNTED:
  dbutils.fs.unmount(MOUNT_POINT)

dbutils.fs.mount(
    source=DATA_SOURCE + DATA_PATH,
    mount_point=MOUNT_POINT,
    extra_configs=EXTRA_CONFIGS,
)

# List the top-level entries of the mount as the cell's output.
[entry.name for entry in dbutils.fs.ls(MOUNT_POINT)]

Out[21]: ['README.md',
 'bias_detection/',
 'hello_dolly/',
 'sidney_in_love/',
 'tag_curation/']

In [0]:
import re
import pandas as pd

# Create the target database if needed and make it the current database.
spark.sql(f"create database if not exists {DB_NAME}")
spark.sql(f"use {DB_NAME}")

# Entries whose path ends with '/' are treated as directories; only the sub-directories
# directly under the mount point are scanned for data files.
mnt_subdirs = [fi.path for fi in dbutils.fs.ls(MOUNT_POINT) if fi.path.endswith('/')]
for subdir in mnt_subdirs:  # previously iterated over an undefined name `subdirs`
  print(subdir)
  for file_info in dbutils.fs.ls(subdir):
    if file_info.path.endswith('/'):
      continue  # nested directories are not ingested
    print(f"Processing file '{file_info.path}")

    # rpartition never raises, unlike unpacking rsplit('.', 1) on a name with no dot.
    base_name, dot, file_ext = file_info.name.rpartition('.')
    if not dot:
      print(f"Skipping file without an extension '{file_info.name}'")
      continue

    # Replace characters that are not valid in an unquoted table name
    # (a no-op for names that are already plain identifiers).
    table_name = re.sub(r'\W+', '_', base_name)
    print('table_name:', table_name, 'file_ext:', file_ext)
    print(f"Creating table '{DB_NAME}.{table_name}' from file {file_info.path}")

    if file_ext == 'csv':
      sdf = spark.read.options(header=True).csv(file_info.path)
    elif file_ext == 'xlsx':
      # pandas reads through the local /dbfs FUSE path rather than the dbfs: URI.
      local_path = file_info.path.replace('dbfs:', '/dbfs', 1)
      # The result must be bound to `pdf`; the original discarded it and then used `pdf`.
      pdf = pd.read_excel(local_path, index_col=0)  # requires 'openpyxl' package
      sdf = spark.createDataFrame(pdf)
    elif file_ext == 'parquet':
      # The CSV-style 'header' option is not needed for Parquet files.
      sdf = spark.read.parquet(file_info.path)
    else:
      print(f"Unexpected file extension '{file_ext}'")
      continue  # skip only this file; `break` would abandon the rest of the directory

    # Qualify the table with the database so the write matches the message printed above.
    sdf.write.mode('overwrite').saveAsTable(f"{DB_NAME}.{table_name}")


dbfs:/mnt/se_demo/bias_detection/
Processing file 'dbfs:/mnt/se_demo/bias_detection/generic_occupation_stereotype.xlsx
table_name: generic_occupation_stereotype file_ext: xlsx
Creating table 'se_demo.generic_occupation_stereotype' from file dbfs:/mnt/se_demo/bias_detection/generic_occupation_stereotype.xlsx
Processing file 'dbfs:/mnt/se_demo/bias_detection/structured_stereotypes.xlsx
table_name: structured_stereotypes file_ext: xlsx
Creating table 'se_demo.structured_stereotypes' from file dbfs:/mnt/se_demo/bias_detection/structured_stereotypes.xlsx
dbfs:/mnt/se_demo/hello_dolly/
Processing file 'dbfs:/mnt/se_demo/hello_dolly/dolly_data_featurized.parquet
table_name: dolly_data_featurized file_ext: parquet
Creating table 'se_demo.dolly_data_featurized' from file dbfs:/mnt/se_demo/hello_dolly/dolly_data_featurized.parquet
dbfs:/mnt/se_demo/sidney_in_love/
dbfs:/mnt/se_demo/tag_curation/
Processing file 'dbfs:/mnt/se_demo/tag_curation/ai_Posts.csv
table_name: ai_Posts file_ext: csv
Creat

In [0]:
%sql

-- Switch to the demo database and list the tables it contains.
use se_demo;
show tables;

database,tableName,isTemporary
se_demo,ai_posts,False
se_demo,ai_posts_featurized,False
se_demo,dolly_data_featurized,False
se_demo,generic_occupation_stereotype,False
se_demo,structured_stereotypes,False


# Featurize generic_occupation_stereotype

In [0]:
def featurize(pdf):
  """Return a copy of `pdf` with an added 'embedding' column for the 'generic' text.

  Args:
    pdf: pandas DataFrame with a 'generic' column containing the sentences to embed.

  Returns:
    A new pandas DataFrame with the same rows and columns as `pdf` plus an
    'embedding' column; each value is the list form of the vector that
    `SentenceTransformer.encode` produced for that row's 'generic' text.
    The input frame is left unmodified.
  """
  featurized = pdf.copy()  # do not mutate the caller's frame
  if featurized.empty:
    # Nothing to encode: skip loading the model altogether.
    featurized['embedding'] = []
    return featurized

  # Imported here so the import runs wherever the function itself is executed.
  from sentence_transformers import SentenceTransformer
  sentxformer = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2') # 'sentence-transformers/all-mpnet-base-v2'

  # Pass a plain list: positional lookups on a pandas Series depend on its index labels,
  # so a non-default index could raise or misalign sentences and vectors.
  sentences = featurized['generic'].tolist()
  featurized['embedding'] = sentxformer.encode(sentences).tolist()
  return featurized

# Number of hash buckets rows are assigned to; the 'bucket' column is later used as a grouping key.
NUM_BUCKETS = 64
# Number of rows pulled to the driver for the local trial run below.
SAMPLE_ROWS = 100

# Assign every row a bucket derived from a hash of its 'generic' text.
Q = f"select int(abs(hash(generic))) % {NUM_BUCKETS} bucket, * from generic_occupation_stereotype"

# Featurize a small sample locally with pandas to check the function and inspect its output.
sample_data = spark.sql(f"{Q} limit {SAMPLE_ROWS}").toPandas()
sample_data_featurized_pdf = featurize(sample_data)

sample_data_featurized_pdf

Unnamed: 0,bucket,stereotype,generic,occupation,embedding
0,23,CEOs are always looking for ways to make more ...,They are always looking for ways to make more ...,CEOs,"[0.1738528162240982, -0.298277348279953, -0.32..."
1,53,CEOs are power-hungry and only care about them...,They are power-hungry and only care about them...,CEOs,"[-0.11597710102796555, 0.23442350327968597, -0..."
2,24,CEOs are always working and never have any tim...,They are always working and never have any tim...,CEOs,"[-0.1919153481721878, -0.15998907387256622, -0..."
3,62,CEOs are unapproachable and think they are bet...,They are unapproachable and think they are bet...,CEOs,"[0.3194783329963684, -0.10300444066524506, -0...."
4,24,"CEOs are cold and calculating, and only care a...","They are cold and calculating, and only care a...",CEOs,"[-0.37241342663764954, 0.6755173802375793, -0...."
...,...,...,...,...,...
95,31,"Doctors only care about the symptoms, not the ...","They only care about the symptoms, not the pat...",Doctors,"[0.0405334010720253, 0.1177128478884697, 0.093..."
96,10,Doctors are always trying to sell you something.,They are always trying to sell you something.,Doctors,"[-0.21937884390354156, -0.356390118598938, -0...."
97,36,Doctors are always right.,They are always right.,Doctors,"[0.12619201838970184, -0.25885850191116333, -0..."
98,38,Doctors know everything.,They know everything.,Doctors,"[-0.08524701744318008, 0.005694170948117971, 0..."


In [0]:
# Let Spark infer the schema of the featurized output from the pandas sample.
sample_featurized_sdf = spark.createDataFrame(sample_data_featurized_pdf)
featurized_schema = sample_featurized_sdf.schema
featurized_schema

Out[63]: StructType([StructField('bucket', IntegerType(), True), StructField('stereotype', StringType(), True), StructField('generic', StringType(), True), StructField('occupation', StringType(), True), StructField('embedding', ArrayType(DoubleType(), True), True)])

In [0]:
import pandas as pd  
from pyspark.sql.functions import pandas_udf, ceil

# Run `featurize` once per bucket over the full query result; each group's output
# is declared to have the schema inferred from the sample (`featurized_schema`).
# `spark.sql` is used, as in the rest of the notebook, instead of the bare `sql` global.
featurized_data = (
    spark.sql(Q)
    .groupBy("bucket")
    .applyInPandas(featurize, featurized_schema)
)

In [0]:
# Convert the featurized Spark DataFrame to pandas; as the cell's last expression it is displayed.
featurized_data.toPandas()

Unnamed: 0,bucket,stereotype,generic,occupation,embedding
0,0,All programmers are socially awkward.,They are socially awkward.,Programmers,"[0.371877521276474, 0.06199703738093376, 0.097..."
1,0,All programmers are geniuses.,They are geniuses.,Programmers,"[-0.5986257195472717, -0.052797626703977585, -..."
2,0,Bankers are always trying to make a quick buck.,They are always trying to make a quick buck.,Bankers,"[-0.3954196274280548, -0.18436558544635773, -0..."
3,0,Bankers are boring and have no sense of humor.,They are boring and have no sense of humor.,Bankers,"[0.14831893146038055, -0.2876422703266144, 0.2..."
4,0,Chefs are always yelling.,They are always yelling.,Chefs,"[0.07941865921020508, 0.09496933966875076, -0...."
...,...,...,...,...,...
3353,63,Engineers are not creative.,They are not creative.,Engineers,"[0.20297721028327942, -0.29839426279067993, -0..."
3354,63,Engineers are not creative.,They are not creative.,Engineers,"[0.20297721028327942, -0.29839426279067993, -0..."
3355,63,Janitors are not stylish.,They are not stylish.,Janitors,"[0.21393610537052155, 0.015848703682422638, -0..."
3356,63,Janitors are unkempt.,They are unkempt.,Janitors,"[0.263754278421402, -0.12612855434417725, 0.29..."
