### This notebook generates and loads the sample data used for the similarity calculation in the next notebook

In [1]:
import pandas as pd
import numpy as np
import time
import json
from cachetools import cached
from time import perf_counter
from typing import Iterable, Tuple
# Snowpark Imports
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.functions import udf, udtf

# Reading Snowflake Connection Details
# The connection parameters are read from a local JSON file. The `with` block
# guarantees the file handle is closed once parsing finishes, even on error.
with open('snowflake-creds.json') as creds_file:
    snowflake_connection_cfg = json.load(creds_file)

# Build the Snowpark session that every later cell uses.
session = Session.builder.configs(snowflake_connection_cfg).create()

### Choose a SMALL warehouse to load the data

In [2]:
# Resize the session's current warehouse to SMALL before loading the data.
current_warehouse = session.get_current_warehouse()
resize_statement = f"ALTER WAREHOUSE {current_warehouse} set WAREHOUSE_SIZE=SMALL"
session.sql(resize_statement).collect()

[Row(status='Statement executed successfully.')]

In [3]:
# Pool of sample sentences; the random data generation draws its rows from here.
texts = [
    'Snowflake Inc. is a cloud computing–based data cloud company based in Bozeman, Montana.',
    'In data analysis, cosine similarity is a measure of similarity between two non-zero vectors defined in an inner product space. ',
    'Netflix, Inc. is an American media company based in Los Gatos, California.',
    'Avatar is an American media franchise created by James Cameron, which consists of a planned series of epic science fiction films.',
]

### Set the number of documents and calculate the corresponding number of cross-joined document comparisons

In [4]:
# Size of the generated corpus.
number_of_documents = 100_000

# A cross join pairs every document with every document, so the number of
# comparisons is the corpus size squared.
cross_joined_comparisons = number_of_documents ** 2
comparisons_in_millions = round(cross_joined_comparisons) / 1000000
print(f"Number of cross joined comparisons that we will do with cosine similarity in next notebook: {comparisons_in_millions} Million")

Number of cross joined comparisons that we will do with cosine similarity in next notebook: 10000.0 Million


### Generate the dataset

In [5]:
# Generate sample dataset
# TEXT_ID is produced by seq8(); TEXT is the element of `texts` selected by a
# uniformly drawn index in the range 0 .. len(texts) - 1.
text_id_col = F.seq8().as_('TEXT_ID')
random_index = F.uniform(0, len(texts) - 1, F.random())
text_col = F.get(F.lit(texts), F.lit(random_index)).cast(T.StringType()).as_('TEXT')
generated_texts_df = session.generator(text_id_col, text_col, rowcount=number_of_documents)

# Save the generated rows to SAMPLE_TEXTS (replacing any existing table), then
# point texts_df at that table and preview it.
generated_texts_df.write.save_as_table('SAMPLE_TEXTS', mode='overwrite')
texts_df = session.table('SAMPLE_TEXTS')
texts_df.show()

------------------------------------------------------------------
|"TEXT_ID"  |"TEXT"                                              |
------------------------------------------------------------------
|0          |Snowflake Inc. is a cloud computing–based data ...  |
|1          |Netflix, Inc. is an American media company base...  |
|2          |Avatar is an American media franchise created b...  |
|3          |In data analysis, cosine similarity is a measur...  |
|4          |Avatar is an American media franchise created b...  |
|5          |In data analysis, cosine similarity is a measur...  |
|6          |Netflix, Inc. is an American media company base...  |
|7          |Avatar is an American media franchise created b...  |
|8          |Avatar is an American media franchise created b...  |
|9          |Avatar is an American media franchise created b...  |
------------------------------------------------------------------



In [6]:
# Count the rows in the SAMPLE_TEXTS table; the bare name on the last line is
# what the cell displays as its output.
sample_row_count = texts_df.count()
sample_row_count

100000

In [7]:
# Close the Snowpark session created in the first cell; nothing below uses it.
session.close()