## Test Turning Documents to Embeddings

#### The purpose of this notebook is to do a dummy test of how embeddings will turn out without prior processing of the data and just feeding it to a doc2vec model as is.

#### Test with cosine similarity function for sanity check.

#### Note: Notebook was run locally and will not work in SageMaker as no doc2vec was installed.

In [7]:
import pandas as pd
import numpy as np
import os
#from gensim.models import Doc2Vec
#import gensim
#from gensim.models.doc2vec import TaggedDocument
import re
import glob
import json

In [2]:
# Specify the path to the folder containing the .md files
folder_path = '../data/raw'

# Use glob to find all .md files in the specified folder
md_files = glob.glob(os.path.join(folder_path, '*.md'))

In [3]:
# Create an empty list to store the data
data = []

# Read the contents of each .md file and store in the list
for file in md_files:
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
        # Get the file name without extension
        doc_name = os.path.splitext(os.path.basename(file))[0]  
        data.append({'id': doc_name, 'text': content})

# Convert the list to a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

                                                    id  \
0                            amazon-sagemaker-toolkits   
1                    asff-resourcedetails-awssagemaker   
2                automating-sagemaker-with-eventbridge   
3    aws-properties-events-rule-sagemakerpipelinepa...   
4    aws-properties-events-rule-sagemakerpipelinepa...   
..                                                 ...   
331                              sagemaker-rl-workflow   
332                                    sagemaker-roles   
333              services-that-can-integrate-sagemaker   
334     train-evaluate-models-using-sagemaker-notebook   
335                         use-sagemaker-edge-manager   

                                                  text  
0    # Using the SageMaker Training and Inference T...  
1    # AwsSageMaker<a name="asff-resourcedetails-aw...  
2    # Automating Amazon SageMaker with Amazon Even...  
3    # AWS::Events::Rule SageMakerPipelineParameter...  
4    # AWS::Events

In [4]:
df.head()

Unnamed: 0,id,text
0,amazon-sagemaker-toolkits,# Using the SageMaker Training and Inference T...
1,asff-resourcedetails-awssagemaker,"# AwsSageMaker<a name=""asff-resourcedetails-aw..."
2,automating-sagemaker-with-eventbridge,# Automating Amazon SageMaker with Amazon Even...
3,aws-properties-events-rule-sagemakerpipelinepa...,# AWS::Events::Rule SageMakerPipelineParameter...
4,aws-properties-events-rule-sagemakerpipelinepa...,# AWS::Events::Rule SageMakerPipelineParameter...


In [5]:
df.iloc[0][1]

'# Using the SageMaker Training and Inference Toolkits<a name="amazon-sagemaker-toolkits"></a>\n\nThe [SageMaker Training](https://github.com/aws/sagemaker-training-toolkit) and [SageMaker Inference](https://github.com/aws/sagemaker-inference-toolkit) toolkits implement the functionality that you need to adapt your containers to run scripts, train algorithms, and deploy models on SageMaker\\. When installed, the library defines the following for users:\n+ The locations for storing code and other resources\\. \n+ The entry point that contains the code to run when the container is started\\. Your Dockerfile must copy the code that needs to be run into the location expected by a container that is compatible with SageMaker\\. \n+ Other information that a container needs to manage deployments for training and inference\\. \n\n## SageMaker Toolkits Containers Structure<a name="sagemaker-toolkits-structure"></a>\n\nWhen SageMaker trains a model, it creates the following file folder structure 

In [6]:
docs = list()
for index, row in df.iterrows():
    docs.append(TaggedDocument(row['text'].lower().split(), row['id']))

In [7]:
df['chunks'] = docs

In [8]:
# Load a pre-trained doc2vec model

#model = Doc2Vec.load("/enwiki_dbow/doc2vec.bin")

#Train a doc2vec model on the training data
model = Doc2Vec(min_count=3, window=50, vector_size=150, workers=4, alpha=0.025, min_alpha=0.00025, dm=1)
model.build_vocab(docs)
model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

In [9]:
# Infer the sentences and create a new dictionary where for each movie the vectors of the submissions where it appears
# will show up

inf_sents = []

for index, row in df.iterrows():
    inferred_sentence = model.infer_vector(row['chunks'][0]).tolist()
    inf_sents.append(inferred_sentence)

df['vector'] = inf_sents

In [10]:
df

Unnamed: 0,id,text,chunks,vector
0,amazon-sagemaker-toolkits,# Using the SageMaker Training and Inference T...,"([#, using, the, sagemaker, training, and, inf...","[-0.1181814894080162, -3.0908892154693604, -1...."
1,asff-resourcedetails-awssagemaker,"# AwsSageMaker<a name=""asff-resourcedetails-aw...","([#, awssagemaker<a, name=""asff-resourcedetail...","[-0.005288412794470787, -0.6258431077003479, -..."
2,automating-sagemaker-with-eventbridge,# Automating Amazon SageMaker with Amazon Even...,"([#, automating, amazon, sagemaker, with, amaz...","[-1.0848573446273804, -3.225484609603882, -3.7..."
3,aws-properties-events-rule-sagemakerpipelinepa...,# AWS::Events::Rule SageMakerPipelineParameter...,"([#, aws::events::rule, sagemakerpipelineparam...","[-0.18435446918010712, -0.2398490160703659, -0..."
4,aws-properties-events-rule-sagemakerpipelinepa...,# AWS::Events::Rule SageMakerPipelineParameter...,"([#, aws::events::rule, sagemakerpipelineparam...","[-0.2381533533334732, -0.4498632252216339, -0...."
...,...,...,...,...
331,sagemaker-rl-workflow,# Sample RL Workflow Using Amazon SageMaker RL...,"([#, sample, rl, workflow, using, amazon, sage...","[0.013776501640677452, -1.964380145072937, -1...."
332,sagemaker-roles,"# SageMaker Roles<a name=""sagemaker-roles""></a...","([#, sagemaker, roles<a, name=""sagemaker-roles...","[-0.3741878867149353, -3.3700742721557617, -2...."
333,services-that-can-integrate-sagemaker,# Amazon SageMaker and Application Auto Scalin...,"([#, amazon, sagemaker, and, application, auto...","[-0.3637629449367523, -1.4911519289016724, -1...."
334,train-evaluate-models-using-sagemaker-notebook,# Train and Evaluate AWS DeepRacer Models Usin...,"([#, train, and, evaluate, aws, deepracer, mod...","[-0.4809817969799042, -7.184512615203857, -3.2..."


In [11]:
def cosine_similarity(vec1, vec2):
    """
    Calculate the cosine similarity between two 
    vector embeddings.

    Returns:
    - Cosine similarity: A float value between -1 and 1.
    """
    # Ensure the vectors are numpy arrays
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)

    # Compute the dot product between the two vectors
    dot_product = np.dot(vec1, vec2)

    # Compute the L2 norms (magnitudes) of the vectors
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    # Calculate the cosine similarity
    cosine_sim = dot_product / (norm_vec1 * norm_vec2)

    return cosine_sim

In [13]:
df.iloc[0]['vector']

[-0.1181814894080162,
 -3.0908892154693604,
 -1.736112356185913,
 0.2511509954929352,
 -0.23577211797237396,
 -0.03822161629796028,
 -0.2445712685585022,
 -0.8660224080085754,
 -0.4731580913066864,
 0.1082979291677475,
 -0.13967140018939972,
 -0.3961888253688812,
 0.31445443630218506,
 1.1530908346176147,
 -0.14591149985790253,
 0.10137495398521423,
 1.0784485340118408,
 0.40090158581733704,
 -0.41255325078964233,
 -0.2285449206829071,
 0.08738469332456589,
 -0.8943641781806946,
 0.12962990999221802,
 0.885036051273346,
 0.7460244297981262,
 0.86463463306427,
 0.15630805492401123,
 0.06582926958799362,
 -0.354600191116333,
 -0.7501123547554016,
 -0.16951961815357208,
 1.1922591924667358,
 0.28919246792793274,
 0.051554735749959946,
 0.0627203956246376,
 0.3186033368110657,
 1.0731112957000732,
 -0.17338716983795166,
 0.946013867855072,
 1.0521533489227295,
 -0.26154670119285583,
 0.10744250565767288,
 -0.5542064905166626,
 -0.09652897715568542,
 -0.3126221299171448,
 -0.617991685867309

In [14]:
cosine_similarity(df.iloc[0]['vector'], df.iloc[1]['vector'])

0.9552895216290899

In [15]:
cosine_similarity(df.iloc[0]['vector'], df.iloc[9]['vector'])

0.7101502267509925

In [16]:
cosine_similarity(df.iloc[0]['vector'], df.iloc[10]['vector'])

0.6902863840386161