# Text Embedding Preprocessing

## Imports and data loading

In [None]:
from itertools import zip_longest
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Hugging Face sentence-transformers resource: https://huggingface.co/sentence-transformers

In [None]:
# Load the trimmed racquet table; the first CSV column is used as the index.
df = pd.read_csv("../data/interim/racquets_trimmed.csv", index_col=0)

# Derive the brand as everything before the first space of the racquet name.
# NOTE(review): multi-word brand names would be truncated to their first
# word, and a missing name raises AttributeError here -- confirm the data.
racquet_brand = [name.partition(" ")[0] for name in df["racquet_name"]]
df.insert(loc=1, column="racquet_brand", value=racquet_brand)

df

## Combining text columns for embedding

In [None]:
# Partition the column names by dtype: text-like (object) vs. numeric.
object_cols = list(df.select_dtypes(include=["object"]).columns)
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

# Show both lists side by side; the shorter list is padded with None.
_overview_rows = list(zip_longest(object_cols, numeric_cols))
_overview = pd.DataFrame(
    _overview_rows,
    columns=["Object Columns", "Numeric Columns"],
)
print(_overview)

In [None]:
# Create the target column up front as empty strings. It is created after
# object_cols was computed, so it is not itself in that list; the real
# content is assigned later from combine_obj_cols().
df["combined_text"] = ""

### Helper: merge the text columns into one labelled string per row

In [None]:
def combine_obj_cols(df: pd.DataFrame, object_cols: list[str]) -> pd.Series:
    """Merge several text columns into one labelled text blob per row.

    For every column in ``object_cols`` a line of the form
    ``"<Title>: <cleaned text>\\n"`` is appended to the row's result, in the
    order the columns are given. ``df`` itself is not modified.

    Cleaning steps applied to each cell:
      * missing values become an empty string, so a single missing field
        does not turn the whole combined row into NaN;
      * ``"in²"`` is expanded to ``"inches squared"``;
      * the single-character substitutions in ``_replacements`` are applied;
      * runs of two or more spaces are collapsed to one space.

    Args:
        df: Frame holding the columns to combine.
        object_cols: Names of the columns to combine. Columns without an
            entry in the title table get a title derived from the column
            name (underscores to spaces, title-cased).

    Returns:
        A Series named ``"combined_col"`` that shares ``df``'s index.

    Raises:
        KeyError: If a name in ``object_cols`` is not a column of ``df``.
    """
    # Single-character substitutions, applied in one pass by str.translate.
    _replacements = str.maketrans({
        "!": ".",
        "_": " ",
        "&": "and",
        "²": "",
        "\xa0": "",
        "\n": "",
        "\r": "",
        '"': "",
        "“": "",
        # The closing curly quote is stripped too, matching the opening one.
        "”": "",
        "+": "plus",
        "%": "percent",
    })

    _title_dict = {
        "racquet_brand": "Racquet Brand",
        "racquet_name": "Racquet Name",
        "racquet_desc": "Racquet Description",
        "racquet_composition": "Racquet Composition",
        "racquet_power": "Racquet Power Level",
        "racquet_stroke_style": "Racquet Stroke Style",
        "racquet_swing_speed": "Racquet Swing Speed",
        "racquet_colors": "Racquet Colors",
        "racquet_grip_type": "Racquet Grip Type",
    }

    combined = pd.Series("", index=df.index, dtype="object")

    for col in object_cols:
        # NaN + str is NaN, which would wipe out the whole row's text.
        _content = df[col].fillna("").astype(str)
        # Substring replacement needs the .str accessor (Series.replace only
        # matches whole cell values), and it has to run before translate(),
        # which deletes the bare "²".
        _content = _content.str.replace("in²", "inches squared", regex=False)
        _content = _content.str.translate(_replacements)
        _content = _content.str.replace(r" {2,}", " ", regex=True)

        _title = _title_dict.get(col, col.replace("_", " ").title())
        combined = combined + (_title + ": " + _content + "\n")

    # Series addition drops mismatched names; restore the original one.
    return combined.rename("combined_col")

In [None]:
# Combine every object column except the first one in object_cols.
# NOTE(review): which column is skipped depends on the CSV's column order,
# which is not visible here -- confirm it is the intended one.
df["combined_text"] = combine_obj_cols(df, object_cols[1:])

In [None]:
# Eyeball the combined text next to the main columns it was built from.
_preview_cols = ["racquet_brand", "racquet_name", "racquet_desc", "combined_text"]
df[_preview_cols]

In [None]:
import collections
from itertools import chain


# Tally every character in the combined text (missing rows skipped) to spot
# leftover symbols the cleaning step should handle.
char_counts = collections.Counter()
for _text in df["combined_text"].dropna():
    char_counts.update(_text)
print(char_counts)

## Test Embedding Process

In [None]:
# Instantiate the sentence-embedding model identified by "all-MiniLM-L6-v2".
# NOTE(review): presumably fetched from the Hugging Face hub on first use and
# cached afterwards -- confirm for offline runs.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Hand-picked sample: three neighbouring rows plus one distant row, looked
# up by index label in the combined text column.
_sample_labels = [0, 1, 2, 374]
texts = [df["combined_text"][label] for label in _sample_labels]
texts

In [None]:
# Encode the sample texts and print the result's shape as a sanity check.
# NOTE(review): presumably one embedding row per input text -- confirm.
embeddings = model.encode(texts)
print(embeddings.shape)

In [None]:
# Compare the sample embeddings against themselves to get pairwise scores.
similarities = model.similarity(embeddings, embeddings)
print(similarities)  # Expectation: the first three texts score closer to each other than to the fourth