In [None]:
from langchain.docstore.document import Document

In [None]:
from getpass import getpass

# Prompt interactively for the Hugging Face Hub API token so the secret is
# typed at run time instead of being hard-coded in the notebook.
HUGGINGFACEHUB_API_TOKEN = getpass()

In [None]:
import os

# Export the token as an environment variable for this kernel process.
# Presumably this is where the Hugging Face Hub client created later looks
# it up — confirm against the langchain HuggingFaceHub documentation.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

In [None]:
import pandas as pd
# Load the reviews, keep the first 1000 rows, and drop the first column of
# the file.
data = (
    pd.read_csv('data/winemag-data_first150k.csv')
    .head(1000)
    .pipe(lambda frame: frame.drop(frame.columns[0], axis=1))
)


In [None]:
# One Document per review row: the description is the searchable text, and
# the country and variety travel along as metadata.
docs = [
    Document(
        page_content=description,
        metadata={'country': country, 'variety': variety},
    )
    for country, variety, description in zip(
        data['country'], data['variety'], data['description']
    )
]

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Embedding model with library defaults except for the device, which is
# forwarded to the underlying model as 0.
# NOTE(review): device 0 presumably selects the first CUDA GPU, in which case
# this cell fails on a CPU-only machine — confirm.
embeddings = HuggingFaceEmbeddings(model_kwargs={'device':0})

In [None]:
from langchain.vectorstores import Chroma

In [None]:
import shutil
from pathlib import Path

# Recreate the persistence directory from scratch so the store is built into
# an empty folder on every run. Pure-Python calls replace the `!rm -rf` /
# `!mkdir` shell escapes so the cell also works where those commands are not
# available (e.g. Windows).
persist_dir = Path('temp')
shutil.rmtree(persist_dir, ignore_errors=True)
persist_dir.mkdir(exist_ok=True)

# Embed every Document and store the vectors under the directory above.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=str(persist_dir)
)

In [None]:
# Display the first Document to eyeball its page_content and metadata.
docs[0]

In [None]:
from langchain.chains.query_constructor.base import AttributeInfo
# Describe each metadata field (name, human-readable description, type).
# Both fields are strings.
meta_data_description = [
    AttributeInfo(name=field_name, description=field_description, type='string')
    for field_name, field_description in (
        ('country', 'country or region the wine is from'),
        ('variety', 'type or grape or grape blend of wine'),
    )
]

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever

In [None]:
from langchain.llms import HuggingFaceHub
# Short natural-language description of what the stored documents contain;
# it is handed to SelfQueryRetriever.from_llm further down.
document_content_description = "wine reviews"
# LLM accessed through the Hugging Face Hub, identified by its repo id.
llm = HuggingFaceHub(repo_id='google/flan-t5-xxl')

In [None]:
# Build the self-query retriever from, in positional order: the LLM, the
# vector store, the description of the document contents, and the metadata
# field descriptions. verbose=True is passed through as a keyword argument.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    meta_data_description,
    verbose=True
)

In [None]:
# Example natural-language query sent to the retriever in the next cell.
question = "what is said about Merlot"

In [None]:
# Run the query through the self-query retriever and display what it returns.
retriever.get_relevant_documents(question)

In [None]:
from langchain.llms import Accelerate
# NOTE(review): confirm that the installed langchain version actually exposes
# `Accelerate` in langchain.llms and that it offers `from_model_name`.
model_name = "facebook/opt-30b"
# Build the LLM wrapper from the model name, call it once with a short
# prompt, and print whatever the call returns.
FastLLM = Accelerate.from_model_name(model_name=model_name)
print(FastLLM("Hello World"))

In [None]:
# FIXME: this cell held a truncated import (`from langchain.llms import Ac`).
# `Ac` is not used anywhere in the notebook and the statement looks like an
# unfinished tab-completion; left as written it aborts Restart & Run All with
# an ImportError. Restore the intended name or delete this cell.

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
# Rebinds `embeddings` to an instance created with default arguments (no
# model_kwargs), replacing the device-0 instance created earlier in the
# notebook.
embeddings = HuggingFaceEmbeddings()

In [None]:
import pandas as pd
# Reload the complete CSV. This rebinds `data`, discarding the 1000-row frame
# (with its first column dropped) that was built earlier in the notebook; the
# cells below work from this full frame.
data = pd.read_csv('data/winemag-data_first150k.csv')

In [None]:
# `from nltk import ...` alone does not bind the name `nltk`, so the package
# itself must be imported for the nltk.download() calls below to work.
import nltk
from nltk import pos_tag, word_tokenize

# Fetch the tokenizer and POS-tagger resources used by word_tokenize/pos_tag.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Sample string
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the string into words
words = word_tokenize(text)

# Perform part-of-speech tagging to get POS labels
pos_tags = pos_tag(words)

# Keep the words whose tag starts with 'JJ'
adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]

print(adjectives)

In [None]:
# Lower-case variety and country on a *new* frame. Assigning columns directly
# on the result of data.head() is chained assignment and triggers pandas'
# SettingWithCopyWarning; .assign() returns an independent frame instead.
temp_data = data.head(100000).assign(
    variety=lambda frame: frame['variety'].str.lower(),
    country=lambda frame: frame['country'].str.lower(),
)

# Concatenate all review texts of each (variety, country) pair into a single
# space-separated string; the result is a Series indexed by that pair.
temp_combined_data = temp_data.groupby(['variety', 'country'])['description'].agg(lambda x: ' '.join(x))

In [None]:
# Flatten the variety and country columns of the grouped data into one array
# and display its distinct values (varieties and countries mixed together).
pd.unique(temp_combined_data.reset_index()[['variety','country']].values.ravel())

In [None]:
def get_adjectives(words):
    """Return the tokens of ``words`` that are tagged exactly ``"JJ"``.

    The text is tokenized, every token is lower-cased, and the lower-cased
    tokens are POS-tagged. Tokens whose tag is exactly ``"JJ"`` are returned
    in order of appearance, duplicates included; other tags that merely
    start with ``JJ`` are not kept.
    """
    lowered_tokens = [token.lower() for token in word_tokenize(words)]
    adjectives = []
    for token, tag in pos_tag(lowered_tokens):
        if tag == "JJ":
            adjectives.append(token)
    return adjectives

# Extract the adjective list for every (variety, country) text. This is an
# element-wise map, not a reduction, so Series.apply is the right call:
# Series.agg with such a callable only works through a deprecated fallback
# that emits a FutureWarning on recent pandas versions.
temp_adj = temp_combined_data.apply(get_adjectives).reset_index()


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Variety/country pair to highlight; written in lower case because both
# columns were lower-cased before grouping.
user_input_variety = 'chardonnay'
user_input_country = 'south africa'
is_selected_pair = (
    (temp_adj['variety'] == user_input_variety)
    & (temp_adj['country'] == user_input_country)
)
user_input_adj = temp_adj[is_selected_pair]

# One row per adjective occurrence. explode() turns an empty adjective list
# into NaN, which is not text and must not reach the encoder, so it is dropped
# before the rows are renumbered 0..n-1 by reset_index().
all_user_input_adj = user_input_adj['description'].explode().dropna().reset_index()
top_user_input_adj = all_user_input_adj.value_counts().head(10)

# Get all the indexes for the top n terms by frequency
top_n_index = all_user_input_adj.index[all_user_input_adj['description'].isin(top_user_input_adj.reset_index()['description'])]
# Sentences are encoded by calling model.encode(); pass a plain list of
# strings rather than a pandas Series.
user_input_emb = model.encode(all_user_input_adj['description'].tolist())

# Distinct adjectives across every (variety, country) pair. NaN placeholders
# from empty lists are dropped, and the set is sorted so the row order of the
# embedding matrix is the same on every run.
all_adj = sorted(set(temp_adj['description'].explode().dropna()))
all_adj_emb = model.encode(all_adj)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Fit the 3-component projection once, on the embeddings of all adjectives.
pca = PCA(n_components=3)
all_adj_embed_3d = pca.fit_transform(all_adj_emb)
# Project the selected adjectives with the SAME fitted components. Calling
# fit_transform here would refit PCA on the subset and place these points in
# a different 3-D space from the background cloud they are plotted against
# (and would fail outright when the subset has fewer than 3 rows).
user_input_emb_3d = pca.transform(user_input_emb)


In [None]:
import plotly.graph_objects as go

# Unpack the 3-D projections into one coordinate tuple per axis.
x_coords, y_coords, z_coords = zip(*all_adj_embed_3d)
selected_x_coords, selected_y_coords, selected_z_coords = zip(*user_input_emb_3d)

# Coordinates and labels of the selected points that get a text annotation
# (every occurrence of one of the most frequent adjectives).
label_positions = top_n_index.to_list()
text_x_coords = [selected_x_coords[pos] for pos in label_positions]
text_y_coords = [selected_y_coords[pos] for pos in label_positions]
text_z_coords = [selected_z_coords[pos] for pos in label_positions]
is_labelled = all_user_input_adj.index.isin(top_n_index)
text_labels = all_user_input_adj['description'][is_labelled]

# Extent of the full adjective cloud along each axis.
x_min, x_max = min(x_coords), max(x_coords)
y_min, y_max = min(y_coords), max(y_coords)
z_min, z_max = min(z_coords), max(z_coords)

# Pad every axis by 25% of its extent on both sides.
x_pad = (x_max - x_min) * 0.25
y_pad = (y_max - y_min) * 0.25
z_pad = (z_max - z_min) * 0.25
x_range = (x_min - x_pad, x_max + x_pad)
y_range = (y_min - y_pad, y_max + y_pad)
z_range = (z_min - z_pad, z_max + z_pad)

# Settings common to the three axes; only `range` and `showline` differ.
axis_style = dict(
    showbackground=False,
    showgrid=False,
    zeroline=True,
    showticklabels=False,
    showspikes=False,
)

# Create 3D scatter plot
fig = go.Figure()

# Background cloud: every adjective, no hover text.
all_trace = go.Scatter3d(x=x_coords, y=y_coords, z=z_coords, mode='markers', hoverinfo='none')
fig.add_trace(all_trace)
# Only the background trace exists at this point, so it alone is restyled.
fig.update_traces(marker=dict(color='lightgrey', opacity=0.05))

# Selected adjectives: markers that show the adjective on hover.
selected_trace = go.Scatter3d(
    x=selected_x_coords,
    y=selected_y_coords,
    z=selected_z_coords,
    mode='markers',
    text=all_user_input_adj['description'],
    hoverinfo='text',
)
fig.add_trace(selected_trace)

# Always-visible text labels for the most frequent selected adjectives.
selected_text = go.Scatter3d(
    x=text_x_coords,
    y=text_y_coords,
    z=text_z_coords,
    mode='text',
    text=text_labels,
    hoverinfo='none',
)
fig.add_trace(selected_text)

# Strip titles, tick labels, grid and margins; the x axis line is hidden while
# the y and z axis lines are drawn.
fig.update_layout(
    title="",
    scene=dict(
        xaxis=dict(range=x_range, showline=False, **axis_style),
        yaxis=dict(range=y_range, showline=True, **axis_style),
        zaxis=dict(range=z_range, showline=True, **axis_style),
        xaxis_title='',
        yaxis_title='',
        zaxis_title='',
    ),
    margin=dict(b=0, t=0, l=0, r=0),
    showlegend=False,
)

fig.show()

In [None]:
# Display every adjective occurrence of the selected pair as a plain list.
list(all_user_input_adj['description'])