In [1]:
# Prints a fixed string (presumably a quick check that the kernel executes cells).
print('Gajraj')

Gajraj


### Base FAISS Flow

In [None]:
# !pip install faiss-cpu sentence-transformers
# !pip install pandas numpy matplotlib

In [None]:
import pandas as pd
# Toy corpus: each record is a [sentence, category] pair.
data = [
    ['Where are your headquarters located?', 'location'],
    ['Throw my cellphone in the water', 'random'],
    ['Network Access Control?', 'networking'],
    ['Address', 'location'],
]
# Tabulate the pairs under explicit column names.
df = pd.DataFrame(data=data, columns=['text', 'category'])

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence encoder from the named model, then embed every
# sentence in the 'text' column of df.
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
text = df['text']
vectors = encoder.encode(text)

In [None]:
import faiss

# Normalise the embeddings first (the call's return value is unused, so `vectors`
# itself is what gets added below).
faiss.normalize_L2(vectors)

# Size the L2 index to the width of one embedding, then store all of them.
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
index.add(vectors)

In [None]:
import numpy as np

# Query sentence to look up in the index.
search_text = 'where is your office?'
# Encode the query with the same encoder that produced the indexed vectors.
search_vector = encoder.encode(search_text)
# Wrap the single embedding in a list so the resulting array gains a leading
# axis (one row per query) before it is handed to FAISS.
_vector = np.array([search_vector])
# Normalise the query the same way the indexed vectors were normalised.
faiss.normalize_L2(_vector)

In [None]:
# Use the index's total vector count as k, so the search ranks every stored vector.
k = index.ntotal
# search() yields two arrays, unpacked here as the distances and the neighbour ids.
distances, ann = index.search(_vector, k=k)

In [None]:
# Row 0 of each array belongs to the single query vector that was searched.
results = pd.DataFrame(dict(distances=distances[0], ann=ann[0]))

In [None]:
# Attach the original rows to the hits: results.ann is matched against df's index.
merge = results.merge(df, left_on='ann', right_index=True)
merge

In [None]:
# Category of the nearest neighbour (first hit of the first and only query).
# The ids in `ann` are used as row positions here, so the lookup goes through
# .iloc; plain labels[...] is label-based and only agrees with positions while
# df keeps its default 0..n-1 index.
labels = df['category']
category = labels.iloc[ann[0][0]]
category

### txtai Setup

In [None]:
# !pip install git+https://github.com/neuml/txtai
!pip install git+https://github.com/neuml/codequestion

In [None]:
from txtai.embeddings import Embeddings
# Configuration passed to Embeddings; "path" names the model to use.
embeddings_config = {"path": "sentence-transformers/nli-mpnet-base-v2"}
# Create embeddings model, backed by sentence-transformers & transformers
embeddings = Embeddings(embeddings_config)

In [None]:
# Headlines used as the candidate texts for every query below.
data = [
    "US tops 5 million confirmed virus cases",
    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
    "The National Park Service warns against sacrificing slower friends in a bear attack",
    "Maine man wins $1M from $25 lottery ticket",
    "Make huge profits without work, earn up to $100,000 a day",
]

# Two-column report: query on the left, best-matching headline on the right.
print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

for query in (
    "feel good story",
    "climate change",
    "public health story",
    "war",
    "wildlife",
    "asia",
    "lucky",
    "dishonest junk",
):
    # Score the candidate texts against the query; the first field of the
    # first entry is used as the position of the best match in `data`.
    ranked = embeddings.similarity(query, data)
    uid = ranked[0][0]

    print("%-20s %s" % (query, data[uid]))

In [None]:
# Build (id, text, None) records, using each headline's list position as its id.
documents = [(position, headline, None) for position, headline in enumerate(data)]
embeddings.index(documents)

print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

for query in (
    "feel good story",
    "climate change",
    "public health story",
    "war",
    "wildlife",
    "asia",
    "lucky",
    "dishonest junk",
):
    # Ask the index for a single result and use its first field to look up the headline.
    hits = embeddings.search(query, 1)
    uid = hits[0][0]
    print("%-20s %s" % (query, data[uid]))

In [None]:
# Persist the current index under the name "index.bin".
embeddings.save("index.bin")

# Rebind `embeddings` to a fresh, unconfigured instance and restore the saved index into it.
embeddings = Embeddings()
embeddings.load("index.bin")

# Query the reloaded index and print the matching entry of `data`.
uid = embeddings.search("climate change", 1)[0][0]
print(data[uid])

In [None]:
def _best_match_id(query):
    """Return the id of the single top search hit for ``query`` in the current index."""
    return embeddings.search(query, 1)[0][0]


# Run initial query
uid = _best_match_id("feel good story")
print("Initial: ", data[uid])

# Work on a copy so the original `data` list stays untouched
udata = data.copy()
udata[0] = "See it: baby panda born"

# Upsert the new text under id 0, then run the same query again
embeddings.upsert([(0, udata[0], None)])
uid = _best_match_id("feel good story")
print("After update: ", udata[uid])

# Remove record just added from index
embeddings.delete([0])

# Query once more now that id 0 has been deleted
uid = _best_match_id("feel good story")
print("After delete: ", udata[uid])

In [None]:
# Create embeddings index with content enabled. The default behavior is to only store indexed vectors.
content_config = {
    "path": "sentence-transformers/nli-mpnet-base-v2",
    "content": True,
    "objects": True,
}
embeddings = Embeddings(content_config)

# Create an index for the list of text
documents = [(position, headline, None) for position, headline in enumerate(data)]
embeddings.index(documents)

# Print only the first (top) result for the query
top_hit = embeddings.search("dishonest junk", 1)[0]
print(top_hit)

In [None]:
# Create an index for the list of text, storing each headline's character
# count alongside it in a 'length' field
records = [
    (position, {"text": headline, "length": len(headline)}, None)
    for position, headline in enumerate(data)
]
embeddings.index(records)

sql_queries = (
    # Filter by score
    "select text, score from txtai where similar('hiking danger') and score >= 0.15",
    # Filter by metadata field 'length'
    "select text, length, score from txtai where similar('feel good story') and score >= 0.05 and length >= 40",
    # Run aggregate queries
    "select count(*), min(length), max(length), sum(length) from txtai",
)

# Run the statements in order and print each result
for sql in sql_queries:
    print(embeddings.search(sql))

In [None]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

from IPython.display import Image

# Get an image
# The response is used as a context manager so it is closed once the bytes are read.
with urllib.request.urlopen("https://raw.githubusercontent.com/neuml/txtai/master/demo.gif") as request:
    image_bytes = request.read()

# Upsert new record having both text and an object
embeddings.upsert([("txtai", {"text": "txtai executes machine-learning workflows to transform data and build AI-powered semantic search applications.", "object": image_bytes}, None)])

# Query txtai for the most similar result to "machine learning" and get associated object
result = embeddings.search("select object from txtai where similar('machine learning') limit 1")[0]["object"]

# Display image
Image(result.getvalue(), width=600)

### Other txtai Features

In [None]:
class Student:
    """Minimal demo of the ``__init__`` and ``__call__`` hooks."""

    def __init__(self):
        # Runs when Student() is evaluated.
        print('Object Instantiated')

    def __call__(self, firstName):
        # Runs whenever an instance is invoked like a function.
        message = f'Object is called here with {firstName}'
        print(message)


obj = Student()
obj('Mantra')

In [None]:
import warnings
from txtai.workflow import Workflow, Task
def _double_each(values):
    """Return a list with every element of ``values`` multiplied by two."""
    return [value * 2 for value in values]


# Single-task workflow whose only task is the doubling function.
workflow = Workflow([Task(_double_each)])
list(workflow([1, 2, 3]))

In [None]:
#!pip install git+https://github.com/neuml/txtai#egg=txtai[graph,pipeline,similarity] datasets ipyplot

In [None]:
import networkx as nx

from txtai.graph import GraphFactory

# Create graph
graph = GraphFactory.create({"backend": "networkx"})
graph.initialize()

# Add nodes
nodes = [(0, "dog"), (1, "fox"), (2, "wolf"), (3, "zebra"), (4, "horse")]
# id -> text lookup, reused for the printed report and for the plot labels
labels = dict(nodes)
for uid, text in nodes:
    graph.addnode(uid, text=text)

# Add relationships
edges = [(0, 1, 1), (0, 2, 1), (1, 2, 1), (2, 3, 0.25), (3, 4, 1)]
for source, target, weight in edges:
    graph.addedge(source, target, weight=weight)

# Print centrality and path between 0 and 4
centrality_by_label = {labels[node_id]: score for node_id, score in graph.centrality().items()}
print("Centrality:", centrality_by_label)
path_labels = [labels[node_id] for node_id in graph.showpath(0, 4)]
print("Path (dog->horse):", " -> ".join(path_labels))

# Visualize graph
layout = nx.shell_layout(graph.backend)
nx.draw(graph.backend, layout, labels=labels, with_labels=True,
        node_size=2000, node_color="#03a9f4", edge_color="#cfcfcf", font_color="#fff")

### On-the-fly NER

In [None]:
!pip install fsner
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_trf

In [None]:
import json
import warnings
from fsner import FSNERModel, FSNERTokenizerUtils, pretty_embed

# Sentences in which entities should be found.
query_texts = [
    "Does Luke's serve lunch?",
    "Chang does not speak Taiwanese very well.",
    "I like Berlin."
]

# Each list in supports are the examples of one entity type
# Wrap entities around with [E] and [/E] in the examples.
# Each sentence should have only one pair of [E] ... [/E]

support_texts = {
    "Restaurant": [
        "What time does [E] Subway [/E] open for breakfast?",
        "Is there a [E] China Garden [/E] restaurant in newark?",
        "Does [E] Le Cirque [/E] have valet parking?",
        "Is there a [E] McDonalds [/E] on main street?",
        "Does [E] Mike's Diner [/E] offer huge portions and outdoor dining?"
    ],
    "Language": [
        "Although I understood no [E] French [/E] in those days , I was prepared to spend the whole day with Chien - chien .",
        "like what the hell 's that called in [E] English [/E] ? I have to register to be here like since I 'm a foreigner .",
        "So , I 'm also working on an [E] English [/E] degree because that 's my real interest .",
        "Al - Jazeera TV station , established in November 1996 in Qatar , is an [E] Arabic - language [/E] news TV station broadcasting global news and reports nonstop around the clock .",
        "They think it 's far better for their children to be here improving their [E] English [/E] than sitting at home in front of a TV . \"",
        "The only solution seemed to be to have her learn [E] French [/E] .",
        "I have to read sixty pages of [E] Russian [/E] today ."
    ]
}

device = 'cpu'
# The tokenizer and the model are both loaded from this same name.
model_name = "sayef/fsner-bert-base-uncased"

tokenizer = FSNERTokenizerUtils(model_name)
queries = tokenizer.tokenize(query_texts).to(device)
# One list of example sentences per entity type, in the dict's order.
support_examples = list(support_texts.values())
supports = tokenizer.tokenize(support_examples).to(device)

model = FSNERModel(model_name)
model.to(device)

p_starts, p_ends = model.predict(queries, supports)

# One can prepare supports once and reuse  multiple times with different queries
# ------------------------------------------------------------------------------
# start_token_embeddings, end_token_embeddings = model.prepare_supports(supports)
# p_starts, p_ends = model.predict(queries, start_token_embeddings=start_token_embeddings,
#                                  end_token_embeddings=end_token_embeddings)

# Entity type names are the dict's keys, passed in the same order as the supports.
output = tokenizer.extract_entity_from_scores(
    query_texts, queries, p_starts, p_ends,
    entity_keys=list(support_texts), thresh=0.50,
)

print(json.dumps(output, indent=2))

# install displacy for pretty embed
pretty_embed(query_texts, output, list(support_texts))

### Working with MindsDB

In [None]:
import pandas as pd

# Single place to change where the Adult data files live.
DATA_DIR = '/workspace/PythonExp/data'

# Column names supplied explicitly because both files are read with header=None.
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'target']

# Training split: map the textual target (note the leading space in the keys) to 0/1
# and write the result next to the source file.
df = pd.read_csv(f'{DATA_DIR}/adult.data', header=None, names=columns)
# Bracket assignment always targets the column; attribute-style assignment
# (df.target = ...) only does so when the column already exists.
df['target'] = df['target'].map({' <=50K': 0, ' >50K': 1})
df.to_csv(f'{DATA_DIR}/data.csv', index=False)


# Test split: the first line of the file is skipped and the target keys carry a trailing '.'.
test = pd.read_csv(f'{DATA_DIR}/adult.test', header=None, names=columns, skiprows=1)
test['target'] = test['target'].map({' <=50K.': 0, ' >50K.': 1})
test.to_csv(f'{DATA_DIR}/test.csv', index=False)


In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix
import seaborn as sn

In [None]:
# Load the exported predictions; later cells read its 'target' and 'predicted_salary' columns.
predictions = pd.read_csv('/workspace/PythonExp/data/export.csv')
# Quick size check: prints the frame's (rows, columns) shape.
print(predictions.shape)

In [None]:
# Ground truth and model output, taken from the exported predictions.
target = predictions.target
predicted_target = predictions.predicted_salary

# scikit-learn puts true labels on the rows and predicted labels on the columns,
# with classes in sorted order, so for 0/1 labels the matrix is [[tn, fp], [fn, tp]].
# Unpacking it as (tp, fp), (fn, tn) swaps the true-positive and true-negative counts.
cm = confusion_matrix(target, predicted_target)
tn, fp, fn, tp = cm.ravel()

# Annotated heatmap; rows are the real classes, columns the predicted ones.
ax = sn.heatmap(cm, annot=True, fmt='g')
ax.set_ylabel('Real')
ax.set_xlabel('Predicted')

In [None]:
# Denominators: everything predicted positive, and everything actually positive.
predicted_positives = tp + fp
actual_positives = tp + fn

precision = tp / predicted_positives
recall = tp / actual_positives
# F1 is the harmonic mean of precision and recall.
f1 = 2 * (precision * recall) / (precision + recall)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

### MLflow Experiments

In [2]:
import mlflow

In [3]:
# Print the version string exposed by the imported mlflow package.
print(f'Version -->{mlflow.version.VERSION}')

Version -->1.29.0


In [5]:
# Log one parameter and one metric inside a run opened as a context manager.
with mlflow.start_run() as run:
    mlflow.log_param('gaj','param')
    # Metric values must be numeric, so log the number 100 rather than the string "100".
    mlflow.log_metric("score", 100)

In [None]:
import os
from random import random, randint
from mlflow import log_metric, log_param, log_artifacts

if __name__ == "__main__":
    # NOTE(review): no run is started explicitly in this cell, so these calls rely on
    # MLflow's default run handling — confirm that is intended.

    # Log a parameter (key-value pair)
    log_param("param1", randint(0, 100))

    # Log a metric; metrics can be updated throughout the run
    log_metric("foo", random())
    log_metric("foo", random() + 1)
    log_metric("foo", random() + 2)

    # Log an artifact (output file)
    # exist_ok=True makes the directory creation safe to repeat and avoids the
    # gap between checking for the directory and creating it.
    os.makedirs("outputs", exist_ok=True)
    with open("outputs/test.txt", "w") as f:
        f.write("hello world!")
    log_artifacts("outputs")