<a href="https://colab.research.google.com/github/robgon-art/ai8ball/blob/main/AI_8_Ball.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **AI 8-Ball**
![AI 8-Ball](https://raw.githubusercontent.com/robgon-art/ai8ball/main/ai8ball_med.jpg)<br/>
The AI 8-Ball can answer yes/no questions by using the power of the Internet and Machine Learning.<br/>

Please initialize the system by hitting the first Run cell button below. It takes 3 to 5 minutes to set up.

In [None]:
#@title Initialize the System
# Install the Python packages this notebook uses (spaCy is pinned to 2.2).
!pip install transformers wikipedia pynytimes jsonlines pytextrank spacy==2.2

# Copy the three BoolQ splits from the gs://boolq bucket into the working directory.
!gsutil cp gs://boolq/train.jsonl .
!gsutil cp gs://boolq/dev.jsonl .
!gsutil cp gs://boolq/test.jsonl .
# Fetch the pickled answer list from the project's GitHub repository.
!wget -O answers.pkl https://github.com/robgon-art/ai8ball/raw/main/answers.pkl
# Fetch the fine-tuned model archive and the pickled question encodings from OneDrive.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for
# these two downloads — confirm it is still required.
!wget -O roberta-large_fine-tuned.zip --no-check-certificate "https://onedrive.live.com/download?cid=61FC7243E093B36A&resid=61FC7243E093B36A%211286&authkey=ALT4NEljIrPRmHk"
!wget -O encoded_qs.pkl --no-check-certificate "https://onedrive.live.com/download?cid=61FC7243E093B36A&resid=61FC7243E093B36A%211287&authkey=ACbwVMsjvMecSls"
# Unpack the archive into the directory later used as model_path.
!unzip roberta-large_fine-tuned.zip -d roberta-large_fine-tuned

import random
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import tensorflow as tf
import tensorflow_hub as hub
from codecs import decode, encode
import math
from sklearn import preprocessing
import pickle
import jsonlines
from scipy import spatial
import spacy.cli
import pytextrank
import spacy
import pickle

# TF-Hub handle of the Universal Sentence Encoder (large, version 5). The loaded
# module is what embed() calls to turn a question into a vector.
encoding_module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
encoding_model = hub.load(encoding_module_url)
def d(t):
  """Decode the base-64 encoded bytes *t* and return the result as a str."""
  raw_bytes = decode(t, "base-64")
  return raw_bytes.decode()
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Set seeds for reproducibility (Python, NumPy and PyTorch generators).
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
  seed_fn(42)

# Directory holding the fine-tuned checkpoint; tokenizer and model both load from it.
model_path = "roberta-large_fine-tuned"

print("Loading tokenizer from " + model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Loading model from " + model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# Bind the return value so the notebook does not echo the model.
_ = model.to(device)

def predict(question, passage):
  """Score how strongly *passage* supports a "yes" answer to *question*.

  Args:
    question: The yes/no question text.
    passage: The supporting text handed to the classifier with the question.

  Returns:
    A ``(proba_yes, confidence)`` tuple. ``proba_yes`` is the softmax
    probability of class index 1. ``confidence`` is the Euclidean length of
    the two logits divided by 3.6, capped at 1 and rounded to three decimals.
    When either text is empty or whitespace-only the model is not run and
    ``(0.5, 0.0)`` is returned.
  """
  # Blank input carries no evidence, so report "undecided" with zero confidence.
  if not question.strip() or not passage.strip():
    return 0.5, 0.0
  sequence = tokenizer.encode_plus(question, passage, return_tensors="pt",
    max_length=512, truncation=True)['input_ids'].to(device)
  # Inference only: without no_grad every call would build an autograd graph.
  with torch.no_grad():
    logits = model(sequence)[0]
  probabilities = torch.softmax(logits, dim=1).cpu().tolist()[0]
  vector = logits.cpu().tolist()[0]
  confidence = min(math.sqrt(vector[0]**2+vector[1]**2)/3.6, 1)
  proba_yes = probabilities[1]
  confidence = round(confidence, 3)
  return proba_yes, confidence

def predict_and_print(question, passage):
  """Call predict(), print a one-line summary of its result, and return that result.

  Returns the same ``(proba_yes, confidence)`` pair that predict() produced.
  """
  proba_yes, confidence = predict(question, passage)
  shown_yes = round(proba_yes,3)
  shown_no = round(1-proba_yes, 3)
  print(f"Question: {question}, Yes: {shown_yes}, No: {shown_no}, Confidence {confidence}")
  return proba_yes, confidence

# NOTE(review): this downloads en_core_web_md, while the pipeline built further
# down loads "en_core_web_sm" — confirm which model is intended.
spacy_model = "en_core_web_md"
print("Downloading " + spacy_model)
spacy.cli.download(spacy_model)

# Collect every BoolQ record into one list, in the order dev, test, train.
# Positions in this list are later looked up with indices returned by the
# question KD-tree, so the order presumably has to match encoded_qs.pkl — verify.
boolq_data = []
for file_name in ("dev.jsonl", "test.jsonl", "train.jsonl"):
  with jsonlines.open(file_name) as reader:
    boolq_data.extend(reader.iter())
def embed(input):
  """Encode the text *input* with the sentence encoder and return the vector as a list.

  The text is wrapped in a one-element batch; the first (only) row of the
  encoder output is converted to a NumPy array and then to a Python list.
  """
  sentence_vector = encoding_model([input])[0]
  as_array = tf.make_ndarray(tf.make_tensor_proto(sentence_vector))
  return as_array.tolist()
# NOTE(security): pickle.load can execute arbitrary code. Both .pkl files are
# downloaded by the setup commands at the top of this cell, so only use sources
# you trust.
# Context managers close the files promptly instead of leaking the handles.
with open("encoded_qs.pkl", "rb") as encoded_file:
  encoded_qs = pickle.load(encoded_file)
# KD-tree over the pre-computed question encodings, used for nearest-question lookup.
question_tree = spatial.KDTree(encoded_qs)

# spaCy pipeline with a TextRank component appended last; it supplies the
# ranked key phrases (doc._.phrases) used to build search queries.
# NOTE(review): loads "en_core_web_sm" although "en_core_web_md" is the model
# downloaded earlier in this cell — confirm which one is intended.
nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
with open("answers.pkl", "rb") as answers_file:
  answers = pickle.load(answers_file)

Next, choose one of the sample questions by clicking the down-arrow, or type in a question of your own. Then hit the second Run cell button to see the answer.

In [15]:
question = "Can a computer beat a grandmaster chess player?" #@param ["Can a computer beat a grandmaster chess player?", "Can fortunetellers actually predict the future?", "Is global warming caused by humans?", "Is Schitt's Creek a good show?", "Is it true that 'kangaroo' means 'I don't know'?"] {allow-input: true}

# Build the search query from the first five TextRank phrases of the
# lower-cased question, keeping those ranked above 0.01. If none qualify,
# search with the question itself.
keywords = nlp(question.lower())
query_terms = [phrase.text for phrase in keywords._.phrases[:5] if phrase.rank > 0.01]
query = " ".join(query_terms).strip()
if not query:
  query = question

import wikipedia
print("Checking the Wikipedia.")
# Search Wikipedia and join the summaries of the hits into one passage.
results = wikipedia.search(query, results = 3)
wiki_passage = ""
# NOTE(review): results[1:] skips the top search hit — confirm this is intentional.
for r in results[1:]:
  try:
    s = wikipedia.summary(r)
  except Exception:
    # Best effort: skip any page whose summary cannot be fetched. Unlike a bare
    # "except:", this still lets KeyboardInterrupt and SystemExit through.
    continue
  wiki_passage += s.strip() + " "
# Flatten the passage to a single line before handing it to the model.
wiki_passage = wiki_passage.replace("\n", " ")
wiki_yes, wiki_conf = predict(question, wiki_passage)

from pynytimes import NYTAPI
nyt_passage = ""
print("Checking the New York Times.")
# NOTE(security): the API key is embedded in the notebook and only base-64
# obfuscated; consider supplying it through configuration instead.
nyt = NYTAPI(d(b'R2xsdTF4S2lLMjdSc3dBOXZ0VkZwSjMxbmoyS1RjVzM=\n'))
articles = nyt.article_search(query = query, results = 3,
  options = {"sort": "relevance"})
# Join the abstract and lead paragraph of up to three articles into one passage.
for a in articles[:3]:
  for field in ("abstract", "lead_paragraph"):
    # A field may be missing or null; treat it as empty text instead of
    # raising KeyError/AttributeError and aborting the whole cell.
    nyt_passage += (a.get(field) or "").strip() + " "
nyt_yes, nyt_conf = predict(question, nyt_passage)

from scipy import spatial
print("Checking the BoolQ Dataset.\n")
boolq_passage = ""

# Find the stored question whose encoding is nearest to this question's.
# query() returns (distance, index); the match is used only when the distance
# is below 1, otherwise the passage stays empty.
question_embed = embed(question)
result = question_tree.query(question_embed)
if result[0] < 1:
  index = result[1]
  nearest_record = boolq_data[index]
  boolq_passage = nearest_record["passage"]
  similar_question = nearest_record["question"]
boolq_yes, boolq_conf = predict(question, boolq_passage)

# Defaults reported when no source produced any evidence.
conf = 0
yes = 0.5
passage = ""
source = "no source"

# Keep the prediction with the highest confidence. Ties go to the later
# source in the order Wikipedia, New York Times, BoolQ Dataset.
if wiki_conf > nyt_conf and wiki_conf > boolq_conf:
  yes = wiki_yes
  conf = wiki_conf
  passage = wiki_passage
  source = "Wikipedia"
elif nyt_conf > boolq_conf:
  yes = nyt_yes
  conf = nyt_conf
  passage = nyt_passage
  source = "New York Times"
elif boolq_conf > 0:
  # Reaching here means BoolQ has the highest confidence. Requiring it to be
  # positive keeps the "no source" defaults when every lookup came back empty,
  # instead of crediting the BoolQ Dataset with an empty passage.
  yes = boolq_yes
  conf = boolq_conf
  passage = boolq_passage
  source = "BoolQ Dataset"

import textwrap

def _closest_answer(start, stop, column, target):
  """Find the entry of answers[start:stop] whose a[1][column] is nearest to target.

  Returns ``(index, distance)`` with the index relative to the full answers
  list. The first entry wins ties; an empty slice yields ``(0, inf)``.
  """
  best_index, best_distance = 0, float("inf")
  for offset, entry in enumerate(answers[start:stop]):
    gap = abs(target - entry[1][column])
    if gap < best_distance:
      best_index, best_distance = start + offset, gap
  return best_index, best_distance

# Each answers entry is indexed as a[0] (the text that gets printed) and a[1],
# a pair of scores — presumably (yes score, confidence score); verify against
# answers.pkl.
if conf < 0.5:
  # Low confidence: choose among the first five answers by the second score.
  map_conf =  1 +  conf * 19 / 2
  pick, min_dist = _closest_answer(0, 5, 1, map_conf)
else:
  # Otherwise choose among the remaining answers by the first score.
  scaled_yes = yes * 1.5 if yes > 2/3 else yes
  map_yes = 1 + scaled_yes * 19 / 1.5
  pick, min_dist = _closest_answer(5, None, 0, map_yes)

# Report the chosen answer, then the percentages, the source and the passage.
answer_text = answers[pick][0]
print("The AI 8-Ball's Answer:", answer_text, "\n")
for label, fraction in (("Yes:", yes), ("No:", 1-yes), ("Confidence:", conf)):
  print(label, str(round(fraction*100, 2)) + "%")
print("Source:", source)

# Wrap the supporting passage at 150 columns for readability.
wrapped_passage = textwrap.fill("Passage: " + passage, width=150)
print(wrapped_passage)

Checking the Wikipedia.
Checking the New York Times.
Checking the BoolQ Dataset.

The AI 8-Ball's Answer: It is certain. 

Yes: 99.52%
No: 0.48%
Confidence: 100%
Source: BoolQ Dataset
Passage: Chess programs running on commercially-available desktop computers had convincing victories against human players in matches in 2005 and
2006. Since that time, chess programs running on commercial hardware - more recently including mobile phones - have been able to defeat even the
strongest human players.


[![Wikipedia logo](https://raw.githubusercontent.com/robgon-art/ai8ball/main/wikipedia_ai_logo.png)](https://en.wikipedia.org)
[![Data Provided by New York Times logo](https://raw.githubusercontent.com/robgon-art/ai8ball/main/poweredby_nytimes_200a.png)](https://developer.nytimes.com)
[![Google AI logo](https://raw.githubusercontent.com/robgon-art/ai8ball/main/google_ai_logo.png)](https://ai.google)