In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [19]:
# Toy corpus: each string is treated as one document to match the question against.
article = [
    "Artificial intelligence is transforming industries.",
    "Machine learning is a subset of artificial intelligence.",
    "Cosine similarity is used in text analysis.",
    "Natural language processing is a branch of AI."
]
# The query. It says "AI" while the first two documents spell out "artificial intelligence",
# so a purely term-based match cannot connect them (see the zero scores in the output further down).
question = "How AI can help me in my business?"

In [20]:
# Fit TF-IDF on the documents and the question together so every row shares one vocabulary.
# The question is appended last, so it becomes the final row of `vectors`.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(article + [question])

In [21]:
vectors

<5x26 sparse matrix of type '<class 'numpy.float64'>'
	with 34 stored elements in Compressed Sparse Row format>

In [22]:
# Score the question (last row) against every document row, then flatten the
# 1 x N result into a 1-D array with one score per document.
similarities = cosine_similarity(vectors[-1], vectors[:-1]).flatten()

In [23]:
similarities

array([0.        , 0.        , 0.09860142, 0.10161799])

In [24]:
# Position of the highest-scoring document, and the score it achieved.
best_match_index = np.argmax(similarities)
best_match_score = similarities[best_match_index]

In [25]:
best_match_index

3

In [26]:
best_match_score

0.10161798834216307

In [27]:
article[best_match_index]

'Natural language processing is a branch of AI.'

## Let's try it on Wikipedia articles

In [28]:
from wikipediaapi import Wikipedia
# English-language ('en') Wikipedia client.
# NOTE(review): the first argument is presumably the user-agent string — confirm against the wikipediaapi docs.
wiki = Wikipedia('RAGBot/0.0', 'en')


In [29]:
# Fetch the article text, then cut it into chunks that can each be scored against a question.
doc = wiki.page('Steve_Jobs').text
paragraphs = doc.split('\n\n') # chunking: one retrieval unit per blank-line-separated block

In [31]:
paragraphs[0]

'Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology company Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.\nJobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed College in 1972 before withdrawing that same year. In 1974, he traveled through India, seeking enlightenment before later studying Zen Buddhism. He and Wozniak co-founded Apple in 1976 to further develop and sell Wozniak\'s Apple I personal computer. Together, the duo gained fame and wealth a year later with production and sale of the Apple II, one of the first highly successful mass-produced microcomputers. \nJobs saw the commercial potential of the Xerox Alto in 1979, which was mouse-driven and 

In [50]:
question = "When did he started Apple?"

In [51]:
# Re-fit the existing vectorizer on the Wikipedia chunks plus the new question
# (the question is again the last row of `vectors`).
vectors = vectorizer.fit_transform(paragraphs + [question])

In [52]:
vectors

<48x2910 sparse matrix of type '<class 'numpy.float64'>'
	with 6921 stored elements in Compressed Sparse Row format>

In [53]:
# One cosine-similarity score per chunk: question row (last) versus all chunk rows.
similarities = cosine_similarity(vectors[-1], vectors[:-1]).flatten()

In [54]:
similarities

array([0.09564955, 0.04831651, 0.1115483 , 0.07291738, 0.05772331,
       0.10484042, 0.0330042 , 0.06915567, 0.07882996, 0.09755256,
       0.11337847, 0.08085723, 0.06351638, 0.05950638, 0.03112634,
       0.        , 0.04167541, 0.09231163, 0.11277567, 0.03701486,
       0.0735121 , 0.05379384, 0.04781071, 0.08467159, 0.09714704,
       0.05134866, 0.0657107 , 0.05722157, 0.06183552, 0.02192468,
       0.01769734, 0.10190518, 0.0559636 , 0.00867931, 0.01256783,
       0.00659831, 0.01326202, 0.06651871, 0.1199407 , 0.06923763,
       0.14288621, 0.08361029, 0.00712314, 0.        , 0.0146752 ,
       0.        , 0.00659565])

In [55]:
# Position of the highest-scoring chunk, and the score it achieved.
best_match_index = np.argmax(similarities)
best_match_score = similarities[best_match_index]

In [56]:
best_match_index

40

In [57]:
best_match_score

0.1428862110407237

In [58]:
paragraphs[best_match_index]

'I also bear the responsibility for being away from my daughter when she was four years old, as her mother divorced me when I went to Syria, but we got back in touch after 10 years. We lost touch again when her mother moved and I didn\'t know where she was, but since 10 years ago we\'ve been in constant contact, and I see her three times a year. I organized a trip for her last year to visit Syria and Lebanon and she went with a relative from Florida.\nA few years later, Schieble married an ice-skating teacher, George Simpson. Mona Jandali took her stepfather\'s last name, as Mona Simpson. In 1970, after divorcing her second husband, Schieble took Mona to Los Angeles and raised her alone.\nWhen Simpson found that their father, Abdulfattah Jandali, was living in Sacramento, California, Jobs had no interest in meeting him as he believed Jandali did not treat his children well and according to the San Francisco Chronicle, this was because of finding a Seattle Times article about Jandali\'s

## The method

In [59]:
def similarity_finder(question, doc, top_n=3):
    """Return the passages in ``doc`` most similar to ``question``.

    Similarity is the cosine similarity between TF-IDF vectors fitted on the
    passages and the question together.

    Parameters
    ----------
    question : str
        The query text.
    doc : sequence of str
        Candidate passages (e.g. the chunks of an article).
    top_n : int, default 3
        Maximum number of passages to return. It is capped at ``len(doc)``.

    Returns
    -------
    list of (str, float)
        ``(passage, score)`` pairs ordered from most to least similar.
        Empty when ``doc`` is empty or ``top_n`` is not positive.
    """
    passages = list(doc)

    # Nothing to rank, or nothing requested. Without this guard a top_n of 0
    # would slice with [-0:] and return *every* passage.
    if not passages or top_n <= 0:
        return []

    vectorizer = TfidfVectorizer()
    # The question is appended last, so it is the final row of the matrix.
    vectors = vectorizer.fit_transform(passages + [question])

    similarities = cosine_similarity(vectors[-1], vectors[:-1]).flatten()

    # Cap by the number of passages actually passed in, not by an unrelated
    # notebook global.
    top_n = min(top_n, len(passages))
    # argsort is ascending: take the last top_n indices and reverse them.
    best_match_indices = np.argsort(similarities)[-top_n:][::-1]

    return [(passages[i], similarities[i]) for i in best_match_indices]

## Testing on Persian Wikipedia

In [81]:
wiki = Wikipedia("Misano/0.0", "fa")

In [127]:
doc = wiki.page("استیو_جابز").text

In [128]:
# Chunk the article on blank lines. Guarded so that re-running this cell is harmless:
# after the first run `doc` is already a list of chunks and has no .split().
if isinstance(doc, str):
    doc = doc.split('\n\n')

## Testing in RAG

In [85]:
from openai import OpenAI

import os  # imported here so this cell runs on its own

# Take the API key from the JABIR_API_KEY environment variable rather than
# committing it to the notebook.
# NOTE(review): the literal fallback only keeps the existing demo runnable;
# remove it once the key is supplied through the environment.
client = OpenAI(
    api_key=os.environ.get("JABIR_API_KEY", "PinkOrca"),
    base_url="https://openai.jabirproject.org/v1",
)

In [129]:
question = "در چه سالی اولین آیفون را معرفی کرد؟"

In [130]:
# Retrieve the best-matching chunks for the question.
context = similarity_finder(question, doc, 10)

# Each hit is a (passage, score) pair; only the passage text goes into the prompt,
# one passage per line.
context_doc = '\n'.join([hit[0] for hit in context])

In [131]:
print(context_doc)

افتخارات
تایم در سال ۱۹۸۳ میلادی، جابز را به‌عنوان «مشهورترین استاد میکرو (الکترونیک)» معرفی کرده بود.
در سال ۱۹۸۵ رئیس‌جمهور رونالد ریگان، مدال ملی فناوری و نوآوری را به استیو جابز و استیو وازنیک به‌خاطر معرفی و گسترش رایانه شخصی اهدا کرده بود. آنها در میان اولین افرادی بودند که موفق به کسب این افتخار شده‌اند. در سال ۲۰۰۷، جابز به‌وسیلهٔ مجله فورچون در یک لیست ۲۵نفره به‌عنوان قدرتمندترین فرد و بالاتر از افرادی چون بیل گیتس، روپرت مرداک و اریک اشمیت در تجارت معرفی شد و جیمز کالینز (مؤلف و سخنران در زمینه تجارت) او را «بتهوون تجارت» خواند. در دسامبر ۲۰۰۷، آرنولد شوارزنگر فرماندار وقت کالیفرنیا و بانوی اول ماریا شرایور، افتخار ورود به «تالار مشاهیر کالیفرنیا» که در موزهٔ تاریخ و زنان و هنر کالیفرنیا واقع است، را نصیب استیو جابز کردند.
در اوت سال ۲۰۰۹ میلادی، در نظرسنجی جونیور اچیومنت، او به‌عنوان مورد‌تحسین‌ترین کارآفرین در میان نوجوانان انتخاب شد. در نوامبر ۲۰۰۹، به‌وسیلهٔ مجلهٔ فورچون به‌عنوان «مدیرعامل دهه» انتخاب شد. در سال ۲۰۱۰، در ردهٔ هفدهم قدرتمندترین افراد جهان از دیدگاه فوربز ق

In [132]:
# Prompt sent to the model: the instructions come first, followed by the retrieved
# passages (`context_doc`) and the user's `question`, both interpolated into the f-string.
user_message = f'''Here we have provided a context and a question. Your task is to ONLY look at the context and provide answer to the asked question based on that the context provided.
If you can't find an answer in the context, basically say you couldn't find anything and DO NOT try to make the answer up.

CONTEXT = {context_doc}\n
QUESTION = {question}
'''

In [133]:
# Send the whole prompt as a single user message (no system message, no chat history).
completion = client.chat.completions.create(
    model = "jabir-400b",
    messages = [
        {
            "role" : "user",
            "content" : user_message
        }
    ],
    temperature = 0  # presumably chosen to minimise sampling randomness — confirm for this backend
)

In [134]:
answer = completion.choices[0].message.content

In [135]:
print(answer)

اولین آی‌فون در ژانویهٔ ۲۰۰۷ معرفی شد.
