## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/quran.txt"
file_type = "text"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

value
1|1|بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
1|2|الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
1|3|الرَّحْمَٰنِ الرَّحِيمِ
1|4|مَالِكِ يَوْمِ الدِّينِ
1|5|إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
1|6|اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ
1|7|صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ
2|1|بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ الم
2|2|ذَٰلِكَ الْكِتَابُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى لِلْمُتَّقِينَ
2|3|الَّذِينَ يُؤْمِنُونَ بِالْغَيْبِ وَيُقِيمُونَ الصَّلَاةَ وَمِمَّا رَزَقْنَاهُمْ يُنْفِقُونَ


In [0]:
# Create a view or table

temp_table_name = "quran_txt"

df.createOrReplaceTempView(temp_table_name)

In [0]:
# Jaccard similarity coefficient, treats the data objects like sets 
#Size of the intersection of the two sets: 3

#Size of the union of the two sets: 1+3+3 = 7

#Using the Jaccard index, we get a similarity score of 3/7 = 0.42

In [0]:
def jaccard_similarity(x,y):
  """ returns the jaccard similarity between two lists """
  intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
  union_cardinality = len(set.union(*[set(x), set(y)]))
  return intersection_cardinality/float(union_cardinality)
sentences = ["الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ ﴿2﴾ الرَّحْمَٰنِ الرَّحِيمِ ﴿3﴾ مَالِكِ يَوْمِ الدِّينِ ﴿4﴾",
" اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ ﴿6﴾ صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ ﴿7﴾"]
sentences = [sent.lower().split(" ") for sent in sentences]
jaccard_similarity(sentences[0], sentences[1])

# OUPUT
0.42857142857142855

Out[19]: 0.42857142857142855

In [0]:
#i used here the TF-IDF 
#the TF  measures the number of times a term (word) occurs in a document.
#i used some words from the quran


In [0]:
import math
import pandas as pd
import numpy as np
#documents
doc1 = "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ"
doc2 = "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ"
doc3 = "صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ"
#query string
query = "رَبِّ"


In [0]:
#term -frequenvy :word occurences in a document
def compute_tf(docs_list):
    for doc in docs_list:
        doc1_lst = doc.split(" ")
        wordDict_1= dict.fromkeys(set(doc1_lst), 0)

        for token in doc1_lst:
            wordDict_1[token] +=  1
        df = pd.DataFrame([wordDict_1])
        idx = 0
        new_col = ["Term Frequency"]    
        df.insert(loc=idx, column='Document', value=new_col)
        print(df)
        
compute_tf([doc1, doc2, doc3])

         Document  الْعَالَمِينَ  الْحَمْدُ  رَبِّ  لِلَّهِ
0  Term Frequency              1          1      1        1
         Document  الْعَالَمِينَ  الْحَمْدُ  رَبِّ  لِلَّهِ
0  Term Frequency              1          1      1        1
         Document  الَّذِينَ  أَنْعَمْتَ  الْمَغْضُوبِ  وَلَا  الضَّالِّينَ  \
0  Term Frequency          1           1             1      1             1   

   صِرَاطَ  عَلَيْهِمْ  غَيْرِ  
0        1           2       1  


In [0]:
#Normalized Term Frequency
def termFrequency(term, document):
    normalizeDocument = document.lower().split()
    return normalizeDocument.count(term.lower()) / float(len(normalizeDocument))

def compute_normalizedtf(documents):
    tf_doc = []
    for txt in documents:
        sentence = txt.split()
        norm_tf= dict.fromkeys(set(sentence), 0)
        for word in sentence:
            norm_tf[word] = termFrequency(word, txt)
        tf_doc.append(norm_tf)
        df = pd.DataFrame([norm_tf])
        idx = 0
        new_col = ["Normalized TF"]    
        df.insert(loc=idx, column='Document', value=new_col)
        print(df)
    return tf_doc

tf_doc = compute_normalizedtf([doc1, doc2, doc3])

        Document  الْعَالَمِينَ  الْحَمْدُ  رَبِّ  لِلَّهِ
0  Normalized TF           0.25       0.25   0.25     0.25
        Document  الْعَالَمِينَ  الْحَمْدُ  رَبِّ  لِلَّهِ
0  Normalized TF           0.25       0.25   0.25     0.25
        Document  الَّذِينَ  أَنْعَمْتَ  الْمَغْضُوبِ     وَلَا  الضَّالِّينَ  \
0  Normalized TF   0.111111    0.111111      0.111111  0.111111      0.111111   

    صِرَاطَ  عَلَيْهِمْ    غَيْرِ  
0  0.111111    0.222222  0.111111  


In [0]:
def inverseDocumentFrequency(term, allDocuments):
    numDocumentsWithThisTerm = 0
    for doc in range (0, len(allDocuments)):
        if term.lower() in allDocuments[doc].lower().split():
            numDocumentsWithThisTerm = numDocumentsWithThisTerm + 1
 
    if numDocumentsWithThisTerm > 0:
        return 1.0 + math.log(float(len(allDocuments)) / numDocumentsWithThisTerm)
    else:
        return 1.0
    
def compute_idf(documents):
    idf_dict = {}
    for doc in documents:
        sentence = doc.split()
        for word in sentence:
            idf_dict[word] = inverseDocumentFrequency(word, documents)
    return idf_dict
idf_dict = compute_idf([doc1, doc2, doc3])

compute_idf([doc1, doc2, doc3])

Out[17]: {'الْحَمْدُ': 1.4054651081081644,
 'لِلَّهِ': 1.4054651081081644,
 'رَبِّ': 1.4054651081081644,
 'الْعَالَمِينَ': 1.4054651081081644,
 'صِرَاطَ': 2.09861228866811,
 'الَّذِينَ': 2.09861228866811,
 'أَنْعَمْتَ': 2.09861228866811,
 'عَلَيْهِمْ': 2.09861228866811,
 'غَيْرِ': 2.09861228866811,
 'الْمَغْضُوبِ': 2.09861228866811,
 'وَلَا': 2.09861228866811,
 'الضَّالِّينَ': 2.09861228866811}

In [0]:
#Given below is TF * IDF calculations for رَبِّ in all the documents.

In [0]:
# tf-idf score across all docs for the query string("life learning")
def compute_tfidf_with_alldocs(documents , query):
    tf_idf = []
    index = 0
    query_tokens = query.split()
    df = pd.DataFrame(columns=['doc'] + query_tokens)
    for doc in documents:
        df['doc'] = np.arange(0 , len(documents))
        doc_num = tf_doc[index]
        sentence = doc.split()
        for word in sentence:
            for text in query_tokens:
                if(text == word):
                    idx = sentence.index(word)
                    tf_idf_score = doc_num[word] * idf_dict[word]
                    tf_idf.append(tf_idf_score)
                    df.iloc[index, df.columns.get_loc(word)] = tf_idf_score
        index += 1
    df.fillna(0 , axis=1, inplace=True)
    return tf_idf , df
            
documents = [doc1, doc2, doc3]
tf_idf , df = compute_tfidf_with_alldocs(documents , query)
print(df)

   doc     رَبِّ
0    0  0.351366
1    1  0.351366
2    2  0.000000
