In [1]:
import pandas as pd
from elasticsearch import Elasticsearch, NotFoundError, helpers

In [2]:
# Create the Elasticsearch client.
# The URL carries an explicit scheme: older clients accept a bare "host:port",
# but newer elasticsearch-py releases require a full URL.
es = Elasticsearch("http://localhost:9200")

In [3]:
# Load the online retail CSV into a dataframe.
RETAIL_CSV_PATH = "D:/Datasets/spark_book_data/retail-data/all/online-retail-dataset.csv"
df = pd.read_csv(RETAIL_CSV_PATH)

In [4]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [5]:
# Column dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null object
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [6]:
# Keep only rows in which every column has a value.
df = df.dropna()

In [7]:
# Re-check non-null counts after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      406829 non-null object
StockCode      406829 non-null object
Description    406829 non-null object
Quantity       406829 non-null int64
InvoiceDate    406829 non-null object
UnitPrice      406829 non-null float64
CustomerID     406829 non-null float64
Country        406829 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 27.9+ MB


# Index ve mappings oluştur

In [8]:
# Index definition (settings + mappings) for the online retail data.
# NOTE(review): no es.indices.create(...) call using this body is visible in
# the notebook; it only takes effect if the index is created with it.
online_retail_index = {
    "settings": {
        "index": {
            "analysis": {
                # NOTE(review): custom_analyzer is defined here but no field in
                # "mappings" references it -- confirm whether Description
                # was meant to use it.
                "analyzer": {
                    "custom_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase", "custom_edge_ngram", "asciifolding"
                        ]
                    }
                },
                "filter": {
                    "custom_edge_ngram": {
                        "type": "edge_ngram",
                        "min_gram": 2,
                        "max_gram": 10
                    }
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "InvoiceNo": {"type": "keyword"},
            "StockCode": {"type": "keyword"},
            "Description": {"type": "text"},
            "Quantity": {"type": "integer"},
            "InvoiceDate": {
                "type": "date",
                # Source values look like "12/1/2010 8:26" and "12/5/2010 13:42":
                # non-padded month/day, 24-hour clock, minutes (not seconds).
                "format": "M/d/yyyy H:mm"
            },
            "UnitPrice": {"type": "float"},
            "CustomerID": {"type": "keyword"},
            "Country": {"type": "keyword"}
        }
    }
}

In [9]:
# Look at the first rows again before indexing.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


## Dataframe'i Elasticsearch'e aktaran fonksiyon

In [10]:
def dataframe_to_es(df, es_index):
    """Yield one bulk-index action per row of ``df``.

    Each action targets ``es_index``, uses the row's dataframe index label
    as the document ``_id`` and copies the eight retail columns into
    ``_source``.

    Columns are read by name rather than by position, so the result does
    not depend on the column order of ``df`` and does not rely on integer
    keys falling back to positional access on a label-indexed Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns InvoiceNo, StockCode, Description,
        Quantity, InvoiceDate, UnitPrice, CustomerID and Country.
    es_index : str
        Name of the target index, placed in every action's ``_index``.

    Yields
    ------
    dict
        An action with the keys ``_index``, ``_id`` and ``_source``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    fields = (
        "InvoiceNo",
        "StockCode",
        "Description",
        "Quantity",
        "InvoiceDate",
        "UnitPrice",
        "CustomerID",
        "Country",
    )
    for df_idx, line in df.iterrows():
        yield {
            "_index": es_index,
            "_id": df_idx,
            "_source": {field: line[field] for field in fields},
        }

In [11]:
# Remove any previous copy of the index so the load starts from scratch.
# Only a missing index is treated as harmless; any other failure
# (for example an unreachable cluster) is allowed to surface instead of
# being reported as "No index".
try:
    es.indices.delete(index="online_retail_python")
except NotFoundError:
    print("No index")

No index


In [None]:
# Stream every dataframe row to Elasticsearch through the bulk helper.
# raise_on_error=False means rejected documents do not raise here.
# NOTE(review): no index-creation call is visible before this cell, so the
# index is presumably auto-created on first write -- confirm.
retail_actions = dataframe_to_es(df, "online_retail_python")
helpers.bulk(es, retail_actions, raise_on_error=False)

# Search

In [27]:
keyword = "Coffee"

# Match the keyword against Description, wrapped in a single bool/should clause.
description_match = {"match": {"Description": keyword}}
search_body = {"query": {"bool": {"should": [description_match]}}}

res = es.search(index='online_retail_python', body=search_body)

In [28]:
# Read the max_score value from the search response.
res['hits']['max_score']

5.77428

In [29]:
# Show the first four hits of the response.
res['hits']['hits'][:4]

[{'_index': 'online_retail_python',
  '_type': '_doc',
  '_id': '9103',
  '_score': 5.77428,
  '_source': {'InvoiceNo': '537192',
   'StockCode': '20748',
   'Description': 'KENSINGTON COFFEE SET',
   'Quantity': 1,
   'InvoiceDate': '12/5/2010 13:42',
   'UnitPrice': 12.75,
   'CustomerID': 16402.0,
   'Country': 'United Kingdom'}},
 {'_index': 'online_retail_python',
  '_type': '_doc',
  '_id': '14891',
  '_score': 5.77428,
  '_source': {'InvoiceNo': '537624',
   'StockCode': '20748',
   'Description': 'KENSINGTON COFFEE SET',
   'Quantity': 1,
   'InvoiceDate': '12/7/2010 14:41',
   'UnitPrice': 12.75,
   'CustomerID': 12748.0,
   'Country': 'United Kingdom'}},
 {'_index': 'online_retail_python',
  '_type': '_doc',
  '_id': '22385',
  '_score': 5.77428,
  '_source': {'InvoiceNo': '538167',
   'StockCode': '20748',
   'Description': 'KENSINGTON COFFEE SET',
   'Quantity': 1,
   'InvoiceDate': '12/9/2010 18:58',
   'UnitPrice': 12.75,
   'CustomerID': 14713.0,
   'Country': 'United Ki