In [56]:
try:
    import os
    import sys
    import elasticsearch

    from elasticsearch import Elasticsearch
    import pandas as pd

    from elasticsearch import helpers

    from pprint import pprint


    print("All Modules Loaded")

except Exception as e:
    print("Some Modules missing {}".format(e))

All Modules Loaded


# Connecting to ES

In [3]:
ELASTIC_PASSWORD = "-nllMUvtzVwlTjd7WL51"
CERT_FINGERPRINT = "A5:95:FB:52:FC:8C:BC:E5:2E:70:55:91:08:E4:78:E0:2E:D1:48:31:93:69:40:29:52:47:FF:48:95:12:6C:70"

In [4]:
es = Elasticsearch(
    "https://localhost:9200",
    ca_certs="/Users/saujanyatiwari/codes/elasticsearch-8.3.2/config/certs/http_ca.crt",
    http_auth = ("elastic", ELASTIC_PASSWORD),
)

In [5]:
es.ping()

True

# Cresting an index

In [6]:
es.indices.create(index="my-foo", ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'my-foo'}

In [7]:
res = es.indices.get_alias("")
for name in res:
    print(name)

.kibana-event-log-8.3.2-000001
.kibana_security_session_1
.kibana_task_manager_8.3.2_001
.apm-agent-configuration
.kibana_8.3.2_001
.security-7
person
my-foo
.tasks
kibana_sample_data_ecommerce
.apm-custom-link
.async-search




In [12]:
es.indices.delete(index="my-foo", ignore=[400, 404])

{'acknowledged': True}

# Upload to sample JSON DOC

In [13]:
saujanya = {
    "name": "saujanya",
    "company": "HyperVue AI",
    "post": "FullStack"
}

niti = {
    "name": "niti",
    "company": "Canara Bank",
    "post": "PO"
}

In [14]:
es.indices.create(index = "person")

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'person'}

In [15]:
saujanya_upload = es.index(
    index="person",
    # doc_type= "employee",
    body=saujanya,
    id=1
)

In [16]:
saujanya_upload

{'_index': 'person',
 '_id': '1',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [17]:
niti_upload = es.index(
    index="person",
    body=niti,
    id=2
)

In [18]:
niti_upload

{'_index': 'person',
 '_id': '2',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 1,
 '_primary_term': 1}

## Bulk Upload in Elastic search cluster

In [30]:
df = pd.read_csv("./amazon_prime_titles.csv")
df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [31]:
del df["date_added"]
del df["country"]

In [32]:
df.isna().sum()

show_id            0
type               0
title              0
director        2082
cast            1233
release_year       0
rating           337
duration           0
listed_in          0
description        0
dtype: int64

In [33]:
df = df.dropna()

In [34]:
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [35]:
df.shape

(6646, 10)

In [38]:
df_dict = df.to_dict("records")

In [40]:
df_dict[0]

{'show_id': 's2',
 'type': 'Movie',
 'title': 'Take Care Good Night',
 'director': 'Girish Joshi',
 'cast': 'Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar',
 'release_year': 2018,
 'rating': '13+',
 'duration': '110 min',
 'listed_in': 'Drama, International',
 'description': 'A Metro Family decides to fight a Cyber Criminal threatening their stability and pride.'}

### Convert data into ELK format

In [42]:
def generator(df):
    for i, data in enumerate(df):
        yield {
            "_index": "amazon_prime",
            "_id": data.get("show_id", None),
            "_source": {
                "title": data.get("title", ""),
                "type": data.get("type", ""),
                "director": data.get("director", ""),
                "cast": data.get("cast", "")
            }
        }
    raise StopIteration

In [43]:
data_for_elk = generator(df.to_dict("records"))

In [45]:
next(data_for_elk)

{'_index': 'amazon_prime',
 '_id': 's17',
 '_source': {'title': 'Zoombies',
  'type': 'Movie',
  'director': 'Glenn Miller',
  'cast': 'Marcus Anderson, Kaiwi Lyman, Andrew Asper'}}

### Settings and mapping

In [46]:
settings = {
    "settings": {
        "number_of_replicas": 0,
        "number_of_shards": 1
    },
    "mappings": {
        "properties": {
            "type": {
                "type": "text",
            },
            "cast": {
                "type": "text"
            }
        }
    }
}

In [51]:
amazon_index = es.indices.create(index="amazon_prime", body=settings)

In [52]:
amazon_index

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'amazon_prime'}

In [54]:
try:
    res = helpers.bulk(es, generator(df_dict))
except Exception as e:
    print(e)
    pass

generator raised StopIteration


In [57]:
pprint(res)

{'.apm-agent-configuration': {'aliases': {}},
 '.apm-custom-link': {'aliases': {}},
 '.async-search': {'aliases': {}},
 '.kibana-event-log-8.3.2-000001': {'aliases': {'.kibana-event-log-8.3.2': {'is_hidden': True,
                                                                            'is_write_index': True}}},
 '.kibana_8.3.2_001': {'aliases': {'.kibana': {'is_hidden': True},
                                   '.kibana_8.3.2': {'is_hidden': True}}},
 '.kibana_security_session_1': {'aliases': {}},
 '.kibana_task_manager_8.3.2_001': {'aliases': {'.kibana_task_manager': {'is_hidden': True},
                                                '.kibana_task_manager_8.3.2': {'is_hidden': True}}},
 '.security-7': {'aliases': {'.security': {'is_hidden': True}}},
 '.tasks': {'aliases': {}},
 'kibana_sample_data_ecommerce': {'aliases': {}},
 'my-foo': {'aliases': {}},
 'person': {'aliases': {}}}


## Querying

## MATCH

### match_phrase = Matches the exact phrase

`GET amazon_prime/_search
{
  "_source": ["title", "cast", "director"],
  "size": 2000,
  "query": {
    "match_phrase": {
      "title": "one great love"
    }
  }
}`

### match = matches with document having any one of the words in query

`
GET amazon_prime/_search
{
  "_source": ["title", "cast", "director"],
  "size": 2000,
  "query": {
    "match": {
      "title": "love great one"
    }
  }
}
`

## AGGREGATION

###

In [1]:
from datetime import datetime, timedelta
from itertools import product

years = ['2019', '2020', '2021']
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = months + list(range(13, 29))
# days = ["01"]
years = ['2021'] # change til 2021
indexes = []

datetime_now = datetime.utcnow()
past_three_months = datetime_now - timedelta(days=2 * 29)
# past_three_months = datetime_now
# past_three_months = past_three_months.replace(day=1)
months_till_to_delete = '{:02d}'.format(past_three_months.month)
for y, m, d in product(years, months, days):
    # if m in ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] and y == "2019":
    #     continue
    if y == "2021" and m in ["01"]:
        continue
    if m == months_till_to_delete and y == "2021":
        break
    indexes.append((y, m, d))
print(indexes)
indexes.reverse()

[('2021', '02', '01'), ('2021', '02', '02'), ('2021', '02', '03'), ('2021', '02', '04'), ('2021', '02', '05'), ('2021', '02', '06'), ('2021', '02', '07'), ('2021', '02', '08'), ('2021', '02', '09'), ('2021', '02', '10'), ('2021', '02', '11'), ('2021', '02', '12'), ('2021', '02', 13), ('2021', '02', 14), ('2021', '02', 15), ('2021', '02', 16), ('2021', '02', 17), ('2021', '02', 18), ('2021', '02', 19), ('2021', '02', 20), ('2021', '02', 21), ('2021', '02', 22), ('2021', '02', 23), ('2021', '02', 24), ('2021', '02', 25), ('2021', '02', 26), ('2021', '02', 27), ('2021', '02', 28), ('2021', '03', '01'), ('2021', '03', '02'), ('2021', '03', '03'), ('2021', '03', '04'), ('2021', '03', '05'), ('2021', '03', '06'), ('2021', '03', '07'), ('2021', '03', '08'), ('2021', '03', '09'), ('2021', '03', '10'), ('2021', '03', '11'), ('2021', '03', '12'), ('2021', '03', 13), ('2021', '03', 14), ('2021', '03', 15), ('2021', '03', 16), ('2021', '03', 17), ('2021', '03', 18), ('2021', '03', 19), ('2021', '0

In [4]:
past_three_months

datetime.datetime(2022, 5, 27, 17, 32, 22, 141507)