# Práctica 4: Ingesta de datos con Logstash

In [5]:
# Upload the `web-logs` index template, sending the JSON file on disk as the request body.
!curl --request PUT \
      --header 'Content-Type: application/json' \
      --data "@../data/web_logs/elasticsearch/web-logs-template.json" \
      http://elasticsearch:9200/_index_template/web-logs

{"acknowledged":true}

In [None]:
# Upload the `web-logs` ingest pipeline, sending the JSON file on disk as the request body.
!curl --request PUT \
      --header 'Content-Type: application/json' \
      --data "@../data/web_logs/elasticsearch/web-logs-pipeline.json" \
      http://elasticsearch:9200/_ingest/pipeline/web-logs

In [4]:
# PUT the `web-logs` ingest pipeline with the definition written inline below
# (same `_ingest/pipeline/web-logs` endpoint as the file-based PUT in the previous cell).
# Processor order, as listed in the JSON body:
#   1. copy `message` into `event.original` and stamp `event.ingested`;
#   2. grok `message` against four access-log layouts, then remove `message`;
#   3. set `event.kind` / `event.category`, and `event.outcome` from the status code
#      (success below 400, failure at 400 and above);
#   4. grok `source.address` into `source.ip` or `source.domain`;
#   5. move `@timestamp` to `event.created`, parse `apache.access.time` into `@timestamp`,
#      then remove `apache.access.time`;
#   6. run user_agent on `user_agent.original` and geoip on `source.ip` (geo and ASN lookups),
#      renaming the ASN fields under `source.as`;
#   7. set `tls.cipher` and derive `tls.version` / `tls.version_protocol` from the SSL protocol.
# The pipeline-level on_failure handler writes the failure message to `error.message`.
!curl -X PUT http://elasticsearch:9200/_ingest/pipeline/web-logs -H 'Content-Type: application/json' -d' \
{ \
  "description" : "Pipeline for parsing Apache HTTP Server access logs. Requires the geoip and user_agent plugins.", \
  "processors" : [ \
    { \
      "set": { \
        "field": "event.original", \
        "value": "{{message}}" \
      } \
    }, \
    { \
      "set" : { \
        "field" : "event.ingested", \
        "value" : "{{_ingest.timestamp}}" \
      } \
    }, \
    { \
      "grok" : { \
        "patterns" : [ \
          "%{IPORHOST:destination.domain} %{IPORHOST:source.ip} - %{DATA:user.name} \\[%{HTTPDATE:apache.access.time}\\] \"(?:%{WORD:http.request.method} %{DATA:url.original} HTTP/%{NUMBER:http.version}|-)?\" %{NUMBER:http.response.status_code:long} (?:%{NUMBER:http.response.body.bytes:long}|-)( \"%{DATA:http.request.referrer}\")?( \"%{DATA:user_agent.original}\")?", \
          "%{IPORHOST:source.address} - %{DATA:user.name} \\[%{HTTPDATE:apache.access.time}\\] \"(?:%{WORD:http.request.method} %{DATA:url.original} HTTP/%{NUMBER:http.version}|-)?\" %{NUMBER:http.response.status_code:long} (?:%{NUMBER:http.response.body.bytes:long}|-)( \"%{DATA:http.request.referrer}\")?( \"%{DATA:user_agent.original}\")?", \
          "%{IPORHOST:source.address} - %{DATA:user.name} \\[%{HTTPDATE:apache.access.time}\\] \"-\" %{NUMBER:http.response.status_code:long} -", \
          "\\[%{HTTPDATE:apache.access.time}\\] %{IPORHOST:source.address} %{DATA:apache.access.ssl.protocol} %{DATA:apache.access.ssl.cipher} \"%{WORD:http.request.method} %{DATA:url.original} HTTP/%{NUMBER:http.version}\" (-|%{NUMBER:http.response.body.bytes:long})" \
        ], \
        "ignore_missing" : true, \
        "field" : "message" \
      } \
    }, \
    { \
      "remove" : { \
        "field" : "message" \
      } \
    }, \
    { \
      "set" : { \
        "field" : "event.kind", \
        "value" : "event" \
      } \
    }, \
    { \
      "set" : { \
        "field" : "event.category", \
        "value" : "web" \
      } \
    }, \
    { \
      "set" : { \
        "value" : "success", \
        "if" : "ctx?.http?.response?.status_code != null && ctx.http.response.status_code < 400", \
        "field" : "event.outcome" \
      } \
    }, \
    { \
      "set" : { \
        "field" : "event.outcome", \
        "value" : "failure", \
        "if" : "ctx?.http?.response?.status_code != null && ctx.http.response.status_code > 399" \
      } \
    }, \
    { \
      "grok" : { \
        "field" : "source.address", \
        "ignore_missing" : true, \
        "patterns" : [ \
          "^(%{IP:source.ip}|%{HOSTNAME:source.domain})$" \
        ] \
      } \
    }, \
    { \
      "rename" : { \
        "target_field" : "event.created", \
        "field" : "@timestamp" \
      } \
    }, \
    { \
      "date" : { \
        "ignore_failure" : true, \
        "field" : "apache.access.time", \
        "target_field" : "@timestamp", \
        "formats" : [ \
          "dd/MMM/yyyy:H:m:s Z" \
        ] \
      } \
    }, \
    { \
      "remove" : { \
        "field" : "apache.access.time", \
        "ignore_failure" : true \
      } \
    }, \
    { \
      "user_agent" : { \
        "field" : "user_agent.original", \
        "ignore_failure" : true \
      } \
    }, \
    { \
      "geoip" : { \
        "field" : "source.ip", \
        "target_field" : "source.geo", \
        "ignore_missing" : true \
      } \
    }, \
    { \
      "geoip" : { \
        "target_field" : "source.as", \
        "properties" : [ \
          "asn", \
          "organization_name" \
        ], \
        "ignore_missing" : true, \
        "database_file" : "GeoLite2-ASN.mmdb", \
        "field" : "source.ip" \
      } \
    }, \
    { \
      "rename" : { \
        "field" : "source.as.asn", \
        "target_field" : "source.as.number", \
        "ignore_missing" : true \
      } \
    }, \
    { \
      "rename" : { \
        "ignore_missing" : true, \
        "field" : "source.as.organization_name", \
        "target_field" : "source.as.organization.name" \
      } \
    }, \
    { \
      "set" : { \
        "field" : "tls.cipher", \
        "value" : "{{apache.access.ssl.cipher}}", \
        "ignore_empty_value" : true \
      } \
    }, \
    { \
      "script" : { \
        "lang" : "painless", \
        "if" : "ctx?.apache?.access?.ssl?.protocol != null", \
        "source" : "def parts = ctx.apache.access.ssl.protocol.toLowerCase().splitOnToken(\"v\"); if (parts.length != 2) {\n  return;\n} if (parts[1].contains(\".\")) {\n  ctx.tls.version = parts[1];\n} else {\n  ctx.tls.version = parts[1] + \".0\";\n} ctx.tls.version_protocol = parts[0];" \
      } \
    } \
  ], \
  "on_failure" : [ \
    { \
      "set" : { \
        "field" : "error.message", \
        "value" : "{{ _ingest.on_failure_message }}" \
      } \
    } \
  ] \
}'

{"acknowledged":true}

66.249.66.194 - - [22/Jan/2019:03:57:54 +0330] "GET /m/filter/b1,p6 HTTP/1.1" 200 19486 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"

Definición del pipeline de ingesta

`
input {
  # `path` is an option of the file input plugin, so it must live inside a `file { }` block.
  file {
    path => "/tmp/data/*"
    # The file input tails from the end by default; start at the beginning so the
    # log files already present in the mounted directory are ingested.
    start_position => "beginning"
  }
}

filter {
  # Drop Logstash bookkeeping fields that are not wanted in the indexed documents.
  mutate {
    remove_field => ["host", "@version"]
  }
}

output {
  # Print one dot per event as a lightweight progress indicator.
  stdout {
    codec => dots {}
  }

  # Send events through the `web-logs` ingest pipeline into the `web-logs` index.
  # Logstash runs in its own container, so Elasticsearch is addressed by its
  # service host name (the same host the curl cells use), not by localhost.
  elasticsearch {
    hosts => "http://elasticsearch:9200"
    user => "elastic"
    password => "<password>"
    index => "web-logs"
    pipeline => "web-logs"
  }
}
`



In [None]:
# Run the Logstash 8.3.3 image in a throwaway container (--rm) attached to the
# `datahack-nosql_default` network, bind-mounting the pipeline directory to
# /usr/share/logstash/pipeline/ and the log data directory to /tmp/data/.
# NOTE(review): unlike the curl cells, this command has no leading `!`, so the notebook
# kernel cannot run it as written; presumably it is meant for a host terminal (confirm).
# NOTE(review): `-it` asks for an interactive TTY, which also suggests a terminal (confirm).
# NOTE(review): the -v host paths are absolute and specific to one machine; adjust them.
docker run --rm -it --network=datahack-nosql_default \
    -v /Users/rgarrote/desarrollo/datahack-nosql/work/data/elasticsearch/web_logs/pipeline/:/usr/share/logstash/pipeline/ \
    -v /Users/rgarrote/desarrollo/datahack-nosql/work/data/elasticsearch/web_logs/data/:/tmp/data/ \
docker.elastic.co/logstash/logstash:8.3.3

## 1. Find all the HTTP events with an HTTP response code of 200.

A term query can be used on the http.response.status_code field, as shown in the following request:

`
GET web-logs/_search
{
  "query": {
    "term": {
      "http.response.status_code": {
        "value": "200"
      }
    }
  }
}
`



## 2. Find all HTTP events where the request method was of the POST type and resulted in a non-200 response code.


Use two term queries within a bool compound query: the must clause requires the POST request method, and the must_not clause excludes all 200 response codes, as shown in the following request:

`
GET web-logs/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "http.request.method": {
              "value": "POST"
            }
          }
        }
      ],
      "must_not": [
        {
          "term": {
            "http.response.status_code": {
              "value": "200"
            }
          }
        }
      ]
    }
  }
}
`



## 3. Find all HTTP events referencing the terms refrigerator and windows anywhere in the document.

A match query can be used on the event.original field. The and operator requires that both words (tokens) exist in the resulting document. Run the following query:

`
GET web-logs/_search
{
  "query": {
    "match": {
      "event.original": { "query": "refrigerator windows", "operator": "and" }
    }
  }
}
`



## 4. Look for all requests where users on Windows machines were looking at refrigerator-related pages on the website.


Use a bool compound query with two match queries, as shown in the following command block:

`
GET web-logs/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "url.original.text": "refrigerator"
          }
        },
        {
          "match": {
            "user_agent.os.full.text": "windows"
          }
        }
      ]
    }
  }
}
`



## 5. Look for all events originating from either South Africa, Ireland, or Hong Kong.


Use a terms query to match documents whose field value equals any of the terms in a list, as shown in the following command block:

`
GET web-logs/_search
{
  "query": {
    "terms": {
      "source.geo.country_name": [
        "South Africa",
        "Ireland",
        "Hong Kong"
      ]
    }
  }
}
`

## 6. Find all HTTP GET events with response bodies of more than 100,000 bytes.


Use a bool query containing a term match for GET events and a range filter for the numeric http.response.body.bytes field, as shown here:

`
GET web-logs/_search
{
  "query": {
    "bool": {
      "must": [
        { "term": { "http.request.method": { "value": "GET" } } },
        { "range": { "http.response.body.bytes": { "gt": 100000 } } }
      ]
    }
  }
}
`
