## **BIBLIOTEKA RE DO OBSŁUGI WYRAŻEŃ REGULARNYCH (RegEx)**



In [2]:
import re

In [3]:
def test(match):
  if match:
    print('found: ', match.group())
  else:
    print('match not found')

In [4]:
# Sample inputs shared by the regex examples below.

# Plain text: one with word characters only, one with punctuation separators.
txt1 = 'abcdefghijk'
txt2 = 'abc@def#ghi%jkl^'

# E-mail addresses with dots in the domain and (for email2) in the local part.
email1 = '123456@student.pwr.edu.pl'
email2 = 'jan.kowalski@gmail.com'

# A single web-server access-log line (split only for readability; the two
# adjacent literals are concatenated into one string).
log = (
    'rose.glg.ed.ac.uk - - [27/Jul/1995:08:30:56 -0400] '
    '"GET /images/NASA-logosmall align=left HTTP/1.0" 200 786'
)

**search()**

In [5]:
# search() returns the first match; txt1 consists solely of word characters,
# so \w+ spans the whole string.
match = re.compile(r'\w+').search(txt1)
test(match)

found:  abcdefghijk


In [6]:
# '@' is not a word character, so the first \w+ run in txt2 ends right before it.
match = re.compile(r'\w+').search(txt2)
test(match)

found:  abc


In [7]:
# Naive e-mail pattern: \w does not match '.', so the match is cut at the
# nearest dot on either side of '@'.
email_regex = re.compile(r'\w+@\w+')
match1 = email_regex.search(email1)
match2 = email_regex.search(email2)
test(match1)
test(match2)

found:  123456@student
found:  kowalski@gmail


In [8]:
# The character class [\w.] also accepts dots, so the whole address is matched.
email_regex = re.compile(r'[\w.]+@[\w.]+')
match1 = email_regex.search(email1)
match2 = email_regex.search(email2)
test(match1)
test(match2)

found:  123456@student.pwr.edu.pl
found:  jan.kowalski@gmail.com


**grupy**

In [9]:
# Parentheses create capture groups: group 1 is the part before '@',
# group 2 the part after it.
email_regex = re.compile(r'([\w.]+)@([\w.]+)')
match1 = email_regex.search(email1)
match2 = email_regex.search(email2)

# For each address show the full match, then the tuple of captured groups.
for found in (match1, match2):
  test(found)
  print(found.groups())

found:  123456@student.pwr.edu.pl
('123456', 'student.pwr.edu.pl')
found:  jan.kowalski@gmail.com
('jan.kowalski', 'gmail.com')


**findall()**

In [10]:
# findall() returns a list with every non-overlapping match, not just the first.
match = re.compile(r'\w+').findall(txt2)
match

['abc', 'def', 'ghi', 'jkl']

In [11]:
# '@' and '.' are not word characters, so the address is split into its parts.
match = re.compile(r'\w+').findall(email1)
match

['123456', 'student', 'pwr', 'edu', 'pl']

**sub()**

In [12]:
# Replace the domain: \1 in the replacement re-inserts the text captured by
# the first group (the part before '@'); the second group is dropped.
sub = re.sub(
    pattern=r'([\w.]+)@([\w.]+)',
    repl=r'\1@gmail.com',
    string=email1,
)
sub

'123456@gmail.com'

**compile() i złożone wyrażenia regex**

In [13]:
# Pattern for one access-log line in the layout of the `log` sample.
# Capture groups, in order:
#   1. leading non-whitespace token (the host); the next two tokens are skipped
#   2. date (dd/Mon/yyyy)
#   3. time (hh:mm:ss); the +/-hhmm offset is matched but not captured
#   4. request path; an optional upper-case method before it and up to three
#      further tokens after it (inside the quotes) are matched without capturing
#   5. three-digit status code
#   6. response size: digits or '-'
log_regex = re.compile(
    r'^(\S+) \S+ \S+ \[(\d{2}/\w{3}/\d{4}):(\d{2}:\d{2}:\d{2}) [-+]\d{4}\]'
    r' "(?:[A-Z]+ )?([^\s"]*)(?:\s+\S*){0,3}" (\d{3}) (\d+|-)$'
)

# match() anchors at the start of the string; groups() yields all six captures.
match = log_regex.match(log)
print(match.groups())

('rose.glg.ed.ac.uk', '27/Jul/1995', '08:30:56', '/images/NASA-logosmall', '200', '786')


## **BIBLIOTEKA JSON**

In [14]:
import json

**dump() i dumps()**

In [15]:
# dumps() serializes to a string; the tuple of groups becomes a JSON array.
groups_json = json.dumps(match.groups(), indent=1)
print(groups_json)

[
 "rose.glg.ed.ac.uk",
 "27/Jul/1995",
 "08:30:56",
 "/images/NASA-logosmall",
 "200",
 "786"
]


In [21]:
# Build a typed record from the six groups captured by the log pattern.
# The last group of that pattern is (\d+|-): a size of '-' is a valid match
# but int('-') raises ValueError, so '-' (no bytes sent) is stored as 0.
log_dict = {
    'domain': match.group(1),
    'date': match.group(2),
    'time': match.group(3),
    'path': match.group(4),
    'code': int(match.group(5)),
    'size': 0 if match.group(6) == '-' else int(match.group(6)),
}

In [22]:
# Serialize the record to a JSON-formatted string (2-space indentation)
# and show the resulting text.
log_json = json.dumps(
    log_dict,
    indent=2,
)
print(log_json)

{
  "domain": "rose.glg.ed.ac.uk",
  "date": "27/Jul/1995",
  "time": "08:30:56",
  "path": "/images/NASA-logosmall",
  "code": 200,
  "size": 786
}


In [23]:
# dump() writes JSON straight to a file object. The encoding is given
# explicitly so the file does not depend on the platform's locale default.
with open('log.json', 'w', encoding='utf-8') as file:
  json.dump(log_dict, file, indent=2)

**load() i loads()**

In [24]:
# loads() parses a JSON string back into Python objects.
log_dict = json.loads(log_json)
# A bare `type(log_dict)` in the middle of a cell is evaluated and discarded
# (only the last expression of a cell is displayed), so check the type explicitly.
assert isinstance(log_dict, dict)
print(log_dict)

{'domain': 'rose.glg.ed.ac.uk', 'date': '27/Jul/1995', 'time': '08:30:56', 'path': '/images/NASA-logosmall', 'code': 200, 'size': 786}


In [25]:
# load() parses JSON from a file object. The encoding is given explicitly so
# reading does not depend on the platform's locale default.
# NOTE: after this cell `log_json` holds the parsed dict, not a JSON string.
with open('log.json', 'r', encoding='utf-8') as file:
  log_json = json.load(file)
print(log_json)

{'domain': 'rose.glg.ed.ac.uk', 'date': '27/Jul/1995', 'time': '08:30:56', 'path': '/images/NASA-logosmall', 'code': 200, 'size': 786}
