In [1]:
#Regular Expressions (Regex)
#Regular expressions are sequences of characters that form a search pattern.
#They are often used for string matching and manipulation.

#Compiling a Pattern
#In Python, you can compile a regular expression pattern using the re.compile() function. 
#This allows you to reuse the same pattern multiple times efficiently.


In [3]:
import re

pattern = re.compile(r'\d+')  # Matches one or more digits


#In this example, \d+ is a regex pattern that matches one or more digits.
#The r before the string denotes a raw string, which we will discuss later.

In [4]:
#Flags - ignorecase, dotall
#Flags modify the behavior of regex matching. Two common flags are re.IGNORECASE and re.DOTALL.

#re.IGNORECASE or re.I: Ignores case when matching.

pattern = re.compile(r'hello', re.IGNORECASE)
result = pattern.search('HeLLo world')
print(result.group())  # Outputs: 'HeLLo'
  

HeLLo


In [5]:
#re.DOTALL or re.S: Allows the dot (.) to match newline characters.
pattern = re.compile(r'hello.world', re.DOTALL)
result = pattern.search('hello\nworld')
print(result.group())  # Outputs: 'hello\nworld'


hello
world


In [6]:
#Working with Multiple Flags
#You can combine flags using the bitwise OR operator (|).
pattern = re.compile(r'hello.world', re.IGNORECASE | re.DOTALL)
result = pattern.search('HeLLo\nworld')
print(result.group())  # Outputs: 'HeLLo\nworld'


HeLLo
world


In [7]:
#Search vs Match
#re.search(): Searches for the pattern anywhere in the string.
result = re.search(r'\d+', 'abc 123 def')
print(result.group())  # Outputs: '123'


123


In [8]:
#re.match(): Only matches if the pattern is at the beginning of the string.
result = re.match(r'\d+', '123 def')
print(result.group())  # Outputs: '123'

result = re.match(r'\d+', 'abc 123 def')
print(result)  # Outputs: None


123
None


In [9]:
#Raw String Notations
#Raw strings (r'string') treat backslashes as literal characters, which is useful in regex patterns.
pattern = re.compile(r'\d+')  # Raw string notation
result = pattern.search('abc 123 def')
print(result.group())  # Outputs: '123'


123


In [10]:
#Without raw strings, you'd need to escape backslashes, making patterns harder to read:
pattern = re.compile('\\d+')


Special Characters
.: Matches any character except a newline.
^: Matches the start of the string.
$: Matches the end of the string.
*: Matches 0 or more repetitions of the preceding element.
+: Matches 1 or more repetitions of the preceding element.
?: Matches 0 or 1 repetition of the preceding element.
{m,n}: Matches from m to n repetitions of the preceding element.

In [11]:
pattern = re.compile(r'.')  # Matches any character except newline
pattern = re.compile(r'^hello')  # Matches 'hello' at the start of the string
pattern = re.compile(r'world$')  # Matches 'world' at the end of the string
pattern = re.compile(r'a*')  # Matches 0 or more 'a' characters
pattern = re.compile(r'a+')  # Matches 1 or more 'a' characters
pattern = re.compile(r'a?')  # Matches 0 or 1 'a' character
pattern = re.compile(r'a{2,4}')  # Matches between 2 and 4 'a' characters


Globbing Characters
Globbing characters are often used in file path patterns. While not strictly regex, they are similar:

*: Matches any number of any characters.
?: Matches any single character.

In [12]:
import glob

files = glob.glob('*.txt')  # Matches all .txt files in the current directory


In [13]:
"""Anchors
^: Start of a string.
$: End of a string.
\\b: Word boundary.
\\B: Not a word boundary.
"""

'Anchors\n^: Start of a string.\n$: End of a string.\n\\b: Word boundary.\n\\B: Not a word boundary.\n'

In [14]:
pattern = re.compile(r'^\d+$')  # Matches a string composed entirely of digits


In [15]:
#Character Sets
#Character sets allow you to match one out of several characters.
pattern = re.compile(r'[aeiou]')  # Matches any vowel
result = pattern.search('hello')
print(result.group())  # Outputs: 'e'


e


In [16]:
#You can also use ranges within character sets:

In [17]:
pattern = re.compile(r'[a-z]')  # Matches any lowercase letter


In [18]:
#Grouping
#Grouping is done using parentheses (), allowing you to capture parts of the match.

In [19]:
pattern = re.compile(r'(\d+)-(\d+)')
result = pattern.search('123-456')
if result:
    print(result.group(1))  # Outputs: '123'
    print(result.group(2))  # Outputs: '456'


123
456


In [20]:
"""
Summary
Regular expressions are powerful tools for text processing. Here are the key concepts:

Compiling a Pattern: Use re.compile() for efficiency.
Flags: Modify regex behavior (re.IGNORECASE, re.DOTALL).
Search vs Match: re.search() finds patterns anywhere; re.match() only at the start.
Raw Strings: Use r'' for regex patterns to avoid escaping backslashes.
Special Characters: Understand ., ^, $, *, +, ?, {m,n}.
Globbing: Use * and ? for file path patterns.
Anchors: Use ^, $, \\b, \\B to anchor patterns.
Character Sets: Use [] to match specific characters.
Grouping: Use () to capture groups in patterns."""

"\nSummary\nRegular expressions are powerful tools for text processing. Here are the key concepts:\n\nCompiling a Pattern: Use re.compile() for efficiency.\nFlags: Modify regex behavior (re.IGNORECASE, re.DOTALL).\nSearch vs Match: re.search() finds patterns anywhere; re.match() only at the start.\nRaw Strings: Use r'' for regex patterns to avoid escaping backslashes.\nSpecial Characters: Understand ., ^, $, *, +, ?, {m,n}.\nGlobbing: Use * and ? for file path patterns.\nAnchors: Use ^, $, \\b, \\B to anchor patterns.\nCharacter Sets: Use [] to match specific characters.\nGrouping: Use () to capture groups in patterns."

In [21]:
"""
Scenario: Log File Analysis
You are a software engineer working for a company that maintains a large web application.
The application logs various events to log files, and your task is to analyze these log files 
to extract useful information. The log files contain lines with the following format:

[YYYY-MM-DD HH:MM:SS] [LEVEL] [Module] Message
"""

'\nScenario: Log File Analysis\nYou are a software engineer working for a company that maintains a large web application.\nThe application logs various events to log files, and your task is to analyze these log files \nto extract useful information. The log files contain lines with the following format:\n\n[YYYY-MM-DD HH:MM:SS] [LEVEL] [Module] Message\n'

In [22]:
#Task 1: Extracting All Timestamps
#Write a Python function to extract all timestamps from the log file.
import re

log_data = """
[2024-07-18 10:23:45] [ERROR] [Authentication] User login failed
[2024-07-18 10:24:00] [INFO] [Payment] Payment processed successfully
[2024-07-18 10:25:30] [WARNING] [Database] Connection timeout
[2024-07-18 10:27:15] [INFO] [Authentication] User login succeeded
"""

def extract_timestamps(log_data):
    """Collect every bracketed timestamp found in ``log_data``.

    A timestamp is recognised when it is written as
    ``[YYYY-MM-DD HH:MM:SS]`` using digits only.

    Args:
        log_data: Log text to scan.

    Returns:
        A list of timestamp strings (without the surrounding brackets) in
        the order they appear; empty if none are found.
    """
    timestamp_re = re.compile(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]')
    # Group 1 is the timestamp text between the square brackets.
    return [hit.group(1) for hit in timestamp_re.finditer(log_data)]

timestamps = extract_timestamps(log_data)
print(timestamps)


['2024-07-18 10:23:45', '2024-07-18 10:24:00', '2024-07-18 10:25:30', '2024-07-18 10:27:15']


In [23]:
#Task 2: Counting Different Log Levels
#Write a Python function to count the number of occurrences of each log level (ERROR, INFO, WARNING).
def count_log_levels(log_data):
    """Count how many times each log level tag appears in ``log_data``.

    Only the bracketed tags ``[ERROR]``, ``[INFO]`` and ``[WARNING]`` are
    recognised. Levels that never occur are omitted from the result.

    Args:
        log_data: Log text to scan.

    Returns:
        A plain ``dict`` mapping each level name to its number of
        occurrences, keyed in the order each level is first seen.
    """
    # Local import keeps this cell runnable without touching other cells.
    from collections import Counter

    pattern = re.compile(r'\[(ERROR|INFO|WARNING)\]')
    # Counter tallies in one pass and remembers first-seen order, so the
    # printed result is reproducible (iterating a set of strings is not).
    return dict(Counter(pattern.findall(log_data)))

log_levels_count = count_log_levels(log_data)
print(log_levels_count)




In [24]:
#Task 3: Extracting Messages from a Specific Module
#Write a Python function to extract all log messages from a specific module (e.g., "Authentication").
def extract_module_messages(log_data, module_name):
    """Return the messages of all log lines emitted by ``module_name``.

    A line is expected to look like ``[timestamp] [LEVEL] [Module] message``.
    The module is only taken from the third bracketed field at the start of
    a line, so bracketed text inside another module's message is not
    mistaken for the module tag.

    Args:
        log_data: Log text to scan, one entry per line.
        module_name: Module name to match literally (regex metacharacters
            in it are escaped).

    Returns:
        A list of message strings (everything after the module tag up to
        the end of the line) in order of appearance; empty if none match.
    """
    pattern = re.compile(
        # ``[^\]]*`` keeps each field inside its own brackets; a lazy ``.*?``
        # could stretch over several ``] [`` pairs and match the module name
        # appearing later in the message. ``^`` with MULTILINE pins the
        # match to the start of a line (leading spaces/tabs tolerated).
        r'^[ \t]*\[[^\]]*\] \[[^\]]*\] \[' + re.escape(module_name) + r'\] (.*)',
        re.MULTILINE,
    )
    return pattern.findall(log_data)

auth_messages = extract_module_messages(log_data, 'Authentication')
print(auth_messages)


['User login failed', 'User login succeeded']


In [25]:
#Task 4: Identifying and Extracting User Login Failures
#Write a Python function to identify and extract the timestamps of all user login failures.
def extract_login_failures(log_data):
    """Find the timestamps of failed user logins in ``log_data``.

    An entry counts as a login failure when it is an ``[ERROR]`` line from
    the ``[Authentication]`` module whose message starts with
    ``User login failed``.

    Args:
        log_data: Log text to scan.

    Returns:
        A list of ``YYYY-MM-DD HH:MM:SS`` timestamp strings, in order of
        appearance; empty if there are no such entries.
    """
    failure_regex = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] \[ERROR\] \[Authentication\] User login failed'
    # The single capture group means findall yields just the timestamps.
    return re.findall(failure_regex, log_data)

login_failures = extract_login_failures(log_data)
print(login_failures)


['2024-07-18 10:23:45']


In [26]:
#Task 5: Extracting and Grouping Data
#Write a Python function to extract all log data and group it by date.
def group_logs_by_date(log_data):
    """Parse log entries and bucket them by calendar date.

    Only lines shaped like
    ``[YYYY-MM-DD HH:MM:SS] [ERROR|INFO|WARNING] [module] message`` are
    picked up; anything else is ignored.

    Args:
        log_data: Log text to scan.

    Returns:
        A dict mapping each ``YYYY-MM-DD`` date string to a list of dicts
        with the keys ``'log_level'``, ``'module'`` and ``'message'``,
        in the order the entries appear.
    """
    entry_re = re.compile(r'\[(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2}\] \[(ERROR|INFO|WARNING)\] \[(.*?)\] (.*)')
    grouped = {}
    for hit in entry_re.finditer(log_data):
        day, level, source, text = hit.groups()
        # setdefault creates the day's bucket the first time it is seen.
        grouped.setdefault(day, []).append(
            {'log_level': level, 'module': source, 'message': text}
        )
    return grouped

logs_grouped_by_date = group_logs_by_date(log_data)
print(logs_grouped_by_date)


