# Session 7 Demo

## Sorting

In [1]:
clients = ["Duanphen", "Bjorn", "Farah", "Alice", "Cayman", "Duanphen", "Esfir"]

# the 'sorted' function returns a new sorted list; it doesn't change the original list
sorted(clients)

['Alice', 'Bjorn', 'Cayman', 'Duanphen', 'Duanphen', 'Esfir', 'Farah']

In [2]:
# original list is still the same
clients

['Duanphen', 'Bjorn', 'Farah', 'Alice', 'Cayman', 'Duanphen', 'Esfir']

In [3]:
# the 'sort' method sorts 'in place' -- that is, it changes the original list
clients.sort()
clients

['Alice', 'Bjorn', 'Cayman', 'Duanphen', 'Duanphen', 'Esfir', 'Farah']

In [None]:
# let's generate some random numbers and sort an np array
import numpy as np
numbers = np.random.randint(0, 100, 10)
numbers

In [None]:
numbers.sort()
numbers

In [None]:
# Let's make a new array of random numbers to show off some other functionality
numbers = np.random.randint(0, 100, 10)
numbers

In [None]:
# We can reverse the order of sorting (descending instead of ascending)
sorted(numbers, reverse=True)

In [None]:
# Let's pair these up with random letters
letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
np.random.shuffle(letters)  # shuffles the list in place
d = dict(zip(letters, numbers))
d

In [None]:
# Sorting on a dictionary will sort on the keys
sorted(d)

In [None]:
# What if we want to sort by value?
# Here's one way to do it
sorted(d.items(), key=lambda item: item[1])

In [None]:
# Or if we only want the keys, we can do this.
sorted(d.keys(), key=lambda x: d[x])

Are you wondering, "What the heck is `lambda`?" [Learn more here](https://www.freecodecamp.org/news/python-lambda-function-explained/).

In [None]:
# We can also sort data frames
import pandas as pd

df = pd.DataFrame({'Name': ["Duanphen", "Bjorn", "Farah", "Alice", "Cayman", "Duanphen", "Esfir"],
                   'Age': [20, 28, 22, 21, 19, 17, 23],
                   'Rank': [1, np.nan, np.nan, 8, 9, 4, np.nan]})

df

In [None]:
df.sort_values(by=['Name'])

In [None]:
df.sort_values(by=['Age'], ascending=False)

## Basic Searching

In [None]:
# Let's define a list and a string for us to search
clients = ["Alice", "Bjorn", "Marie", "Cayman", "Duanphen", "Esfir", "Farah"]
bio = "The chief architect of this program is named Marie Eisenhower.  She received her degree in..."

In [None]:
# Is the string "Marie" an element in our list of clients?
"Marie" in clients

In [None]:
# What happens with a partial string?
"Mar" in clients

In [None]:
# What position (or index) can we find "Marie"?
clients.index("Marie")

In [None]:
# And if we try to find the index of someone who's not there?
# list.index raises a ValueError when the item is missing, so we catch it
# and print the message instead of letting the error stop the notebook.
try:
    clients.index("Margot")
except ValueError as err:
    print("ValueError:", err)

In [None]:
# What if we try to find Marie in the bio
"Marie" in bio

In [None]:
# Searching for partial strings works here, though!
# (Either way, 'in' on a string looks for a substring of the longer string.)
"Mar" in bio

In [None]:
# We can also look for the position/index
bio.index("Marie")

In [None]:
# This does the same thing!
bio.find("Marie")

In [None]:
# But you can't use "find" on lists: lists have no such method, so this
# raises an AttributeError. We catch it and print the message so the
# notebook can still run from top to bottom.
try:
    clients.find("Marie")
except AttributeError as err:
    print("AttributeError:", err)

In [None]:
# You can also count the number of occurrences in both strings and lists
bio = "The chief architect of this program is named Marie Eisenhower.  Marie received her degree in..."
bio.count("Marie")

## Regular Expressions 

Much of this demo is drawn from the [Python docs regex HOWTO](https://docs.python.org/3/howto/regex.html).

In [None]:
import re

### Methods

Let's start with the key methods used with regular expressions:
- `match`
- `search`
- `findall`
- `finditer`

In [None]:
# Determine if the RE matches at the beginning of the string.
# Note: match returns None if there is no match
re.match("Marie", bio)

In [None]:
# Determine if the RE matches at the beginning of the string.
# Here's what happens if there is a match!
re.match("The", bio)

In [None]:
# How do we actually use this?  Here's one way:
# re.match returns a match object (truthy) or None (falsy), so its result
# can be used directly as the condition of an if statement.
for test_str in ["Marie", "The", "My"]:
    if re.match(test_str, bio):
        print("The bio starts with '" + test_str + "'")
    else:
        # The opening quote was missing here, leaving the quotes unbalanced.
        print("The bio does NOT start with '" + test_str + "'")

In [None]:
# But that doesn't give us any more functionality than the "startswith" string method
# And it's harder to use, so let's find some more compelling use cases...
bio.startswith("Marie")

In [None]:
# First, there are a few more methods
# 'search' scans through a string, looking for any location where the RE matches.
# The 'span' in the result is where the first match occurs
re.search("Marie", bio)

In [None]:
# Like match, search returns None if the RE can't be found in the string
re.search("MOO", bio)

In [None]:
# Find all substrings where the RE matches, and returns them as a list.
# This is boring when the RE is just a string, but we'll see its power soon.
re.findall("Marie", bio)

In [None]:
# Find all substrings where the RE matches, and returns them as an iterator.
re.finditer("Marie", bio)

In [None]:
# We can stick the result in a loop
for result in re.finditer("Marie", bio):
    print(result)

### Metacharacters

Now that we've seen the key methods for regular expressions, let's get into the metacharacters.

`. ^ $ * + ? { } [ ] \ | ( )`

In [None]:
# The square brackets [] specify a character class, or set of characters that you wish to match. 

# Characters can be listed individually,
re.findall("[abc]", "Now I know my a, b, c's")

In [None]:
# Or a range of characters can be indicated by giving two characters and separating them by a '-'.
re.findall("[a-z]", "Now I know my a, b, c's")

In [None]:
# You can also complement a set of characters (match everything but the ones listed).
re.findall("[^abc]", "abcdefg")

In [None]:
# But the ^ character has to go first! Otherwise it's treated as a character to match
re.findall("[a^bc]", "abcdefg")

In [None]:
# The backslash is super important if you want to match a metacharacter or special sequence
# (Use a raw string, r"...", so Python hands the backslash to the regex engine
# unchanged instead of warning about an invalid string escape.)
re.findall(r"[\[]", "This will find [ all ] the [left] square brackets.")

In [None]:
# \d is for decimal digits; equivalent to [0-9]
# (Raw string, so Python doesn't try to interpret \d as a string escape.)
re.findall(r"[\d]", "If you 5 want 7 to find 8364 all the decimal 00 digits, do this!")

Notable special sequences:
- `\d`: decimal digits; equivalent to `[0-9]`
- `\D`: non-digit characters; equivalent to `[^0-9]`
- `\s`: whitespace characters; equivalent to `[ \t\n\r\f\v]`
- `\S`: non-whitespace characters; equivalent to `[^ \t\n\r\f\v]`
- `\w`: alphanumeric characters; equivalent to `[a-zA-Z0-9_]`
- `\W`: non-alphanumeric character; equivalent to `[^a-zA-Z0-9_]`

In [None]:
# To get into the real power of regular expressions, we need to identify repetition
# The * character helps us do that.
re.findall("ca*t", "I like cats.  I like caaaaats.  I like caaaaaaaaaaats.  Tract.")

In [None]:
# The * is "greedy," meaning it wants to match as long a string as possible.
# Before running this line, try to guess what it will do!  Why?
re.findall("a[bcd]*b", "abcbd")

In [None]:
# Another repeating metacharacter is +, which matches one or more times
re.findall("ca+t", "I like cats.  I like caaaaats.  I like caaaaaaaaaaats.  Tract.")

In [None]:
# ? matches either once or zero times (the character is "optional")
re.findall("pre-?assignment", "This class has a preassignment.  The pre-assignment was ungraded.") 

In [None]:
# The most complicated quantifier is {m,n}, where m and n are decimal integers
# This means there must be at least m repetitions, and at most n.
re.findall("ca{1,3}t", "ct cat caat caaat caaaat caaaaat")

There are more details about using regular expressions, but this should get you started!

See the full [Python docs regex HOWTO](https://docs.python.org/3/howto/regex.html) for more.

## Computational efficiency
Two pieces of code can achieve the exact same result in different ways.  One thing to be aware of: depending on how you write your code, it can take a long time to run!  Rule of thumb: if you find yourself "waiting" for your code to finish running, it might be worth improving the efficiency of your code.  Consider these two examples and try them out with larger N!

In [None]:
import time

# Inefficient code to find the sum of all numbers from 1 to N
def inefficient_sum(N):
    """Sum the integers 1..N one at a time and report how long it took.

    Prints the sum and the elapsed time in seconds; returns None.

    Parameters
    ----------
    N : int
        Inclusive upper bound of the range to sum. For N < 1 the loop body
        never runs and the printed sum is 0.
    """
    # perf_counter is the clock meant for timing short intervals: it is
    # monotonic and uses the highest-resolution timer available, whereas
    # time.time() can jump and may be too coarse to register fast code.
    start_time = time.perf_counter()
    total = 0
    for num in range(1, N + 1):
        total += num
    execution_time = time.perf_counter() - start_time
    print("Sum:", total)
    print("Execution Time:", execution_time)

# Efficient code to find the sum of all numbers from 1 to N 
def efficient_sum(N):
    """Compute 1 + 2 + ... + N with the closed form N(N+1)/2 and time it.

    Prints the sum and the elapsed time in seconds; returns None.

    Parameters
    ----------
    N : int
        Inclusive upper bound of the range to sum. The closed form is the
        sum of 1..N for N >= 0.
    """
    # perf_counter is the clock meant for timing short intervals; with
    # time.time() a computation this fast can show up as exactly 0.0 on
    # platforms where that clock is coarse.
    start_time = time.perf_counter()
    total = (N * (N + 1)) // 2  # integer division keeps the result an int
    execution_time = time.perf_counter() - start_time
    print("Sum:", total)
    print("Execution Time:", execution_time)

In [None]:
# Call the inefficient function with N = 10 million
N = 10000000
inefficient_sum(N)

In [None]:
# Call the efficient function with the same N (10 million)
efficient_sum(N)

## Statistics

Speaking of doing the same thing different ways...

In [None]:
# Base Python
import statistics

data = [10, 5, 8, 12, 3]

print("Minimum:", min(data))
print("Maximum:", max(data))
print("Sum:", sum(data))
print("Mean:", statistics.mean(data))
print("Median:", statistics.median(data))
print("Standard Deviation:", statistics.stdev(data))
print("Variance:", statistics.variance(data))

In [None]:
# NumPy
import numpy as np

data = np.array([10, 5, 8, 12, 3])

print("Minimum:", np.min(data))
print("Maximum:", np.max(data))
print("Sum:", np.sum(data))
print("Mean:", np.mean(data))
print("Median:", np.median(data))
# np.std and np.var default to the population formulas (ddof=0). Passing
# ddof=1 gives the sample statistics, which is what statistics.stdev /
# statistics.variance and pandas .std() / .var() compute, so the numbers
# printed here match the base-Python and pandas versions.
print("Standard Deviation:", np.std(data, ddof=1))
print("Variance:", np.var(data, ddof=1))

In [None]:
# Pandas
import pandas as pd

data = pd.Series([10, 5, 8, 12, 3])

print("Minimum:", data.min())
print("Maximum:", data.max())
print("Sum:", data.sum())
print("Mean:", data.mean())
print("Median:", data.median())
print("Standard Deviation:", data.std())
print("Variance:", data.var())

NumPy is very efficient at computing these operations, and since pandas is built on NumPy, it is too!  Just use whatever is convenient for the context you're in.