# 0. Load imports 

In [23]:
## imports
import pandas as pd
import numpy as np
import re


## print multiple things from same cell:
## with "all", the value of every bare expression in a cell is displayed
## (not only the last one); later cells rely on this to show several results
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## load data on 2020 crimes in DC
## (read directly from a remote URL, so this cell needs network access)
dc_crim_2020 = pd.read_csv("https://opendata.arcgis.com/datasets/f516e0dd7b614b088ad781b0c4002331_2.csv")

# 1. Questions: list comprehension and misc. list questions

- Confused generally
- In the class example, why did we need `course` at the beginning of the list comprehension?
- How did the `join` syntax work in the example where we pasted together offenses from the same ward?

In [17]:
## toy example

### pool of course names reused by the list comprehension examples below
all_courses = ["QSS20", "QSS17", "GOV10", "GOV4", "COSC1"]


## 1.1 Application 1: filtering to a smaller list

When we might use: have a lot of columns in a dataframe; want to filter to a smaller set using some pattern

In [18]:
### pull out ones that contain GOV in the string
### general pattern: [<value to return> for <item> in <list> if <condition>]
gov_c = [course for course in all_courses
        if "GOV" in course]
gov_c # result
print(type(gov_c)) # a list


['GOV10', 'GOV4']

<class 'list'>


In [19]:
### showing that the "course" is just a placeholder/
### arbitrary iterator name
gov_c_alt = [x for x in all_courses
        if "GOV" in x]

### note: == compares the contents of the two lists,
### so "same obj" here means "equal contents"
if gov_c == gov_c_alt:
    print("same obj")
else:
    print("diff obj")

same obj


In [20]:
### what happens if we use the same syntax
### but don't have course at the beginning?
### (deliberate error: run to see the message)
[for course in all_courses
if "GOV" in course]

### gives us error about invalid syntax
### reason is we need to tell it what to return
### for each element (the expression before "for")

SyntaxError: invalid syntax (<ipython-input-20-a2d21592ece3>, line 3)

## 1.2 Application 2: keep all objects in the list but do some transformation

In [31]:
## strip the numbers from the course names
## (no "if" here: every element is kept, but each one is transformed)
all_courses_prefixonly = [re.sub(r"\d+", "", x) # removes each run of digits; dont worry about understanding the pattern yet; will cover in regex week
                          for x in all_courses]

all_courses_prefixonly # could then find unique elements

## using original list, add dartmouth prefix to the course name
all_courses_add_dprefix = ["dartmouth_" + x
                          for x in all_courses]

all_courses_add_dprefix


['QSS', 'QSS', 'GOV', 'GOV', 'COSC']

['dartmouth_QSS20',
 'dartmouth_QSS17',
 'dartmouth_GOV10',
 'dartmouth_GOV4',
 'dartmouth_COSC1']

## 1.3 Using to help with subsetting columns

In [36]:
## print all columns in the crime report data
dc_crim_2020.columns

## use list comprehension to
## filter to columns with
## ID in the string
## (the match is case-sensitive and can occur anywhere
## in the column name, so BID is kept too)
id_cols = [col for col in 
          dc_crim_2020.columns
          if "ID" in col]
id_cols
print(type(id_cols))

## then, can filter the data
## (.copy() so that later edits to the subset don't trigger
## the copy-of-a-slice warning; see section 4.1)
dc_crim_2020_idonly = dc_crim_2020[id_cols].copy()
dc_crim_2020_idonly.head()

Index(['X', 'Y', 'CCN', 'REPORT_DAT', 'SHIFT', 'METHOD', 'OFFENSE', 'BLOCK',
       'XBLOCK', 'YBLOCK', 'WARD', 'ANC', 'DISTRICT', 'PSA',
       'NEIGHBORHOOD_CLUSTER', 'BLOCK_GROUP', 'CENSUS_TRACT',
       'VOTING_PRECINCT', 'LATITUDE', 'LONGITUDE', 'BID', 'START_DATE',
       'END_DATE', 'OBJECTID', 'OCTO_RECORD_ID'],
      dtype='object')

['BID', 'OBJECTID', 'OCTO_RECORD_ID']

<class 'list'>


Unnamed: 0,BID,OBJECTID,OCTO_RECORD_ID
0,DUPONT CIRCLE,97431273,
1,,97431275,
2,,97431276,
3,,97431278,
4,,97431285,


# 2. Questions: lambda functions

Two questions:

- General syntax (see here for a reference: https://www.w3schools.com/python/python_lambda.asp)
- How they work in the context of aggregations

How is a lambda function different from a "normal" user-defined function (one that has the syntax `def func_name(arg):` etc.)?

- Operates similarly to normal user-defined functions in that it can take any # of arguments
- Operates differently in that it's an "anonymous" function or a function that we don't explicitly name/save in memory

## 2.1 General syntax for lambda functions

In [42]:
## generalize some of the steps
## above into a two-arg function
## that takes the course prefix
## and a list of all courses
def return_relcourses(prefix: str,
                    all_courses: list):
    """Return the courses whose name contains `prefix`.

    Args:
        prefix: text to look for. It is matched anywhere in the course
            name (a substring test), not only at the start.
        all_courses: list of course names to search.

    Returns:
        A new list with the matching course names, in their original order
        (empty if nothing matches).
    """
    matching_courses = [course for course in all_courses
                        if prefix in course]
    return matching_courses

### execute on two pools of courses
socsci = ["QSS20", "QSS17", "GOV10"]
natsci = ["BIO2", "PHYS3"]

### a few applications 
### (the second call returns an empty list: no QSS courses in natsci)
return_relcourses(prefix = "QSS", all_courses = socsci)
return_relcourses(prefix = "QSS", all_courses = natsci)
return_relcourses(prefix = "BIO", all_courses = natsci)

## what's the lambda function version of this
## syntax: lambda <args>: <single expression whose value is returned>
return_rel_courses_l = lambda prefix, all_courses: [c for c in all_courses if prefix in c]

return_rel_courses_l(prefix = "BIO", all_courses = natsci)

## didn't save us that much space here, but in the context of agg a lambda
## can help us do things more quickly. in general, not super high priority

['QSS20', 'QSS17']

[]

['BIO2']

['BIO2']

## 2.2 using alongside agg

In [50]:
## use lambda to find modal block in a ward- multiple ways
## note: .mode() always returns a Series (there can be ties), but agg needs
## exactly one value per group, so we take the first (sorted) mode with .iloc[0]
## (assumes every ward has at least one non-missing BLOCK)

### way 1: subsetting agg syntax
dc_crim_2020.groupby("WARD")["BLOCK"].agg(lambda x: x.mode().iloc[0])

### way 2: dictionary agg syntax
dc_crim_2020.groupby("WARD").agg({"BLOCK": lambda x: x.mode().iloc[0]}
                                )


WARD
1           3100 - 3299 BLOCK OF 14TH STREET NW
2    1300 - 1699 BLOCK OF CONNECTICUT AVENUE NW
3      5300 - 5399 BLOCK OF WISCONSIN AVENUE NW
4          100 - 199 BLOCK OF CARROLL STREET NW
5     900 - 999 BLOCK OF RHODE ISLAND AVENUE NE
6                600 - 699 BLOCK OF H STREET NE
7         934 - 1099 BLOCK OF EASTERN AVENUE NE
8        2300 - 2399 BLOCK OF GOOD HOPE ROAD SE
Name: BLOCK, dtype: object

Unnamed: 0_level_0,BLOCK
WARD,Unnamed: 1_level_1
1,3100 - 3299 BLOCK OF 14TH STREET NW
2,1300 - 1699 BLOCK OF CONNECTICUT AVENUE NW
3,5300 - 5399 BLOCK OF WISCONSIN AVENUE NW
4,100 - 199 BLOCK OF CARROLL STREET NW
5,900 - 999 BLOCK OF RHODE ISLAND AVENUE NE
6,600 - 699 BLOCK OF H STREET NE
7,934 - 1099 BLOCK OF EASTERN AVENUE NE
8,2300 - 2399 BLOCK OF GOOD HOPE ROAD SE


#### ones with deliberate errors- run to see the error msg

In [None]:
### what happens if we try without lambda?
### deliberate error: without "lambda x:", x is evaluated right away as an
### ordinary variable (one we never defined in this notebook), so python
### doesnt know that x == BLOCK series
dc_crim_2020.groupby("WARD").agg({"BLOCK": x.mode()}
                                )


In [None]:
### what happens if we try to put mode in quotes
### like a pandas built-in method?
### deliberate error: mode isn't an attribute of a grouped series,
### so the string shortcut (used with "mean" in section 2.3) fails here
dc_crim_2020.groupby("WARD").agg({"BLOCK": "mode"}
                                )

## 2.3 breaking down agg and similarities to tidyverse

- Confused by what a grouped by object is and how we use it

Let's break down groupby and agg into two separate steps for more insight under the hood

In [76]:
## step 1: group by 1+ attributes (for R users, similar to df %>% group_by(x) in dplyr)
dc_crim_groupbyward = dc_crim_2020.groupby("WARD")

## displaying the grouped object shows only its repr, not a table of data
dc_crim_groupbyward # type is a groupby object
print(type(dc_crim_groupbyward)) 

## step 2: do something with the grouped dataframe 
### what we've covered: aggregation- here, finding
### the mean latitude of crime reports within a DC ward
## (for R users, similar to the next stage in the pipe where you summarize the grouped object,
## eg df %>% group_by(x) %>% summarise(mean_lat = mean(LATITUDE)) %>% ungroup())
dc_crim_groupbyward.agg({"LATITUDE": "mean"})

### other things (not covered yet): iterate over groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff468bc6df0>

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


Unnamed: 0_level_0,LATITUDE
WARD,Unnamed: 1_level_1
1,38.92471
2,38.905938
3,38.94155
4,38.957667
5,38.920184
6,38.893125
7,38.888703
8,38.850529


## 2.4 agg versus apply

Short answer to when to use which: whatever works is fine!
    
Longer answer: agg is especially useful for grouped dataframes (so summarizing some attribute across groups); apply for operating on the whole dataframe or select columns without grouping

# 3. Questions: np.where, np.select, isin versus str.contains

- When should we use or not use np.where? 
    - When to use: when creating new indicators (see example below of way to create without np.where)
    - When not to use: if it becomes too deeply nested; shift to np.select or map.recode
    
- When should we use isin versus str.contains?

## 3.1 np.where and str.contains versus isin

In general, fine to do whatever works; `isin` works better if you want to check membership in a list of exact values; `str.contains` works better for matching a simple regex pattern inside the string

In [59]:
## view examples of police shift
dc_crim_2020.SHIFT.value_counts()

## example of multiple equivalent ways to create t/f indicator
## for whether shift is in midnight or evening

## way 1: omit np.where and just run isin condition
dc_crim_2020['is_evening_shift'] = dc_crim_2020.SHIFT.isin(["EVENING", "MIDNIGHT"])
dc_crim_2020.is_evening_shift.value_counts()

## way 2: same as above using isin but using np.where
## reason? a bit easier to read
dc_crim_2020['is_evening_shift_alt1'] = np.where(dc_crim_2020.SHIFT.isin(["EVENING", "MIDNIGHT"]),
                                                 True, False)
dc_crim_2020.is_evening_shift_alt1.value_counts()

## way 3: np.where + str.contains rather than isin
## main difference is instead of looking in a list,
## I need to look for evening OR (|) midnight
## na=False: a missing SHIFT counts as "no match" (same as isin); without it,
## str.contains returns NaN for missing values and np.where treats NaN as True,
## so way 3 would disagree with ways 1 and 2 on rows with a missing SHIFT
dc_crim_2020['is_evening_shift_alt2'] = np.where(dc_crim_2020.SHIFT.str.contains("EVENING|MIDNIGHT", na=False),
                                                 True, False)
dc_crim_2020.is_evening_shift_alt2.value_counts()

EVENING     12315
DAY         10092
MIDNIGHT     5508
Name: SHIFT, dtype: int64

True     17823
False    10092
Name: is_evening_shift, dtype: int64

True     17823
False    10092
Name: is_evening_shift_alt1, dtype: int64

True     17823
False    10092
Name: is_evening_shift_alt2, dtype: int64

#### deliberate errors

run to see error

In [None]:
## what happens if we combine elements of str.contains with elements of isin
## throws error bc of incorrect syntax within str.contains (needs a single pattern
## as an arg rather than a list)
## (to reuse a list, join it into one pattern first,
## e.g. "|".join(["EVENING", "MIDNIGHT"]) gives "EVENING|MIDNIGHT")
np.where(dc_crim_2020.SHIFT.str.contains(["EVENING", "MIDNIGHT"]),
                            True, False)

## 3.2 np.select

Would run through example in the activity (creating the `offense_summary` variable)
and ask any specific q's on Slack

# 4: misc (why use copy? where does + sign on slide 28 come from when filtering columns, subsetting syntax)

## 4.1 .copy() + subsetting rows

- When to use? When subsetting rows or cols in a df and assigning them to a new object
- When not to use? Generally, I almost always use it; but if you DON'T use it, you'll just get a warning rather than an error

In [None]:
### deliberate issue

In [90]:
## example: subsetting to ward 4 without copy
### subset
ward4_nocopy = dc_crim_2020[dc_crim_2020.WARD == 4]

### get copy warning (SettingWithCopyWarning) when i do something else
### to the ward4 df, here adding a column; it is a warning rather
### than an error, so the cell still finishes running
ward4_nocopy["is_day"] = ward4_nocopy.SHIFT == "DAY"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ward4_nocopy["is_day"] = ward4_nocopy.SHIFT == "DAY"


In [91]:
### redoing but with copy 
ward4_withcopy = dc_crim_2020[dc_crim_2020.WARD == 4].copy()

### no copy warning this time when i do something else
### to the ward4 df, because it was created with .copy()
ward4_withcopy["is_day"] = ward4_withcopy.SHIFT == "DAY"

### yay no warning :) 

## 4.2 subsetting data to specific columns- why the "+" in slide 28 syntax? + what did the last slide do


In [95]:
## let's create a fake dataset with two years worth of gpa
## and units: one row per student, with one gpa column and
## one ncourses column for each year
student_data = pd.DataFrame(
    {
        "student_id": [f"id{i}" for i in range(1, 5)],  # id1 through id4, built from a sequence
        "gpa_2020": [3.4, 3.6, 3.1, 2.9],
        "ncourses_2020": [8, 8, 8, 7],
        "gpa_2021": [3.8, 3.9, 3.9, 3.5],
        "ncourses_2021": [8, 9, 9, 5],
    }
)

student_data


Unnamed: 0,student_id,gpa_2020,ncourses_2020,gpa_2021,ncourses_2021
0,id1,3.4,8,3.8,8
1,id2,3.6,8,3.9,9
2,id3,3.1,8,3.9,9
3,id4,2.9,7,3.5,5


Unnamed: 0,student_id,gpa_2021,ncourses_2021
0,id1,3.8,8
1,id2,3.9,9
2,id3,3.9,9
3,id4,3.5,5


In [96]:
## long way to get just their 2021 grades and units:
## type out, by hand, every column name we want to keep
cols_to_keep = ["student_id", "gpa_2021", "ncourses_2021"]
student_data_21 = student_data.loc[:, cols_to_keep].copy()
student_data_21


Unnamed: 0,student_id,gpa_2021,ncourses_2021
0,id1,3.8,8
1,id2,3.9,9
2,id3,3.9,9
3,id4,3.5,5


In [99]:
## can make more efficient by using the fact that 
## 2021 is in both cols
info_2021 = [col for col in student_data.columns
            if "2021" in col]
info_2021
print(type(info_2021))

## then, to subset to those cols + student_id,
## I can add the lists together 
## to make a list with both (+ between two lists
## concatenates them into one new list; multiple 
## ways to do this)
info_id_2021 = ["student_id"] + info_2021
info_id_2021
print(type(info_id_2021))

## code on slide is just doing above in one step:
## the + joins ["student_id"] with the list comprehension
## result, inside the subsetting brackets
student_data_21_alt = student_data[["student_id"] +
                                [col for col in student_data.columns
                                if "2021" in col]].copy()
student_data_21_alt

['gpa_2021', 'ncourses_2021']

<class 'list'>


['student_id', 'gpa_2021', 'ncourses_2021']

<class 'list'>


Unnamed: 0,student_id,gpa_2021,ncourses_2021
0,id1,3.8,8
1,id2,3.9,9
2,id3,3.9,9
3,id4,3.5,5
