In [8]:
#####################################
#efficient python               #####
#Pythonic code == efficient code ####
#zen of Python 19 idioms   PEP20 ####
#import this                     ####
#####################################
#looping over a list.

names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']

#collect the names in the list that have six letters or more.
# Print the list created using the Non-Pythonic approach
i = 0
new_list= []
while i < len(names):
    if len(names[i]) >= 6:
        new_list.append(names[i])
    i += 1
print(new_list)    

#A more Pythonic approach would loop over the contents of names
better_list = []
for name in names:
    if len(name) >= 6:
        better_list.append(name)
print(better_list)

#The best Pythonic way is by using list comprehension
#List comprehension offers a shorter syntax when you want to create a new list 
#based on the values of an existing list.
#newlist = [expression returned for item in iterable if condition == True]
best_list = [name for name in names if len(name) >= 6]
print(best_list)


['Kramer', 'Elaine', 'George', 'Newman']
['Kramer', 'Elaine', 'George', 'Newman']


In [11]:
#####################################
## Built-in practice: range()    ####
#####################################
  
# Create a range object that goes from 0 to 5
nums = range(6)
print(type(nums))

# Convert nums to a list
nums_list = list(nums)
print(nums_list)

# Create a new list of odd numbers from 1 to 11 by unpacking a range object
nums_list2 = [*range(1,12,2)]
print(nums_list2)
    
#You can convert the _range object_ into a list by using the list() function 
#or by unpacking it into a list using the star character (*)


<class 'range'>
[0, 1, 2, 3, 4, 5]
[1, 3, 5, 7, 9, 11]


In [13]:
#####################################
#Built-in practice: enumerate()   ###
#####################################
#you have a list of people that arrived at a party you are hosting. 
#The list is ordered by arrival (Jerry was the first to arrive, followed by Kramer, etc.):
#if you wanted to attach an index representing a person's arrival order

names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']

#you can use a for loop: Not efficient
indexed_names = []
for i in range(len(names)):
    index_name = (i, names[i])
    indexed_names.append(index_name)
print(indexed_names)

##### Rewrite the for loop to use enumerate
############################################
indexed_names = []
for i,name in enumerate(names):
    index_name = (i,name)
    indexed_names.append(index_name) 
print(indexed_names)

# Rewrite the above for loop using list comprehension
indexed_names_comp = [(i,name) for i,name in enumerate(names)]
print(indexed_names_comp)

# Unpack an enumerate object with a starting index of one
indexed_names_unpack = [*enumerate(names,1)] ####start index at 1
print(indexed_names_unpack)


[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(1, 'Jerry'), (2, 'Kramer'), (3, 'Elaine'), (4, 'George'), (5, 'Newman')]


In [16]:
#####################################
#Built-in practice: map()         ###
#####################################
#create a new list called names_uppercase that converted all the letters in each name to uppercase. 
names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']

#you can use a for loop: Not efficient
names_uppercase = []
for name in names:
    names_uppercase.append(name.upper())
print(names_uppercase)

# Use map to apply str.upper to each element in names
#########################################################
names_map  = map(str.upper, names)

# Print the type of the names_map
print(type(names_map))

# Unpack names_map into a list
names_uppercase = [*names_map]

# Print the list created above
print(names_uppercase)



['JERRY', 'KRAMER', 'ELAINE', 'GEORGE', 'NEWMAN']
<class 'map'>
['JERRY', 'KRAMER', 'ELAINE', 'GEORGE', 'NEWMAN']


In [19]:
#####################################
##Practice with NumPy arrays   #####
#####################################
#broadcasting refers to a numpy array's ability to vectorize operations, 
#so they are performed on all elements of an object at once.

import numpy as np
nums=np.array([[ 1,  2,  3,  4,  5],
               [ 6,  7,  8,  9, 10]])
print(nums)

# Print second row of nums
print(nums[1,:])

# Print all elements of nums that are greater than six
print(nums[nums > 6])

# Double every element of nums
nums_dbl = nums * 2
print(nums_dbl)

# Replace the third column of nums
nums[:,2] = nums[:,2] + 1
print(nums)


[[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
[ 6  7  8  9 10]
[ 7  8  9 10]
[[ 2  4  6  8 10]
 [12 14 16 18 20]]
[[ 1  2  4  4  5]
 [ 6  7  9  9 10]]


In [49]:
#####################################
##Bringing it all together        ###
#####################################
#Each guest has decided to show up to the party in 10-minute increments.
#welcome each of the guests and let them know how many minutes late they are to the party.

names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']
# Create a list of arrival times by unpacking the range object
arrival_times = [*range(10, 60, 10)] 
print(arrival_times)

#You realize your clock is three minutes fast
# Convert arrival_times to an array and update the times
arrival_times_np = np.array(arrival_times)
new_times = arrival_times_np - 3
print(new_times)

# Use list comprehension and enumerate to pair guests to new times
guest_arrivals = [(names[index],time) for index,time in enumerate(new_times)]

print(guest_arrivals, '\n')

#Function welcome_guest
#########################
def welcome_guest(guest_arrivals):
    guest_and_time = tuple(guest_arrivals)
    for i in range(len(guest_and_time)):
        welcome_string = "Welcome to Festivus " + str(guest_and_time[i]) + "... You\'re " + str(guest_and_time[i]) + " min late."
    return  welcome_string 

# Map the welcome_guest function to each (guest,time) pair
welcome_map = map(welcome_guest, guest_arrivals)

guest_welcomes = [*welcome_map]
print(*guest_welcomes, sep='\n')



[10, 20, 30, 40, 50]
[ 7 17 27 37 47]
[('Jerry', 7), ('Kramer', 17), ('Elaine', 27), ('George', 37), ('Newman', 47)] 

Welcome to Festivus 7... You're 7 min late.
Welcome to Festivus 17... You're 17 min late.
Welcome to Festivus 27... You're 27 min late.
Welcome to Festivus 37... You're 37 min late.
Welcome to Festivus 47... You're 47 min late.


In [7]:
#####################################
##Using %timeit                   ###
#####################################
#create a list of integers from 0 to 50 using the range() function. 
#is using list comprehension or unpacking the range object into a list faster. 
#Let's use %timeit to find the best implementation.

# Create a list of integers (0-50) using list comprehension
nums_list_comp = [num for num in range(51)]
print(nums_list_comp)

# Create a list of integers (0-50) by unpacking range
nums_unpack = [*range(51)]
print(nums_unpack)

#########################################################
##Using %timeit: specifying number of runs and loops  ###
#########################################################
%timeit -r5 -n25 [num for num in range(51)]
%timeit -r5 -n25 [*range(51)]

#Although list comprehension is a useful and powerful tool, 
#sometimes unpacking an object can save time and looks a little cleaner.


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
1.12 µs ± 16.7 ns per loop (mean ± std. dev. of 5 runs, 25 loops each)
353 ns ± 9.97 ns per loop (mean ± std. dev. of 5 runs, 25 loops each)


In [10]:
#########################################################
##Using %timeit: formal name or literal syntax  ###
#########################################################
#create data structures using either a formal name or a literal syntax. 
#explore how using a literal syntax for creating a data structure can speed up runtimes.

# Create a list using the formal name
formal_list = list()
print(formal_list)

# Create a list using the literal syntax
literal_list = []
print(literal_list)

# Print out the type of formal_list
print(type(formal_list))

# Print out the type of literal_list
print(type(literal_list))


%timeit list()
%timeit []

#Using Python's literal syntax to define a data structure can speed up your runtime. 
#Consider using the literal syntaxes (like [] instead of list(), {} instead of dict(), or () instead of tuple()), 
#to gain some speed.


[]
[]
<class 'list'>
<class 'list'>
49.8 ns ± 0.481 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
15.3 ns ± 0.0846 ns per loop (mean ± std. dev. of 7 runs, 100,000,000 loops each)


In [19]:
%%timeit hero_wts_lbs = [] 
wts = [441.0, 65.0, 90.0, 441.0, 122.0, 88.0, 61.0, 81.0, 104.0, 108.0]
for wt in wts: 
    hero_wts_lbs.append(wt * 2.20462)

# convert these weights into pounds.

##############################################################
##Using %%timeit | keep it as the first line in your code  ###
##############################################################

#%%timeit wts_np = np.array(wts)
#hero_wts_lbs_np = wts_np * 2.20462   


#use %%timeit (_cell magic mode_) to time multiple lines of code. 
#Converting the wts list into a NumPy array and taking advantage of NumPy array broadcasting saved some time.
#Moving forward, remember that you can use %timeit to gather runtime for a single line of code (_line magic mode_)
#and %%timeit to get the runtime for multiple lines of code.



692 ns ± 9.32 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [64]:
##########################################
##Code Profiling | Profile a function  ###
##########################################
import numpy as np

heroes = ['Batman', 'Superman', 'Wonder Woman']
hts = np.array([188.0, 191.0, 183.0])
wts = np.array([95.0, 101.0, 71.0])

def convert_units(heroes, heights, weights):

    new_hts = [ht * 0.39370  for ht in heights]
    new_wts = [wt * 2.20462  for wt in weights]

    hero_data = {}

    for i,hero in enumerate(heroes):
        hero_data[hero] = (new_hts[i], new_wts[i])

    return hero_data

convert_units(heroes, hts, wts)

######################################
##Estimated runtime of this function   
######################################
%timeit convert_units(heroes, hts, wts)
#gives the total execution time
#how long each line took to run?
#we can use %timeit on each line
#instead profile the function with the line_profiler package

#!pip install line_profiler

#spot bottlenecks: When you see certain lines of code taking up the majority of the function's runtime, 
#it is an indication that you may want to deploy a different, more efficient technique.

#%reload_ext line_profiler #to reinstall the profiler
%lprun -f convert_units convert_units(heroes, hts, wts)

#The percentage of time spent on the new_hts list comprehension line of code 
#relative to the total amount of time spent in the convert_units() function is about 50%

#This seems like it may be a potential bottleneck in the function. 
#explore a possible upgrade to make this more efficient.



2.05 µs ± 13.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [65]:
##################################
##  %lprun: fix the bottleneck ###
##################################
#new_hts and new_wts account for high percentages of the runtime
#This is an indication to create the new_hts and new_wts objects using a different technique.
#use array broadcasting rather than list comprehension to convert the heights and weights.

def convert_units_broadcast(heroes, heights, weights):
    # Array broadcasting instead of list comprehension
    new_hts = heights * 0.39370
    new_wts = weights * 2.20462

    hero_data = {}

    for i,hero in enumerate(heroes):
        hero_data[hero] = (new_hts[i], new_wts[i])

    return hero_data

#Load the line_profiler package into your IPython session. 
#Then, use %lprun to profile the convert_units_broadcast() function acting on your superheroes data.

#!pip install line_profiler
#to reinstall the profiler
%reload_ext line_profiler   
%lprun -f convert_units_broadcast convert_units_broadcast(heroes, hts, wts)

#By profiling the convert_units() function,
#you saw that using list comprehension was not the most efficient solution 
#for creating the new_hts and new_wts objects.
#You also saw that using array broadcasting in the convert_units_broadcast() function 
#dramatically decreased the percentage of time spent executing these lines of code. 
#You may have noticed that your function still takes a while to iterate through the for loop. 




In [None]:
###################################
####Code Profiling for memory usage
####################################

#import file.py that containt the function 
#from file.py import functionname
#%load_ext memory_file
#%mlrun -f memory_file memory_file(a, b, c)



In [74]:
##############################################
####Combining counting iterating over objects
####Efficient code
##############################################

names = ['Bulbasur','Charmander','Squirtle']
hps = [45, 39, 44]

########################################
###combining with loop
########################################
combined = []

for i,pokemon in enumerate(names):
    combined.append((pokemon, hps[i]))

print(combined)

########################################
###zip more efficient for combining 
########################################
combined_zip = zip(names, hps)
combined_zip_list = [*combined_zip]

print(combined_zip_list)


#######################
###Counting with loop 
#######################
poke_types = ['Grass', 'Dark', 'Fire', 'Grass', 'Dark', 'Fire', 'Grass', 'Dark', 'Fire']
#Standars dictionary approach
type_counts = {}
for poke_type in poke_types:
    if poke_type not in type_counts:
        type_counts[poke_type] = 1
    else:
        type_counts[poke_type] +=1

print(type_counts)

#######################
###Counting with Counter
###More efficient (50% less!)
#######################
from collections import Counter
type_counts_c = Counter(poke_types)

print(type_counts_c)


[('Bulbasur', 45), ('Charmander', 39), ('Squirtle', 44)]
[('Bulbasur', 45), ('Charmander', 39), ('Squirtle', 44)]
{'Grass': 3, 'Dark': 3, 'Fire': 3}
Counter({'Grass': 3, 'Dark': 3, 'Fire': 3})


In [78]:
#############################################################################
####Combinations with loop (where order does not matters)
####and the combinations genarators from itertools for more efficient code
#############################################################################

poke_types = ['Bug','Fire','Ghost','Grass','Water']
combos = []

for x in poke_types:
    for y in poke_types:
        if x == y:
            continue
        if ((x,y) not in combos) & ((y,x) not in combos):
            combos.append((x,y))
print(combos)

####the combinations genarators from itertools
from itertools import combinations
combos_obj = combinations(poke_types, 2)
combos = [*combos_obj]
print(combos)

#if comparing runtimes, using combinations is significantly faster than the nested loop


[('Bug', 'Fire'), ('Bug', 'Ghost'), ('Bug', 'Grass'), ('Bug', 'Water'), ('Fire', 'Ghost'), ('Fire', 'Grass'), ('Fire', 'Water'), ('Ghost', 'Grass'), ('Ghost', 'Water'), ('Grass', 'Water')]
[('Bug', 'Fire'), ('Bug', 'Ghost'), ('Bug', 'Grass'), ('Bug', 'Water'), ('Fire', 'Ghost'), ('Fire', 'Grass'), ('Fire', 'Water'), ('Ghost', 'Grass'), ('Ghost', 'Water'), ('Grass', 'Water')]


In [88]:
########################################
###Combining Pokémon names and types
########################################
from numpy import nan
names = ['Abomasnow', 'Abra', 'Absol', 'Accelgor', 'Aerodactyl']
primary_types = ['Grass', 'Psychic', 'Dark', 'Bug', 'Rock']
secondary_types = ['Ice', np.nan, np.nan, np.nan, 'Flying']

#combine each Pokémon's name and primary types with zip
names_type1 = [*zip(names, primary_types)]

print(*names_type1, sep='\n')
print('\n')
# Combine all three lists together
names_types = [*zip(names,primary_types,secondary_types)]

print(*names_types[:5], sep='\n')
print('\n')
# Combine five items from names and three items from primary_types
differing_lengths = [*zip(names[:5], primary_types[:3])]

print(*differing_lengths, sep='\n')
#it will only combine until the smallest lengthed object is exhausted.


('Abomasnow', 'Grass')
('Abra', 'Psychic')
('Absol', 'Dark')
('Accelgor', 'Bug')
('Aerodactyl', 'Rock')


('Abomasnow', 'Grass', 'Ice')
('Abra', 'Psychic', nan)
('Absol', 'Dark', nan)
('Accelgor', 'Bug', nan)
('Aerodactyl', 'Rock', 'Flying')


('Abomasnow', 'Grass')
('Abra', 'Psychic')
('Absol', 'Dark')


In [92]:
########################################
###Counting Pokémon from a sample
########################################
from collections import Counter

names = ['Abomasnow', 'Abra', 'Absol', 'Accelgor', 'Aerodactyl']
primary_types = ['Grass', 'Psychic', 'Dark', 'Bug', 'Rock']
generations = [1, 1, 1, 5, 3]

# Collect the count of primary types
type_count = Counter(primary_types)
print(type_count, '\n')

# Collect the count of generations
gen_count = Counter(generations)
print(gen_count, '\n')

# Use list comprehension to get each Pokémon's starting letter
starting_letters = [name[0] for name in names]
print(starting_letters)

# Use list comprehension to get each Pokémon's starting letter
starting_letters = [name[0] for name in names]

# Collect the count of Pokémon for each starting_letter
starting_letters_count = Counter(starting_letters)
print(starting_letters_count)


Counter({'Grass': 1, 'Psychic': 1, 'Dark': 1, 'Bug': 1, 'Rock': 1}) 

Counter({1: 3, 5: 1, 3: 1}) 

['A', 'A', 'A', 'A', 'A']
Counter({'A': 5})


In [94]:
########################################
###Combinations of Pokémon
########################################
pokemon = ['Geodude', 'Cubone', 'Lickitung', 'Persian', 'Diglett']

#we try to catch some of these Pokémon, but the Pokédex can only store two Pokémon at a time
#what the possible pairs of Pokémon are that we could catch.

# Import combinations from itertools
from itertools import combinations

# Create a combination object with pairs of Pokémon
combos_obj = combinations(pokemon, 2)
print(type(combos_obj), '\n')

# Convert combos_obj to a list by unpacking
combos_2 = [*combos_obj]
print(combos_2, '\n')

# Collect all possible combinations of 4 Pokémon directly into a list
combos_4 = [*combinations(pokemon, 4)]
print(combos_4)


<class 'itertools.combinations'> 

[('Geodude', 'Cubone'), ('Geodude', 'Lickitung'), ('Geodude', 'Persian'), ('Geodude', 'Diglett'), ('Cubone', 'Lickitung'), ('Cubone', 'Persian'), ('Cubone', 'Diglett'), ('Lickitung', 'Persian'), ('Lickitung', 'Diglett'), ('Persian', 'Diglett')] 

[('Geodude', 'Cubone', 'Lickitung', 'Persian'), ('Geodude', 'Cubone', 'Lickitung', 'Diglett'), ('Geodude', 'Cubone', 'Persian', 'Diglett'), ('Geodude', 'Lickitung', 'Persian', 'Diglett'), ('Cubone', 'Lickitung', 'Persian', 'Diglett')]


In [106]:
########################################
###Comparing ojects 
########################################
#we can use a nested for loop
list_a = ['Bulbasaur','Charmander','Squirtle']
list_b = ['Caterpie','Pidgey','Squirtle']
in_common = []
for pokemon_a in list_a:
    for pokemon_b in list_b:
        if pokemon_a == pokemon_b:
            in_common.append(pokemon_a)
print(in_common)
#but iterating over each item in both lists is extremely inefficient

#instead use Python's set data type to compare list
set_a = set(list_a)
set_b = set(list_b)
print(set_a, '\n', set_b)

###set method intersection
set_a.intersection(set_b)

###set method difference
#element in a but not b
set_a.difference(set_b)
#element in b but not a
set_b.difference(set_a)

#pokemon that exist in exactly one of the sets but not both
set_a.symmetric_difference(set_b)

#collect all unique Pokemon in either or both sets
set_a.union(set_b)

#Membership testing with sets
#check if a pokemon is in a set, very fast
'Squirtle' in set_a

###Uniques with sets
primary_types = ['Grass', "Psychic", 'Dark', 'Bug']

#Collect the unique Pokemon types within the list
unique_types = []
for prim_type in primary_types:
    if prim_type not in unique_types:
        unique_types.append(prim_type)
        
print('\n', unique_types)

## with a set much easier and faster
unique_types_sets = set(primary_types)
print(unique_types_sets)

['Squirtle']
{'Squirtle', 'Bulbasaur', 'Charmander'} 
 {'Squirtle', 'Pidgey', 'Caterpie'}

 ['Grass', 'Psychic', 'Dark', 'Bug']
{'Psychic', 'Dark', 'Bug', 'Grass'}


In [111]:
####################################################
#Comparing Pokédexes
#what Pokémon they have in common 
#what Pokémon one has that the other does not.
#####################################################
ash_pokedex = ['Pikachu', 'Bulbasaur', 'Koffing', 'Spearow', 'Vulpix', 'Wigglytuff', 'Zubat', 'Rattata', 'Psyduck', 'Squirtle'] 
misty_pokedex = ['Krabby', 'Horsea', 'Slowbro', 'Tentacool', 'Vaporeon', 'Magikarp', 'Poliwag', 'Starmie', 'Psyduck', 'Squirtle']

# Convert both lists to sets
ash_set = set(ash_pokedex)
misty_set = set(misty_pokedex)

# Find the Pokémon that exist in both sets
both = ash_set.intersection(misty_set)
print(both)

# Find the Pokémon that Ash has and Misty does not have
ash_only = ash_set.difference(misty_set)
print(ash_only)

# Find the Pokémon that are in only one set (not both)
unique_to_set = ash_set.symmetric_difference(misty_set)
print(unique_to_set)

#sets lets us do some comparisons between objects without the need to write a for loop.


##########################################################################
#Searching for Pokémon
#see if certain Pokémon are members of either Ash or Brock's Pokédex.
###########################################################################
brock_pokedex = ['Onix','Geodude', 'Zubat', 'Golem', 'Vulpix', 'Tauros', 'Kabutops', 'Omastar', 'Machop', 'Dugtrio']
# Convert Brock's Pokédex to a set
brock_pokedex_set = set(brock_pokedex)
print('\n',brock_pokedex_set)

# Check if Psyduck is in Ash's list and Brock's set
print('Psyduck' in ash_pokedex)
print('Psyduck' in brock_pokedex_set)

# Check if Machop is in Ash's list and Brock's set
print('Machop' in ash_pokedex)
print('Machop' in brock_pokedex_set)

%timeit 'Psyduck' in ash_pokedex
%timeit 'Psyduck' in brock_pokedex_set
%timeit 'Machop' in ash_pokedex
%timeit 'Machop' in brock_pokedex_set
 #Membership testing is much faster when you use sets.

{'Squirtle', 'Psyduck'}
{'Vulpix', 'Zubat', 'Rattata', 'Spearow', 'Koffing', 'Pikachu', 'Wigglytuff', 'Bulbasaur'}
{'Vulpix', 'Bulbasaur', 'Starmie', 'Horsea', 'Zubat', 'Rattata', 'Spearow', 'Vaporeon', 'Koffing', 'Pikachu', 'Magikarp', 'Slowbro', 'Krabby', 'Wigglytuff', 'Poliwag', 'Tentacool'}

 {'Vulpix', 'Onix', 'Golem', 'Dugtrio', 'Geodude', 'Zubat', 'Machop', 'Kabutops', 'Omastar', 'Tauros'}
True
False
False
True
97.3 ns ± 0.408 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
27.1 ns ± 0.216 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
102 ns ± 0.459 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
24.8 ns ± 0.246 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [118]:
####################################################
#Uniques values
####################################################
names = ['Forretress', 'WormadamSandy Cloak', 'Croagunk', 'Mime Jr.', 'Camerupt']
primary_types = ['Bug', 'Bug', 'Poison', 'Psychic', 'Fire']
generations = [2, 4, 4, 4, 3]

#gather unique values from each list
def find_unique_items(data):
    uniques = []

    for item in data:
        if item not in uniques:
            uniques.append(item)

    return uniques

# Use the provided function to collect unique Pokémon names
uniq_names_func = find_unique_items(names)
print(len(uniq_names_func))

# Convert the names list to a set to collect unique Pokémon names
uniq_names_set  = set(names)
print(len(uniq_names_set))

# Check that both unique collections are equivalent
print(sorted(uniq_names_func) == sorted(uniq_names_set))

# Use the best approach to collect unique primary types and generations
uniq_types = set(primary_types) 
uniq_gens = set(generations)
print(uniq_types, uniq_gens, sep='\n') 


%timeit find_unique_items(names)
%timeit set(names)


#Using a set data type to collect unique values is much faster than using a for loop 
#Since a set is defined as a collection of distinct elements, 
#it is an efficient way to collect unique items from an existing object. 


5
5
True
{'Poison', 'Psychic', 'Bug', 'Fire'}
{2, 3, 4}
441 ns ± 6.2 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
155 ns ± 0.842 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [119]:
####################################################
#Elinimating loop
####################################################

#List for each 3 Pokemons : Health Point, Attack, Defense, Speed
poke_stats = [[90,92,75,60], [25,20,15,90], [65,130,60,75]]

#Sum the total stats for each 3 pokemon
#1with a for loop approach
totals = []
for row in poke_stats:
    totals.append(sum(row))
#2with a list comprehension
totals_comp = [sum(row) for row in poke_stats]
#3with a built-in map() function
totals_map = [*map(sum, poke_stats)]

#%timeit shows that list comp or map() are faster with one line of code
 
import numpy as np    
poke_stats = np.array(poke_stats)
#collect the average stat value for each pokemon (or row) in the array
#1with a for loop approach
avgs = []
for row in poke_stats:
    avg = np.mean(row)
    avgs.append(avg)
print(avgs)
#NumPy arrays can perform calculations on entire arrays all at once
avgs_np = poke_stats.mean(axis=1)
print(avgs_np)

[79.25, 37.5, 82.5]
[79.25 37.5  82.5 ]


In [120]:
####################################################
#Elinimating loop | Gathering Pokémon without a loop
####################################################

poke_names = ['Abomasnow', 'Abra', 'Absol', 'Accelgor', 'Aerodactyl']
poke_gens = [4, 1, 2, 5, 1]

#filter the Pokémon that belong to generation one or two, 
#and collect the number of letters in each Pokémon's name

gen1_gen2_name_lengths_loop = []

for name,gen in zip(poke_names, poke_gens):
    if gen < 3:
        name_length = len(name)
        poke_tuple = (name, name_length)
        gen1_gen2_name_lengths_loop.append(poke_tuple)

####################################################################################
#Eliminate the above for loop using list comprehension and the map() function:   
####################################################################################
# Collect Pokémon that belong to generation 1 or generation 2
gen1_gen2_pokemon = [name for name,gen in zip(poke_names, poke_gens) if gen < 3]
# Create a map object that stores the name lengths
name_lengths_map = map(len, gen1_gen2_pokemon)
# Combine gen1_gen2_pokemon and name_lengths_map into a list
gen1_gen2_name_lengths = [*zip(gen1_gen2_pokemon, name_lengths_map)]


print(gen1_gen2_name_lengths_loop[:5])
print(gen1_gen2_name_lengths[:5])

#using list comprehension and the map() function eliminate a for loop, and runtimes is faster

#Even better : replace the entire for loop with one list comprehension: 
[(name, len(name)) for name,gen in zip(poke_names, poke_gens) if gen < 3]
    


[('Abra', 4), ('Absol', 5), ('Aerodactyl', 10)]
[('Abra', 4), ('Absol', 5), ('Aerodactyl', 10)]


In [125]:
################################################################
#Elinimating loop | Pokémon totals and averages without a loop
################################################################
import numpy as np
names = ['Abomasnow', 'Abra', 'Absol', 'Accelgor', 'Aerodactyl']
stats = np.array([[ 90,  92,  75,  92,  85,  60],
                 [ 25,  20,  15, 105,  55,  90],
                 [ 65, 130,  60,  75,  60,  75],
                 [ 40,  65,  40,  80,  40,  65],
                 [ 40,  45,  35,  30,  40,  55]])

#gather each Pokémon's total stat value (i.e., the sum of each row in stats) 
#and each Pokémon's average stat value (i.e., the mean of each row in stats) 
#to find the strongest Pokémon.

poke_list = []

for pokemon,row in zip(names, stats):
    total_stats = np.sum(row)
    avg_stats = np.mean(row)
    poke_list.append((pokemon, total_stats, avg_stats))

###Better ###
# Create a total stats array
total_stats_np = stats.sum(axis=1)
# Create an average stats array
avg_stats_np = stats.mean(axis=1)
# Combine names, total_stats_np, and avg_stats_np into a list
poke_list_np = [*zip(names, total_stats_np, avg_stats_np)]


print(poke_list_np == poke_list, '\n')
print(poke_list_np[:3])
print(poke_list[:3], '\n')

top_3 = sorted(poke_list_np, key=lambda x: x[1], reverse=True)[:3]
print('3 strongest Pokémon:\n{}'.format(top_3))    

#using NumPy's .sum() and .mean() methods with a specific axis to eliminate a for loop is much more efficient



True 

[('Abomasnow', 494, 82.33333333333333), ('Abra', 310, 51.666666666666664), ('Absol', 465, 77.5)]
[('Abomasnow', 494, 82.33333333333333), ('Abra', 310, 51.666666666666664), ('Absol', 465, 77.5)] 

3 strongest Pokémon:
[('Abomasnow', 494, 82.33333333333333), ('Absol', 465, 77.5), ('Accelgor', 330, 55.0)]


In [127]:
################################################################
#Writting better loop | Moving calculations above a loop
################################################################
import numpy as np

names = ['Absol', 'Aron', 'Jynx', 'Natu', 'Onix']
attacks = np.array([130, 70, 50, 50, 45])

#print the names of each Pokémon with an attack value greater than the average of all attack values.
total_attack_avg = attacks.mean() 
for pokemon, attack in zip(names, attacks):
    if attack > total_attack_avg:
        print( "{}'s attack: {} > average: {}!"
              .format(pokemon, attack, total_attack_avg) )
        

Absol's attack: 130 > average: 69.0!
Aron's attack: 70 > average: 69.0!


In [132]:
#############################################################################
#Writting better loop | use holistic conversions outside (or below) the loop. 
#############################################################################
names = ['Pikachu', 'Squirtle', 'Articuno']
legend_status = [False, False, True]
generations = [1, 1, 1]

#combine these objects so that each name, status, and generation is stored in an individual list. 
#a list of list
poke_data_tuples = []
for poke_tuple in zip(names, legend_status, generations):
    poke_data_tuples.append(poke_tuple)

poke_data = [*map(list, poke_data_tuples)]
print(poke_data)

[['Pikachu', False, 1], ['Squirtle', False, 1], ['Articuno', True, 1]]


In [133]:
#####################################################
#Writting better loop | One-time calculation loop
#####################################################
generations = [2, 4, 4, 4, 3]

#counts of each generation 
#and determine what percentage each generation accounts for out of the total count of integers.

# Import Counter
from collections import Counter

# Collect the count of each generation
gen_counts = Counter(generations)

# Improve for loop by moving one calculation above the loop
total_count = len(generations)

for gen,count in gen_counts.items():
    gen_percent = round(count / total_count * 100, 2)
    print('generation {}: count = {:3} percentage = {}'
          .format(gen, count, gen_percent))
    

generation 2: count =   1 percentage = 20.0
generation 4: count =   3 percentage = 60.0
generation 3: count =   1 percentage = 20.0


In [134]:
#####################################################
#Writting better loop | Holistic conversion loop
#####################################################
pokemon_types = ['Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire', 'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison', 'Psychic', 'Rock', 'Steel', 'Water']

#gather all the possible pairs of Pokémon types. 
#store each of these pairs in an individual list with an enumerated index as the first element of each list, 
#to see the total number of possible pairs and provide an indexed label for each pair.

# Collect all possible pairs using combinations()
possible_pairs = [*combinations(pokemon_types, 2)]

# Create an empty list called enumerated_tuples
enumerated_tuples = []

# Append each enumerated_pair_tuple to the empty list above
for i,pair in enumerate(possible_pairs, 1):
    enumerated_pair_tuple = (i,) + pair
    enumerated_tuples.append(enumerated_pair_tuple)

# Convert all tuples in enumerated_tuples to a list
enumerated_pairs = [*map(list, enumerated_tuples)]
print(enumerated_pairs)

#Rather than converting each tuple to a list _within_ the loop, 
#you used the map() function to convert tuples to lists all at once outside of a loop. 



[[1, 'Bug', 'Dark'], [2, 'Bug', 'Dragon'], [3, 'Bug', 'Electric'], [4, 'Bug', 'Fairy'], [5, 'Bug', 'Fighting'], [6, 'Bug', 'Fire'], [7, 'Bug', 'Flying'], [8, 'Bug', 'Ghost'], [9, 'Bug', 'Grass'], [10, 'Bug', 'Ground'], [11, 'Bug', 'Ice'], [12, 'Bug', 'Normal'], [13, 'Bug', 'Poison'], [14, 'Bug', 'Psychic'], [15, 'Bug', 'Rock'], [16, 'Bug', 'Steel'], [17, 'Bug', 'Water'], [18, 'Dark', 'Dragon'], [19, 'Dark', 'Electric'], [20, 'Dark', 'Fairy'], [21, 'Dark', 'Fighting'], [22, 'Dark', 'Fire'], [23, 'Dark', 'Flying'], [24, 'Dark', 'Ghost'], [25, 'Dark', 'Grass'], [26, 'Dark', 'Ground'], [27, 'Dark', 'Ice'], [28, 'Dark', 'Normal'], [29, 'Dark', 'Poison'], [30, 'Dark', 'Psychic'], [31, 'Dark', 'Rock'], [32, 'Dark', 'Steel'], [33, 'Dark', 'Water'], [34, 'Dragon', 'Electric'], [35, 'Dragon', 'Fairy'], [36, 'Dragon', 'Fighting'], [37, 'Dragon', 'Fire'], [38, 'Dragon', 'Flying'], [39, 'Dragon', 'Ghost'], [40, 'Dragon', 'Grass'], [41, 'Dragon', 'Ground'], [42, 'Dragon', 'Ice'], [43, 'Dragon', 'Nor

In [152]:
#####################################################
#Bringing it all together: Pokémon z-scores
#####################################################
import numpy as np
names = ['Abomasnow', 'Abra', 'Absol', 'Accelgor', 'Aerodactyl']
hps = np.array([ 80.,  60., 131.,  62.,  71.])
            
#analyze the Health Points using the z-score to see 
#how many standard deviations each Pokémon's HP is from the mean of all HPs.

# Calculate the total HP avg and total HP standard deviation
hp_avg = hps.mean()
hp_std = hps.std()

# Use NumPy to eliminate the previous for loop
z_scores = (hps - hp_avg)/hp_std

# Combine names, hps, and z_scores
poke_zscores2 = [*zip(names, hps, z_scores)]
print(*poke_zscores2[:3], sep='\n')
print('\n')

# Use list comprehension with the same logic as the highest_hp_pokemon code block
highest_hp_pokemon2 = [(names, hps, z_scores) for names, hps, z_scores in poke_zscores2 if z_scores > 1]
print(*highest_hp_pokemon2, sep='\n')

#two loops were eliminated using NumPy broadcasting and list comprehension. 



('Abomasnow', 80.0, -0.030665974986364614)
('Abra', 60.0, -0.7973153496454827)
('Absol', 131.0, 1.9242899303943866)


('Absol', 131.0, 1.9242899303943866)


In [162]:
#####################################################
#Pandas DataFrame iteration
#####################################################
import pandas as pd
import numpy as np
baseball_df = pd.read_csv('baseball_stats.csv')
print(baseball_df.head())
print('\n')
#Calculating a team win percentage (column W)
def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)
win_perc = calc_win_perc(50,100)
print(win_perc)

#create a new column that stores each team's win percentage for a season
win_perc_list = []
for i in range(len(baseball_df)):
    row = baseball_df.iloc[i]
    wins = row['W']
    games_played = row['G']
    win_perc = calc_win_perc(wins, games_played)
    win_perc_list.append(win_perc)
baseball_df['WP'] = win_perc_list

print(baseball_df.head())

###But looping over the DataFrame with .iloc is efficient?
#%%timeit before the function

##################################
#Loop over a DataFrame
#faster half the time .iloc takes
##################################
win_perc_list = []
for i,row in baseball_df.iterrows():
     wins = row['W']
    games_played = row['G']
    win_perc = calc_win_perc(wins, games_played)
    win_perc_list.append(win_perc)
baseball_df['WP'] = win_perc_list


  Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  RankSeason  \
0  ARI     NL  2012  734  688  81  0.328  0.418  0.259         0         NaN   
1  ATL     NL  2012  700  600  94  0.320  0.389  0.247         1         4.0   
2  BAL     AL  2012  712  705  93  0.311  0.417  0.247         1         5.0   
3  BOS     AL  2012  734  806  69  0.315  0.415  0.260         0         NaN   
4  CHC     NL  2012  613  759  61  0.302  0.378  0.240         0         NaN   

   RankPlayoffs    G   OOBP   OSLG  
0           NaN  162  0.317  0.415  
1           5.0  162  0.306  0.378  
2           4.0  162  0.315  0.403  
3           NaN  162  0.331  0.428  
4           NaN  162  0.335  0.424  


0.5
  Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  RankSeason  \
0  ARI     NL  2012  734  688  81  0.328  0.418  0.259         0         NaN   
1  ATL     NL  2012  700  600  94  0.320  0.389  0.247         1         4.0   
2  BAL     AL  2012  712  705  93  0.311  0.417  0.

In [169]:
#####################################################
#Iterating with .iterrows()
#####################################################

pit = { 'Team': ['PIT', 'PIT', 'PIT', 'PIT', 'PIT'],
        'League': ['NL', 'NL', 'NL', 'NL', 'NL'],
        'Year': [2012, 2011, 2010, 2009, 2008],
        'RS': [651, 610, 587, 636, 735], 
        'RA': [674, 712, 866, 768, 884], 
        'W': [79, 72, 57, 62, 67], 
        'G': [162, 162, 162, 161, 162], 
        'Playoffs' : [0, 0, 0, 0, 0]
       }
  
# Create DataFrame
pit_df = pd.DataFrame(pit)

# Iterate over pit_df and print each index variable and then each row
for i,row in pit_df.iterrows():
    print(i)
    #print(row)
    print(type(row))

# Iterate over pit_df and print each row
for i,row in pit_df.iterrows():
    print(type(row))
    
# Use one variable instead of two to store the result of .iterrows()
for row_tuple in pit_df.iterrows():
    print(type(row_tuple))  
    
#using i,row, access things from the row using square brackets (row['Team']). 
#using row_tuple, specify which element of the tuple you'd like to access 
#before grabbing the team name (row_tuple[1]['Team']).
#using .iterrows() will still be faster than using .iloc 


0
<class 'pandas.core.series.Series'>
1
<class 'pandas.core.series.Series'>
2
<class 'pandas.core.series.Series'>
3
<class 'pandas.core.series.Series'>
4
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>


In [171]:
#####################################################
#Run differentials with .iterrows()
#####################################################
#the run differential for each season from the year 2008 to 2012 is calculated by 
#subtracting the total number of runs a team allowed in a season 
#from the team's total number of runs scored in a season. 
#'RS' means runs scored and 'RA' means runs allowed.

def calc_run_diff(runs_scored, runs_allowed):
    run_diff = runs_scored - runs_allowed
    return run_diff

giants = { 'Team': ['SFG', 'SFG', 'SFG', 'SFG', 'SFG'],
              'League': ['NL', 'NL', 'NL', 'NL', 'NL'],
              'Year': [2012, 2011, 2010, 2009, 2008],
              'RS': [718, 570, 697, 657, 640], 
              'RA': [649, 578, 583, 611, 759], 
              'W': [94, 86, 92, 88, 72], 
              'G': [162, 162, 162, 161, 162], 
              'Playoffs' : [1, 0, 1, 0, 0]
            }
# Create DataFrame
giants_df = pd.DataFrame(giants)

# Create an empty list to store run differentials
run_diffs = []

# Write a for loop and collect runs allowed and runs scored for each row
for i,row in giants_df.iterrows():
    runs_scored = row['RS']
    runs_allowed = row['RA']
    
    # Use the provided function to calculate run_diff for each row
    run_diff = calc_run_diff(runs_scored, runs_allowed)
    
    # Append each run differential to the output list
    run_diffs.append(run_diff)

giants_df['RD'] = run_diffs
print(giants_df)



  Team League  Year   RS   RA   W    G  Playoffs   RD
0  SFG     NL  2012  718  649  94  162         1   69
1  SFG     NL  2011  570  578  86  162         0   -8
2  SFG     NL  2010  697  583  92  162         1  114
3  SFG     NL  2009  657  611  88  161         0   46
4  SFG     NL  2008  640  759  72  162         0 -119


In [180]:
#####################################################
#.itertuples()
#####################################################
import pandas as pd
import numpy as np
baseball_df = pd.read_csv('baseball_stats.csv')
team_wins_df = baseball_df[['Team', 'Year', 'W']]
print(team_wins_df.head())

for row_tuple in team_wins_df.iterrows():
    #print(row_tuple)
    #print(type(row_tuple[1]))
    row_tuple
#last one
print(row_tuple[1]['Team'])

for row_namedtuple in team_wins_df.itertuples():
    row_namedtuple
#last one
print(row_namedtuple.Team)
    
    

  Team  Year   W
0  ARI  2012  81
1  ATL  2012  94
2  BAL  2012  93
3  BOS  2012  69
4  CHC  2012  61
WSA
WSA


In [191]:
#####################################################
#Iterating with .itertuples()
#####################################################
baseball_df = pd.read_csv('baseball_stats.csv')
rangers_df = baseball_df[baseball_df['Team'] == 'TEX'].reset_index(drop = True) 
print(rangers_df.head())

# Loop over the DataFrame and print each row
for row in rangers_df.itertuples():
  #print(row)
  i = row.Index
  year = row.Year
  wins = row.W
  print(i, year, wins)

  # Check if rangers made Playoffs (1 means yes; 0 means no)
  if row.Playoffs == 1:
    print(row.Index, row.Year, row)

#using .itertuples(), use the dot syntax for referencing an attribute in a namedtuple.


  Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  RankSeason  \
0  TEX     AL  2012  808  707  93  0.334  0.446  0.273         1         5.0   
1  TEX     AL  2011  855  677  96  0.340  0.460  0.283         1         3.0   
2  TEX     AL  2010  787  687  90  0.338  0.419  0.276         1         7.0   
3  TEX     AL  2009  784  740  87  0.320  0.445  0.260         0         NaN   
4  TEX     AL  2008  901  967  79  0.354  0.462  0.283         0         NaN   

   RankPlayoffs    G   OOBP   OSLG  
0           5.0  162  0.309  0.408  
1           2.0  162  0.307  0.392  
2           2.0  162  0.319  0.390  
3           NaN  162  0.331  0.416  
4           NaN  162  0.362  0.455  
0 2012 93
0 2012 Pandas(Index=0, Team='TEX', League='AL', Year=2012, RS=808, RA=707, W=93, OBP=0.334, SLG=0.446, BA=0.273, Playoffs=1, RankSeason=5.0, RankPlayoffs=5.0, G=162, OOBP=0.309, OSLG=0.408)
1 2011 96
1 2011 Pandas(Index=1, Team='TEX', League='AL', Year=2011, RS=855, RA=677, W=96, OBP=0.

In [198]:
#####################################################
#Run differentials with .itertuples()
#####################################################
baseball_df = pd.read_csv('baseball_stats.csv')
yankees_df = baseball_df[baseball_df['Team'] == 'NYY'].reset_index(drop = True) 
print(yankees_df.head())

def calc_run_diff(runs_scored, runs_allowed):
    run_diff = runs_scored - runs_allowed
    return run_diff

run_diffs = []

# Loop over the DataFrame and calculate each row's run differential
for row in yankees_df.itertuples():
    runs_scored = row.RS
    runs_allowed = row.RA
    run_diff = calc_run_diff(row.RS, row.RA)
    run_diffs.append(run_diff)

# Append new column
yankees_df['RD'] = run_diffs
yankees_df.sort_values(by=['RD'], ascending=False)
#print(yankees_df)


  Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs  RankSeason  \
0  NYY     AL  2012  804  668   95  0.337  0.453  0.265         1         3.0   
1  NYY     AL  2011  867  657   97  0.343  0.444  0.263         1         2.0   
2  NYY     AL  2010  859  693   95  0.350  0.436  0.267         1         3.0   
3  NYY     AL  2009  915  753  103  0.362  0.478  0.283         1         1.0   
4  NYY     AL  2008  789  727   89  0.342  0.427  0.271         0         NaN   

   RankPlayoffs    G   OOBP   OSLG  
0           3.0  162  0.311  0.419  
1           4.0  162  0.322  0.399  
2           3.0  162  0.322  0.399  
3           1.0  162  0.327  0.408  
4           NaN  162  0.329  0.405  


Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,RD
14,NYY,AL,1998,965,656,114,0.364,0.46,0.288,1,1.0,1.0,162,,,309
1,NYY,AL,2011,867,657,97,0.343,0.444,0.263,1,2.0,4.0,162,0.322,0.399,210
15,NYY,AL,1997,891,688,96,0.362,0.436,0.287,1,3.0,4.0,162,,,203
10,NYY,AL,2002,897,697,103,0.354,0.455,0.275,1,1.0,4.0,161,0.309,0.395,200
5,NYY,AL,2007,968,777,94,0.366,0.463,0.29,1,2.0,4.0,162,0.34,0.417,191
32,NYY,AL,1977,831,651,100,0.344,0.444,0.281,1,3.0,1.0,162,,,180
25,NYY,AL,1985,839,660,97,0.344,0.425,0.267,0,,,161,,,179
13,NYY,AL,1999,900,731,98,0.366,0.453,0.282,1,3.0,1.0,162,0.329,0.4,169
45,NYY,AL,1963,714,547,104,0.309,0.403,0.252,1,1.0,2.0,161,,,167
2,NYY,AL,2010,859,693,95,0.35,0.436,0.267,1,3.0,3.0,162,0.322,0.399,166


In [199]:
#####################################################
#Aternative to looping | .apply()
#####################################################
baseball_df = pd.read_csv('baseball_stats.csv')
def calc_run_diff(runs_scored, runs_allowed):
    run_diff = runs_scored - runs_allowed
    return run_diff

run_diffs_apply = baseball_df.apply(lambda row: calc_run_diff(row['RS'], row['RA']), axis=1)
baseball_df['RD'] = run_diffs_apply
print(baseball_df)

     Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs  \
0     ARI     NL  2012  734  688   81  0.328  0.418  0.259         0   
1     ATL     NL  2012  700  600   94  0.320  0.389  0.247         1   
2     BAL     AL  2012  712  705   93  0.311  0.417  0.247         1   
3     BOS     AL  2012  734  806   69  0.315  0.415  0.260         0   
4     CHC     NL  2012  613  759   61  0.302  0.378  0.240         0   
...   ...    ...   ...  ...  ...  ...    ...    ...    ...       ...   
1227  PHI     NL  1962  705  759   81  0.330  0.390  0.260         0   
1228  PIT     NL  1962  706  626   93  0.321  0.394  0.268         0   
1229  SFG     NL  1962  878  690  103  0.341  0.441  0.278         1   
1230  STL     NL  1962  774  664   84  0.335  0.394  0.271         0   
1231  WSA     AL  1962  599  716   60  0.308  0.373  0.250         0   

      RankSeason  RankPlayoffs    G   OOBP   OSLG   RD  
0            NaN           NaN  162  0.317  0.415   46  
1            4.0     

In [202]:
#####################################################
#Analyzing baseball stats with .apply()
#####################################################
rays = { 'RS': [697, 707, 802, 803, 774], 
         'RA': [577, 614, 649, 754, 671], 
         'W': [90, 91, 96, 84, 97], 
         'Playoffs' : [0, 1, 1, 0, 1]
       }
# Create DataFrame
rays_df = pd.DataFrame(rays, index=[2012,2011,2010,2009,2008])
  
# Gather sum of all columns
stat_totals = rays_df.apply(sum, axis=0)
print(stat_totals)

# Gather total runs scored in all games per year
total_runs_scored = rays_df[['RS', 'RA']].apply(sum, axis=1)
print(total_runs_scored)

def text_playoffs(num_playoffs): 
    if num_playoffs == 1:
        return 'Yes'
    else:
        return 'No' 

# Convert numeric playoffs to text by applying text_playoffs()
textual_playoffs = rays_df.apply(lambda row: text_playoffs(row['Playoffs']), axis=1)
print(textual_playoffs)

#The .apply() method allows applying functions to all rows or columns of a DataFrame by specifying an axis.
#a better way to find these stats would use the pandas built-in .sum() method.
#rays_df.sum(axis=0) to get columnar sums and rays_df[['RS', 'RA']].sum(axis=1) to get row sums.
#rays_df['Playoffs'].apply(text_playoffs) to convert the 'Playoffs' column to text.


RS          3783
RA          3265
W            458
Playoffs       3
dtype: int64
2012    1274
2011    1321
2010    1451
2009    1557
2008    1445
dtype: int64
2012     No
2011    Yes
2010    Yes
2009     No
2008    Yes
dtype: object


In [206]:
#####################################################
#Settle a debate with .apply()
#####################################################
#One manager claims that the team has made the playoffs every year 
#they have had a win percentage of 0.50 or greater. 
#Another manager says this is not true.

def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)

baseball_df = pd.read_csv('baseball_stats.csv')
dbacks_df = baseball_df[baseball_df['Team'] == 'ARI'].reset_index(drop = True) 
print(dbacks_df)

# Create a win percentage Series 
win_percs = dbacks_df.apply(lambda row: calc_win_perc(row['W'], row['G']), axis=1)
print(win_percs, '\n')

# Append a new column to dbacks_df
dbacks_df['WP'] = win_percs
print(dbacks_df, '\n')

# Display dbacks_df where WP is greater than 0.50
print(dbacks_df[dbacks_df['WP'] >= 0.50])

#Using the .apply() method with a lambda function allows 
#to apply a function to a DataFrame without the need to write a for loop.

#The second manager was correct. 
#In the year 2012, 2008, 2003, and 2000 the ARI team had a win percentage greater than or equal to 0.50, 
#but still did not make the playoffs.


   Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs  \
0   ARI     NL  2012  734  688   81  0.328  0.418  0.259         0   
1   ARI     NL  2011  731  662   94  0.322  0.413  0.250         1   
2   ARI     NL  2010  713  836   65  0.325  0.416  0.250         0   
3   ARI     NL  2009  720  782   70  0.324  0.418  0.253         0   
4   ARI     NL  2008  720  706   82  0.327  0.415  0.251         0   
5   ARI     NL  2007  712  732   90  0.321  0.413  0.250         1   
6   ARI     NL  2006  773  788   76  0.331  0.424  0.267         0   
7   ARI     NL  2005  696  856   77  0.332  0.421  0.256         0   
8   ARI     NL  2004  615  899   51  0.310  0.393  0.253         0   
9   ARI     NL  2003  717  685   84  0.330  0.417  0.263         0   
10  ARI     NL  2002  819  674   98  0.346  0.423  0.267         1   
11  ARI     NL  2001  818  677   92  0.341  0.442  0.267         1   
12  ARI     NL  2000  792  754   85  0.333  0.429  0.265         0   
13  ARI     NL  1999

In [214]:
#####################################################
#Optimal pandas iterating | Using Numpy
#####################################################
baseball_df = pd.read_csv('baseball_stats.csv')

#Collecting DataFrame column values into an array
wins_np = baseball_df['W'].values
print(type(wins_np))
print(wins_np)

run_diffs_np = baseball_df['RS'].values - baseball_df['RA'].values
baseball_df['RD'] = run_diffs_np
print(baseball_df)


<class 'numpy.ndarray'>
[ 81  94  93 ... 103  84  60]
     Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs  \
0     ARI     NL  2012  734  688   81  0.328  0.418  0.259         0   
1     ATL     NL  2012  700  600   94  0.320  0.389  0.247         1   
2     BAL     AL  2012  712  705   93  0.311  0.417  0.247         1   
3     BOS     AL  2012  734  806   69  0.315  0.415  0.260         0   
4     CHC     NL  2012  613  759   61  0.302  0.378  0.240         0   
...   ...    ...   ...  ...  ...  ...    ...    ...    ...       ...   
1227  PHI     NL  1962  705  759   81  0.330  0.390  0.260         0   
1228  PIT     NL  1962  706  626   93  0.321  0.394  0.268         0   
1229  SFG     NL  1962  878  690  103  0.341  0.441  0.278         1   
1230  STL     NL  1962  774  664   84  0.335  0.394  0.271         0   
1231  WSA     AL  1962  599  716   60  0.308  0.373  0.250         0   

      RankSeason  RankPlayoffs    G   OOBP   OSLG   RD  
0            NaN        

In [217]:
###################################################################
#Optimal pandas iterating | Replacing .iloc with underlying arrays
###################################################################
baseball_df = pd.read_csv('baseball_stats.csv')
baseball_df = baseball_df[['Team', 'League', 'Year', 'RS', 'RA','W', 'G', 'Playoffs']]

# Use the W array and G array to calculate win percentages
win_percs_np = calc_win_perc(baseball_df['W'].values, baseball_df['G'].values)

# Append a new column to baseball_df that stores all win percentages
baseball_df['WP'] = win_percs_np

print(baseball_df.head())

#Using a DataFrame's underlying arrays to perform calculations 
#can really speed up your code and yields some significant efficiency gain

###################################################################
#Bringing it all together: Predict win percentage
###################################################################
#predict a team's win percentage for a given season 
#by using the team's total runs scored in a season ('RS') 
#and total runs allowed in a season ('RA') with the following function:
def predict_win_perc(RS, RA):
    prediction = RS ** 2 / (RS ** 2 + RA ** 2)
    return np.round(prediction, 2)

win_perc_preds_loop = []

# Use a loop and .itertuples() to collect each row's predicted win percentage
for row in baseball_df.itertuples():
    runs_scored = row.RS
    runs_allowed = row.RA
    win_perc_pred = predict_win_perc(runs_scored, runs_allowed)
    win_perc_preds_loop.append(win_perc_pred)

# Apply predict_win_perc to each row of the DataFrame
win_perc_preds_apply = baseball_df.apply(lambda row: predict_win_perc(row['RS'], row['RA']), axis=1)

# Calculate the win percentage predictions using NumPy arrays
win_perc_preds_np = predict_win_perc(baseball_df['RS'].values, baseball_df['RA'].values)
baseball_df['WP_preds'] = win_perc_preds_np
print(baseball_df.head())


  Team League  Year   RS   RA   W    G  Playoffs    WP
0  ARI     NL  2012  734  688  81  162         0  0.50
1  ATL     NL  2012  700  600  94  162         1  0.58
2  BAL     AL  2012  712  705  93  162         1  0.57
3  BOS     AL  2012  734  806  69  162         0  0.43
4  CHC     NL  2012  613  759  61  162         0  0.38
  Team League  Year   RS   RA   W    G  Playoffs    WP  WP_preds
0  ARI     NL  2012  734  688  81  162         0  0.50      0.53
1  ATL     NL  2012  700  600  94  162         1  0.58      0.58
2  BAL     AL  2012  712  705  93  162         1  0.57      0.50
3  BOS     AL  2012  734  806  69  162         0  0.43      0.45
4  CHC     NL  2012  613  759  61  162         0  0.38      0.39
