In [4]:
import pandas as pd
import numpy as np

### enumerate()

Your task is, given a string, to define the function retrieve_character_indices() that creates a dictionary character_indices, where each key represents a unique character from the string and the corresponding value is a list containing the indices/positions of this letter in the string.

For example, passing the string 'ukulele' to the retrieve_character_indices() function should result in the following output: {'e': [4, 6], 'k': [1], 'l': [3, 5], 'u': [0, 2]}.

For this task, you are not allowed to use any string methods!

In [1]:
def retrieve_character_indices(string):
    character_indices = dict()
    # Define the 'for' loop
    for index, character in enumerate(string):
        # Update the dictionary if the key already exists
        if character in character_indices:
            character_indices[character].append(index)
        # Update the dictionary if the key is absent
        else:
            character_indices[character] = [index]
            
    return character_indices
  
print(retrieve_character_indices('enumerate an Iterable'))

{'e': [0, 4, 8, 15, 20], 'n': [1, 11], 'u': [2], 'm': [3], 'r': [5, 16], 'a': [6, 10, 17], 't': [7, 14], ' ': [9, 12], 'I': [13], 'b': [18], 'l': [19]}


### Iterators:

All Iterables like list, set, or dict must have the associated Iterator. You are given the dictionary pets whose keys are Harry Potter characters and the values are the corresponding creature companions they had. 

Your task is to answer the set of questions regarding the Iterator created from the pets dictionary.

In [2]:
pets = {'Harry': 'Hedwig the owl', 'Hermione': 'Crookshanks the cat', 'Ron': 'Scabbers the rat'}

In [6]:
pets.values()

dict_values(['Hedwig the owl', 'Crookshanks the cat', 'Scabbers the rat'])

### Traversing a DataFrame
Let's iterate through a DataFrame! You are given the heroes DataFrame. This time, it contains only categorical data and no missing values. Create the following dictionary from this dataset:

    . Each key is a column name.
    . Each value is another dictionary:
        . Each key is a unique category from the column.
        . Each value is the amount of heroes falling into this category.

Tip: a Series object is also an Iterable. It traverses through the values it stores when you put it in a for loop or pass it to list(), tuple(), or set() initializers.

In [30]:
heroes = pd.read_csv('heroes_information.csv',index_col=['name'],usecols=['name','Gender','Eye color','Race','Hair color','Publisher','Skin color','Alignment'])
heroes.head()

Unnamed: 0_level_0,Gender,Eye color,Race,Hair color,Publisher,Skin color,Alignment
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A-Bomb,Male,yellow,Human,No Hair,Marvel Comics,-,good
Abe Sapien,Male,blue,Icthyo Sapien,No Hair,Dark Horse Comics,blue,good
Abin Sur,Male,blue,Ungaran,No Hair,DC Comics,red,good
Abomination,Male,green,Human / Radiation,No Hair,Marvel Comics,-,bad
Abraxas,Male,blue,Cosmic Entity,Black,Marvel Comics,-,bad


In [32]:
column_counts = dict()

# Traverse through the columns in the heroes DataFrame
for column_name, series in heroes.iteritems():
    # Retrieve the values stored in series in a list form
    values = list(series)
    category_counts = dict()  
    # Traverse through unique categories in values
    for category in set(values):
        # Count the appearance of category in values
        category_counts[category] = values.count(category)
    
    column_counts[column_name] = category_counts
    
print(column_counts)

{'Gender': {'Male': 505, 'Female': 200, '-': 29}, 'Eye color': {'gold': 3, 'white': 17, 'white / red': 1, 'yellow (without irises)': 2, 'yellow / blue': 1, 'yellow / red': 1, 'indigo': 1, 'yellow': 19, 'silver': 1, '-': 172, 'grey': 6, 'blue / white': 1, 'red': 46, 'amber': 2, 'purple': 4, 'black': 23, 'violet': 2, 'hazel': 6, 'brown': 127, 'green': 73, 'green / blue': 1, 'blue': 225}, 'Race': {'Spartoi': 1, 'Luphomoid': 1, 'Metahuman': 2, 'Gungan': 1, 'Kryptonian': 7, 'Zombie': 1, 'Human / Radiation': 11, 'God / Eternal': 14, 'Animal': 4, 'Xenomorph XX121': 1, 'Alien': 7, 'Alpha': 5, 'Yautja': 1, 'Human-Vuldarian': 1, 'Bolovaxian': 1, 'Demon': 6, 'Symbiote': 9, 'Amazon': 2, 'Atlantean': 5, 'Human / Cosmic': 2, 'Human / Altered': 3, 'Kaiju': 1, 'Gorilla': 1, 'Strontian': 1, 'Ungaran': 1, 'Eternal': 2, 'Dathomirian Zabrak': 1, 'Tamaranean': 1, 'Rodian': 1, 'Human-Kree': 2, 'Parademon': 1, 'Saiyan': 2, 'Human-Spartoi': 1, '-': 304, 'Icthyo Sapien': 1, 'Zen-Whoberian': 1, 'Asgardian': 5, 

###  Basic list comprehensions
For this task, you will have to create a bag-of-words representation of the spam email stored in the spam variable (you can explore the content using the shell). Recall that bag-of-words is simply a counter of unique words in a given text. This representation can be further used for text classification, e.g. for spam detection (given enough training examples).

We created a small auxiliary function create_word_list() to help you split a string into words, e.g. applying it to 'To infinity... and beyond!' will return ['To', 'infinity', 'and', 'beyond'].

In [37]:
spam = """Dear User,

Our Administration Team needs to inform you that you are reaching the storage limit of your Mailbox account.
You have to verify your account within the next 24 hours.
Otherwise, it will not be possible to use the service.
Please, click on the link below to verify your account and continue using our service.

Your Administration Team."""

In [41]:
# Convert the text to lower case and create a word list
words = spam.lower().split()

# Create a set storing only unique words
word_set = set(words)

# Create a dictionary that counts each word in the list
tuples = [(word, words.count(word)) for word in word_set]
word_counter = dict(tuples)

# Printing words that appear more than once
for (key, value) in word_counter.items():
    if value > 1:
        print("{}: {}".format(key, value))

you: 3
our: 2
to: 4
account: 2
service.: 2
verify: 2
administration: 2
your: 4
the: 4


### Prime number sequence
A prime number is a positive number that is divisible only by 1 or itself (e.g. 3, 7, 11 etc.). However, 1 is not a prime number.

Your task is, given a list of candidate numbers cands, to filter only prime numbers in a new list primes.

But first, you need to create a function is_prime() that returns True if the input number n is prime or False, otherwise. A number is prime if it is not divisible by any integer number from 2 to √n (any number n is not divisible by anything higher than √n).

In [44]:
import math

In [45]:
cands = [1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49]

In [46]:
def is_prime(n):
    # Define the initial check
    if n < 2:
        return False
    # Define the loop checking if a number is not prime
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True
    
# Filter prime numbers into the new list
primes = [num for num in cands if is_prime(num)]
print("primes = " + str(primes))

primes = [5, 13, 17, 29, 37, 41]


### Coprime number sequence
Two numbers a and b are coprime if their Greatest Common Divisor (GCD) is 1. GCD is the largest positive number that divides two given numbers a and b. For example, the numbers 7 and 9 are coprime because their GCD is 1.

Given two lists list1 and list2, your task is to create a new list coprimes that contains all the coprime pairs from list1 and list2.

But first, you need to write a function for the GCD using the following algorithm:

    check if b=0
    if true, return a as the GCD between a and b
    if false, go to step 2
    make a substitution a←b and b←a%b
    go back to step 1

In [48]:
list1 = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70]
list2 = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98]

In [52]:
def gcd(a, b):
    # Define the while loop as described
    while b != 0:
        a,b = b, a%b
        # OR
        # temp_a = a
        # a=b
        # b=a%b
        print(a)
        print(b)
    # Complete the return statement
    return a
    
# Create a list of tuples defining pairs of coprime numbers
#coprimes = [(i,j) for i in list1 for j in list2 if gcd(i,j)==1]
#print(coprimes)

In [53]:
print(gcd(5,7))

7
5
5
2
2
1
1
0
1


In [54]:
5%7

5

### Combining iterable objects
You are given the list wlist that contains lists of different words. 

Your task is to create a new list of tuples, where each tuple contains a list from the wlist, its length, and the longest word. If there is ambiguity in choosing the longest word, the word with the lowest index in the considered list should be taken into account.

In [55]:
wlist = [['Python', 'creativity', 'universe'], ['interview', 'study', 'job', 'university', 'lecture'], ['task', 'objective', 'aim', 'subject', 'programming', 'test', 'research']]

In [60]:
# Define a function searching for the longest word
def get_longest_word(words):
    longest_word = ''
    for word in words:
        if len(word) > len(longest_word):
            longest_word = word
    return longest_word,len(longest_word)

# Create lists with the lengths and longest words
lengths = [len(item) for item in wlist]
words = [get_longest_word(item) for item in wlist]

# Combine the resulting data into one iterable object
for item in zip(wlist,lengths,words):
    print(item)

(['Python', 'creativity', 'universe'], 3, 'creativity')
(['interview', 'study', 'job', 'university', 'lecture'], 5, 'university')
(['task', 'objective', 'aim', 'subject', 'programming', 'test', 'research'], 7, 'programming')


In [59]:
for word in wlist:
    print(word)

['Python', 'creativity', 'universe']
['interview', 'study', 'job', 'university', 'lecture']
['task', 'objective', 'aim', 'subject', 'programming', 'test', 'research']


#### An alternate method:
    First, put the same calculations into one list comprehension, which should result in a list of tuples. 
    Second, apply the unzip operation to generate two distinct tuples, resembling lengths and words

In [61]:
# Create a list of tuples with lengths and longest words
result = [
    (len(item), get_longest_word(item)) for item in wlist
]

# Unzip the result    
lengths, words = zip(*result)

for item in zip(wlist, lengths, words):
    print(item)

(['Python', 'creativity', 'universe'], 3, 'creativity')
(['interview', 'study', 'job', 'university', 'lecture'], 5, 'university')
(['task', 'objective', 'aim', 'subject', 'programming', 'test', 'research'], 7, 'programming')


### Creating a DataFrame
Create a DataFrame from a dictionary supplied by a zip object. You have to take each single word stored in the list wlist and calculate its length. This data should be stored in two separate tuples that are supplied to the zip() initializer. The resulting zip object should be used to construct a DataFrame where the first column will store words and the second column will store their lengths.

In [2]:
wlist = [['Python', 'creativity', 'universe'], ['interview', 'study', 'job', 'university', 'lecture'], ['task', 'objective', 'aim', 'subject', 'programming', 'test', 'research']]

In [9]:
# Create a list of tuples with words and their lengths
word_lengths = [
    (item, len(item)) for items in wlist for item in items
]

# Unwrap the word_lengths
words, lengths = zip(*word_lengths)

# Create a zip object
col_names = ['word', 'length']
result = zip(col_names, [words, lengths])


# Convert the result to a dictionary and build a DataFrame
data_frame = pd.DataFrame(dict(result))
print(data_frame)

           word  length
0        Python       6
1    creativity      10
2      universe       8
3     interview       9
4         study       5
5           job       3
6    university      10
7       lecture       7
8          task       4
9     objective       9
10          aim       3
11      subject       7
12  programming      11
13         test       4
14     research       8


### Shift a string
You're going to create a generator that, given a string, produces a sequence of constituent characters shifted by a specified number of positions. For example, the string 'sushi' will result in the sequence 'h', 'i', 's', 'u', 's' if we use the shift of 2 positions to the right. If we use the shift of 2 positions to the left (or simply, -2), the resulting sequence will be 's', 'h', 'i', 's', 'u'.

Tip: the % operator might be helpful when indexing your string. Applying it to a positive or negative number gives a non-negative remainder, which can be helpful when shifting your index. For example:

    -2 % 10 = 8 
    -1 % 10 = 9
    0 % 10 = 0
    1 % 10 = 1
    2 % 10 = 2

In [22]:
string = 'sushi'
len_string = len(string)
print(len_string)
for idx in range(0,len_string):
    yield string[(idx-5) % len_string]

5


SyntaxError: 'yield' outside function (<ipython-input-22-4aea7e2ac87e>, line 5)

In [27]:
def shift_string(string, shift):
    len_string = len(string)
    # Define a for loop with the yield statement
    for idx in range(len_string):
        yield string[(idx - shift) % len_string]

# Create a generator
gen = shift_string('DataCamp', 5)

# Create a new string using the generator and print it out
string_shifted = ''.join(gen)
print(string_shifted)

aCampDat


### Throw a dice
Let's create an infinite generator! Your task is to define the simulate_dice_throws() generator. It generates the outcomes of a 6-sided dice tosses in the form of a dictionary out. Each key is a possible outcome (1, 2, 3, 4, 5, 6). Each value is a list: the first value is the amount of realizations of an outcome and the second, the ratio of realizations to the total number of tosses total.

Tip: use the randint() function from the random module. It generates a random integer in the specified interval (e.g. randint(1, 2) can be 1 or 2).

In [39]:
import random

def simulate_dice_throws():
    total, out = 0, dict([(i, [0, 0]) for i in range(1, 7)])
    while True:
        # Simulate a single toss to get a new number
        num = random.randint(1, 6)
        total += 1
        # Update the number and the ratio of realizations
        out[num][0] = out[num][0] + 1
        out[num][1] = round(out[num][0]/total, 2)
        # Yield the updated dictionary
        yield out

# Create the generator and simulate 10 tosses
dice_simulator = simulate_dice_throws()
for i in range(1, 10):
    print(str(i) + ': ' + str(next(dice_simulator)))

1: {1: [0, 0], 2: [0, 0], 3: [0, 0], 4: [0, 0], 5: [1, 1.0], 6: [0, 0]}
2: {1: [0, 0], 2: [0, 0], 3: [0, 0], 4: [0, 0], 5: [1, 1.0], 6: [1, 0.5]}
3: {1: [0, 0], 2: [0, 0], 3: [0, 0], 4: [0, 0], 5: [2, 0.67], 6: [1, 0.5]}
4: {1: [0, 0], 2: [0, 0], 3: [0, 0], 4: [0, 0], 5: [3, 0.75], 6: [1, 0.5]}
5: {1: [0, 0], 2: [1, 0.2], 3: [0, 0], 4: [0, 0], 5: [3, 0.75], 6: [1, 0.5]}
6: {1: [0, 0], 2: [2, 0.33], 3: [0, 0], 4: [0, 0], 5: [3, 0.75], 6: [1, 0.5]}
7: {1: [0, 0], 2: [3, 0.43], 3: [0, 0], 4: [0, 0], 5: [3, 0.75], 6: [1, 0.5]}
8: {1: [0, 0], 2: [3, 0.43], 3: [0, 0], 4: [0, 0], 5: [3, 0.75], 6: [2, 0.25]}
9: {1: [0, 0], 2: [3, 0.43], 3: [1, 0.11], 4: [0, 0], 5: [3, 0.75], 6: [2, 0.25]}


## Generator Comprehension:

### Rewrite the following functions as generator comprehension:

In [42]:
def func1(n):
    for i in range(0, n):
        yield i**2


# Task: rewrite func1() as generator comprehension with n=10

# Solution:
gen = (i**2 for i in range(10))

# check whether the func1() and gen gives same output
for item in zip(gen, func1(10)):
    print(item)

(0, 0)
(1, 1)
(4, 4)
(9, 9)
(16, 16)
(25, 25)
(36, 36)
(49, 49)
(64, 64)
(81, 81)


In [43]:
def func2(n):
    for i in range(0, n):
        if i%2 == 0:
            yield 2*i
            
            
# Task: rewrite func2() as generator comprehension with n=20

# Solution:
gen = (2*i for i in range(20) if i%2==0)

# check whether the func1() and gen gives same output
for item in zip(gen, func2(20)):
    print(item)

(0, 0)
(4, 4)
(8, 8)
(12, 12)
(16, 16)
(20, 20)
(24, 24)
(28, 28)
(32, 32)
(36, 36)


In [49]:
def func3(n, m):
    for i in func1(n):
        for j in func2(m):
            yield ((i, j), i + j)
            
            
# Task: rewrite func3() as generator comprehension with n=8, m=20

# Solution:
gen = (((i,j),i+j) for i in (i**2 for i in range(8)) for j in (2*j for j in range(10) if j%2==0))

# check whether the func1() and gen gives same output
for item in zip(gen, func3(8, 10)):
    print(item)

(((0, 0), 0), ((0, 0), 0))
(((0, 4), 4), ((0, 4), 4))
(((0, 8), 8), ((0, 8), 8))
(((0, 12), 12), ((0, 12), 12))
(((0, 16), 16), ((0, 16), 16))
(((1, 0), 1), ((1, 0), 1))
(((1, 4), 5), ((1, 4), 5))
(((1, 8), 9), ((1, 8), 9))
(((1, 12), 13), ((1, 12), 13))
(((1, 16), 17), ((1, 16), 17))
(((4, 0), 4), ((4, 0), 4))
(((4, 4), 8), ((4, 4), 8))
(((4, 8), 12), ((4, 8), 12))
(((4, 12), 16), ((4, 12), 16))
(((4, 16), 20), ((4, 16), 20))
(((9, 0), 9), ((9, 0), 9))
(((9, 4), 13), ((9, 4), 13))
(((9, 8), 17), ((9, 8), 17))
(((9, 12), 21), ((9, 12), 21))
(((9, 16), 25), ((9, 16), 25))
(((16, 0), 16), ((16, 0), 16))
(((16, 4), 20), ((16, 4), 20))
(((16, 8), 24), ((16, 8), 24))
(((16, 12), 28), ((16, 12), 28))
(((16, 16), 32), ((16, 16), 32))
(((25, 0), 25), ((25, 0), 25))
(((25, 4), 29), ((25, 4), 29))
(((25, 8), 33), ((25, 8), 33))
(((25, 12), 37), ((25, 12), 37))
(((25, 16), 41), ((25, 16), 41))
(((36, 0), 36), ((36, 0), 36))
(((36, 4), 40), ((36, 4), 40))
(((36, 8), 44), ((36, 8), 44))
(((36, 12),