In [2]:
# Loading a Sample Pandas DataFrame
import pandas as pd

# One row per person, one column per attribute. `age_missing_data` repeats
# `age` with its last entry left empty, so it is stored as float with a NaN.
df = pd.DataFrame(
    data=dict(
        name=['James', 'Jane', 'Melissa', 'Ed', 'Neil'],
        age=[30, 40, 32, 67, 43],
        score=['90%', '95%', '100%', '82%', '87%'],
        age_missing_data=[30, 40, 32, 67, None],
        income=[100000, 80000, 55000, 62000, 120000],
    )
)
print(df)

      name  age score  age_missing_data  income
0    James   30   90%              30.0  100000
1     Jane   40   95%              40.0   80000
2  Melissa   32  100%              32.0   55000
3       Ed   67   82%              67.0   62000
4     Neil   43   87%               NaN  120000


In [3]:
# Visualizing the Difference Between Vectorization and Scalar Operations

# Scalar approach (simplified with a for loop): visit the column one value at
# a time, keeping a running count and a running total, then divide at the end.
length, age_sum = 0, 0
for item in df['age']:
    age_sum = age_sum + item
    length = length + 1

average_age_for_loop = age_sum / length

# Vectorized approach: one call that operates on the whole column at once.
average_age_vectorized = df['age'].mean()

In [4]:
# Creating a dictionary of genders
# Keys match the entries of the 'name' column; values are the labels to assign.
genders = {
    'James': 'Male',
    'Jane': 'Female',
    'Melissa': 'Female',
    'Ed': 'Male',
    'Neil': 'Male',
}


In [5]:
# Applying a dictionary to the map method
# Each value in df['name'] is looked up as a key in `genders`; the matching
# dictionary value becomes that row's entry in the new 'gender' column.
df['gender'] = df['name'].map(genders)
print(df)

      name  age score  age_missing_data  income  gender
0    James   30   90%              30.0  100000    Male
1     Jane   40   95%              40.0   80000  Female
2  Melissa   32  100%              32.0   55000  Female
3       Ed   67   82%              67.0   62000    Male
4     Neil   43   87%               NaN  120000    Male


In [6]:
# Mapping in a custom function
# The average is computed once here; higher_income() reads it as a
# module-level variable instead of recomputing it for every element.
mean_income = df['income'].mean()

def higher_income(x):
    """Tell whether a single income value exceeds the average income.

    The comparison is made against the module-level ``mean_income``, which
    must already be defined when this function is called.

    Parameters
    ----------
    x : number
        One income value (as passed element-by-element by ``Series.map``).

    Returns
    -------
    The result of ``x > mean_income``.
    """
    above_average = x > mean_income
    return above_average

# map() calls higher_income once per value in df['income']; the returned
# booleans form the new 'higher_than_avg_income' column.
df['higher_than_avg_income'] = df['income'].map(higher_income)
print(df)


      name  age score  age_missing_data  income  gender  \
0    James   30   90%              30.0  100000    Male   
1     Jane   40   95%              40.0   80000  Female   
2  Melissa   32  100%              32.0   55000  Female   
3       Ed   67   82%              67.0   62000    Male   
4     Neil   43   87%               NaN  120000    Male   

   higher_than_avg_income  
0                    True  
1                   False  
2                   False  
3                   False  
4                    True  


In [7]:
# Mapping in an Anonymous Function
# Same comparison as the named higher_income() function, written inline as a
# lambda so no separate definition is needed.
mean_income = df['income'].mean()
df['higher_than_avg_income'] = df['income'].map(
    lambda income: income > mean_income
)
print(df)

      name  age score  age_missing_data  income  gender  \
0    James   30   90%              30.0  100000    Male   
1     Jane   40   95%              40.0   80000  Female   
2  Melissa   32  100%              32.0   55000  Female   
3       Ed   67   82%              67.0   62000    Male   
4     Neil   43   87%               NaN  120000    Male   

   higher_than_avg_income  
0                    True  
1                   False  
2                   False  
3                   False  
4                    True  


In [8]:
# Mapping in a Series
# The Series is indexed by the values of df['name'], so map() can look each
# name up in that index and return the surname stored under that label.
last_names = pd.Series(
    data=['Doe', 'Miller', 'Edwards', 'Nelson', 'Raul'],
    index=df['name'],
)
df['Last Name'] = df['name'].map(last_names)

print(df)

      name  age score  age_missing_data  income  gender  \
0    James   30   90%              30.0  100000    Male   
1     Jane   40   95%              40.0   80000  Female   
2  Melissa   32  100%              32.0   55000  Female   
3       Ed   67   82%              67.0   62000    Male   
4     Neil   43   87%               NaN  120000    Male   

   higher_than_avg_income Last Name  
0                    True       Doe  
1                   False    Miller  
2                   False   Edwards  
3                   False    Nelson  
4                    True      Raul  


In [9]:
# Applying a function to an entire dataframe
def interview(row, max_age=45, min_income=75000):
    """Decide whether one person qualifies for an interview.

    A person qualifies when they are younger than ``max_age`` and earn more
    than ``min_income``. Both comparisons are strict.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'age'`` and ``'income'``, such as the row
        that ``df.apply(interview, axis=1)`` passes in.
    max_age : number, default 45
        Exclusive upper bound on ``row['age']``.
    min_income : number, default 75000
        Exclusive lower bound on ``row['income']``.

    Returns
    -------
    The truth value of ``row['age'] < max_age and row['income'] > min_income``.
    """
    # The thresholds are parameters rather than literals so the same function
    # can be reused with other cut-offs, e.g.
    # df.apply(interview, axis=1, max_age=50, min_income=60000).
    return row['age'] < max_age and row['income'] > min_income

# axis=1 makes apply() hand each row to interview() in turn; the value
# returned for a row becomes that row's entry in the new 'interview' column.
df['interview'] = df.apply(interview, axis=1)
print(df)

      name  age score  age_missing_data  income  gender  \
0    James   30   90%              30.0  100000    Male   
1     Jane   40   95%              40.0   80000  Female   
2  Melissa   32  100%              32.0   55000  Female   
3       Ed   67   82%              67.0   62000    Male   
4     Neil   43   87%               NaN  120000    Male   

   higher_than_avg_income Last Name  interview  
0                    True       Doe       True  
1                   False    Miller       True  
2                   False   Edwards      False  
3                   False    Nelson      False  
4                    True      Raul       True  


In [10]:
# Passing additional arguments to a function used with .apply
def bonus(row, amount, give=False):
    """Compute the bonus for one person.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'income'`` and ``'age'``.
    amount : number
        Multiplier applied to the income-to-age ratio.
    give : bool, default False
        When falsy, no bonus is awarded.

    Returns
    -------
    ``row['income'] / row['age'] * amount`` when ``give`` is truthy,
    otherwise the integer ``0``.
    """
    if not give:
        return 0
    return row['income'] / row['age'] * amount

# `args` supplies bonus()'s extra positional parameter (amount), and the
# extra keyword `give` is forwarded to bonus() as well.
df['bonus'] = df.apply(bonus, axis=1, args=(0.25,), give=True)
print(df)

      name  age score  age_missing_data  income  gender  \
0    James   30   90%              30.0  100000    Male   
1     Jane   40   95%              40.0   80000  Female   
2  Melissa   32  100%              32.0   55000  Female   
3       Ed   67   82%              67.0   62000    Male   
4     Neil   43   87%               NaN  120000    Male   

   higher_than_avg_income Last Name  interview       bonus  
0                    True       Doe       True  833.333333  
1                   False    Miller       True  500.000000  
2                   False   Edwards      False  429.687500  
3                   False    Nelson      False  231.343284  
4                    True      Raul       True  697.674419  


Exercise 1.3C1

Question 1: Solution

In [11]:
# Remove the percent sign from each score string and convert the rest to int.
df['percent'] = df['score'].map(
    lambda score: int(score.replace('%', ''))
)
print(df)

      name  age score  age_missing_data  income  gender  \
0    James   30   90%              30.0  100000    Male   
1     Jane   40   95%              40.0   80000  Female   
2  Melissa   32  100%              32.0   55000  Female   
3       Ed   67   82%              67.0   62000    Male   
4     Neil   43   87%               NaN  120000    Male   

   higher_than_avg_income Last Name  interview       bonus  percent  
0                    True       Doe       True  833.333333       90  
1                   False    Miller       True  500.000000       95  
2                   False   Edwards      False  429.687500      100  
3                   False    Nelson      False  231.343284       82  
4                    True      Raul       True  697.674419       87  


Question 2: Solution

In [12]:
# Each person's share of the summed income. The result is kept as a
# fraction of the total (it is not multiplied by 100).
total_income = df['income'].sum()
df['perc_of_total'] = df['income'].div(total_income)

print(df)

      name  age score  age_missing_data  income  gender  \
0    James   30   90%              30.0  100000    Male   
1     Jane   40   95%              40.0   80000  Female   
2  Melissa   32  100%              32.0   55000  Female   
3       Ed   67   82%              67.0   62000    Male   
4     Neil   43   87%               NaN  120000    Male   

   higher_than_avg_income Last Name  interview       bonus  percent  \
0                    True       Doe       True  833.333333       90   
1                   False    Miller       True  500.000000       95   
2                   False   Edwards      False  429.687500      100   
3                   False    Nelson      False  231.343284       82   
4                    True      Raul       True  697.674419       87   

   perc_of_total  
0       0.239808  
1       0.191847  
2       0.131894  
3       0.148681  
4       0.287770  
