<a href="https://colab.research.google.com/github/prof-rossetti/intro-to-python/blob/main/units/msfo-833/Class_2_Exercises_(MSFO_833_Spring_2023)_SOLUTIONS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Imports

In [None]:
# RUN THIS CELL AND FEEL FREE TO MODIFY / UPDATE AS DESIRED

import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt


## Helper Function (USD Formatting)



The function below can be used to convert a number into a dollar-sign formatted string. 

Run the cell, and feel free to use / invoke this function later as desired.

In [None]:
#
# SETUP CELL (RUN THIS CELL, AND DO NOT MODIFY)
#

def to_usd(my_price):
    """
        Renders a numeric value as a USD-formatted string, for printing and display purposes.
        The value is coerced to a float, rounded to two decimal places,
        given commas as the thousands separator, and prefixed with a dollar sign.

        Param: my_price (int or float or str) like 4000.444444 or "4000.444444"

        Example: to_usd(4000.444444)

        Returns: (str) like '$4,000.44'
    """
    amount = float(my_price)
    formatted_amount = format(amount, ",.2f")
    return "$" + formatted_amount


In [None]:
# example invocations:
print(to_usd(4.5))
print(to_usd(1234567890.12345))

$4.50
$1,234,567,890.12


## Helper Function (PCT Formatting)

In [None]:
#
# SETUP CELL (RUN THIS CELL, AND DO NOT MODIFY)
#


def to_pct(my_number):
    """
        Renders a decimal number as a percentage string.
        The value is multiplied by 100, rounded to 2 decimal places, and suffixed with a percent sign.

        Param my_number (float) like 0.95555555555

        Returns (str) like '95.56%'
    """
    pct_value = my_number * 100
    return "{:.2f}%".format(pct_value)




In [None]:
# example invocations:
print(to_pct(0.5))
print(to_pct(.955555555))

50.00%
95.56%


# Exercises

## Exercise 1 (Gradebook Revisited)

Given the provided `gradebook_df` and `students_df` variables representing a student roster and corresponding grades, write Python code to perform each of the following tasks...


A) Load the provided gradebook data and student data.

B) Add a column called "full_name" to the students data, where the full name is a concatenation of the student's first and last name. Remember to put a space character in between.


C) Add a column called "letter_grade" to the gradebook data, where the letter grade is a function of the "final_grade" column. Use the provided `calculate_grade()` function to calculate pass/fail letter grades.


D) Merge the student roster data with the gradebook data, on the basis of the common "student_id" column in each dataset. NOTE: in the final merged dataset, we should see the student ids, full names, final grades, and letter grades, etc.


E) Create a histogram of the numeric grades, using bins of your choosing.

In [None]:
gradebook_df = pd.read_csv("https://raw.githubusercontent.com/prof-rossetti/intro-to-python/main/data/gradebook.csv")
gradebook_df.head()

Unnamed: 0,student_id,final_grade
0,1,76.7
1,2,85.1
2,3,50.3
3,4,89.8
4,5,97.4


In [None]:
students_df = pd.DataFrame([
    {"student_id": 1, "first_name": "Anita", "last_name": "Johnson"},
    {"student_id": 2, "first_name": "James", "last_name": "Smith"},
    {"student_id": 3, "first_name": "Ankit", "last_name": "Shah"},
    {"student_id": 4, "first_name": "Jia", "last_name": "Yang"},
    {"student_id": 5, "first_name": "Howard", "last_name": "Anderson"},
    {"student_id": 6, "first_name": "George", "last_name": "Washington"},
    {"student_id": 7, "first_name": "Michael", "last_name": "Jordan"},
    {"student_id": 8, "first_name": "Serena", "last_name": "Williams"},
    {"student_id": 9, "first_name": "Maggie", "last_name": "Rogers"},
    {"student_id": 10, "first_name": "Ryan", "last_name": "Seacrest"},
    {"student_id": 11, "first_name": "Derek", "last_name": "Jeter"},
])
students_df.head()

Unnamed: 0,student_id,first_name,last_name
0,1,Anita,Johnson
1,2,James,Smith
2,3,Ankit,Shah
3,4,Jia,Yang
4,5,Howard,Anderson


In [None]:
# students_df.to_csv("students.csv", index=False)

### Column / Series Operations

In [None]:
gradebook_df["final_grade"] / 100.0

0    0.767
1    0.851
2    0.503
3    0.898
4    0.974
5    0.755
6    0.872
7    0.880
8    0.939
9    0.925
Name: final_grade, dtype: float64

In [None]:
students_df["full_name"] = students_df["first_name"] + " " + students_df["last_name"]
students_df.head()

Unnamed: 0,student_id,first_name,last_name,full_name
0,1,Anita,Johnson,Anita Johnson
1,2,James,Smith,James Smith
2,3,Ankit,Shah,Ankit Shah
3,4,Jia,Yang,Jia Yang
4,5,Howard,Anderson,Howard Anderson


### Function Application

In [None]:
def calculate_grade(score):
    """
        Converts a numeric score into a pass/fail letter grade.

        Param score (int or float) like 76.7

        Returns (str) "Pass" when the score is at least 75.0, otherwise "Fail"
    """
    return "Pass" if score >= 75.0 else "Fail"


gradebook_df["letter_grade"] = gradebook_df["final_grade"].apply(calculate_grade)

gradebook_df.head()


Unnamed: 0,student_id,final_grade,letter_grade
0,1,76.7,Pass
1,2,85.1,Pass
2,3,50.3,Fail
3,4,89.8,Pass
4,5,97.4,Pass


### Merging / Joining Datasets

In [None]:
# merging on the shared "student_id" column (a left join keeps every student on the roster):

merged_df = pd.merge(students_df, gradebook_df, how="left", on="student_id")

merged_df.head()

Unnamed: 0,student_id,first_name,last_name,full_name,final_grade,letter_grade
0,1,Anita,Johnson,Anita Johnson,76.7,Pass
1,2,James,Smith,James Smith,85.1,Pass
2,3,Ankit,Shah,Ankit Shah,50.3,Fail
3,4,Jia,Yang,Jia Yang,89.8,Pass
4,5,Howard,Anderson,Howard Anderson,97.4,Pass


In [None]:
## creating index column for good measure
#students_df.index = students_df["student_id"]
#gradebook_df.index = gradebook_df["student_id"]
#
## merging on index column:
#merged_df = students_df.merge(gradebook_df, how="left", 
#                        left_index=True, right_index=True)
#
#merged_df.head() # note the student_id_x and student_id_y extra cols we might need to clean up later

### Filtering Columns

In [None]:
print(merged_df.columns.tolist())

# selecting / viewing just a subset of the columns 
merged_df[["full_name", "final_grade", "letter_grade"]].head()

['student_id', 'first_name', 'last_name', 'full_name', 'final_grade', 'letter_grade']


Unnamed: 0,full_name,final_grade,letter_grade
0,Anita Johnson,76.7,Pass
1,James Smith,85.1,Pass
2,Ankit Shah,50.3,Fail
3,Jia Yang,89.8,Pass
4,Howard Anderson,97.4,Pass


### Filtering Rows

In [None]:
# selecting / viewing just a subset of the rows (students with a "Fail" letter grade)

merged_df[merged_df["letter_grade"] == "Fail"]

Unnamed: 0,student_id,first_name,last_name,full_name,final_grade,letter_grade
2,3,Ankit,Shah,Ankit Shah,50.3,Fail


In [None]:
# selecting / viewing just a subset of the rows (final grade of at least 90)

merged_df[merged_df["final_grade"] >= 90.0]

Unnamed: 0,student_id,first_name,last_name,full_name,final_grade,letter_grade
4,5,Howard,Anderson,Howard Anderson,97.4,Pass
8,9,Maggie,Rogers,Maggie Rogers,93.9,Pass
9,10,Ryan,Seacrest,Ryan Seacrest,92.5,Pass


In [None]:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.between.html

merged_df[merged_df["final_grade"].between(90, 95)]

Unnamed: 0,student_id,first_name,last_name,full_name,final_grade,letter_grade
8,9,Maggie,Rogers,Maggie Rogers,93.9,Pass
9,10,Ryan,Seacrest,Ryan Seacrest,92.5,Pass


In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html

selected_last_names = ["Jordan", "Rogers"]
merged_df[merged_df["last_name"].isin(selected_last_names)]

Unnamed: 0,student_id,first_name,last_name,full_name,final_grade,letter_grade
6,7,Michael,Jordan,Michael Jordan,87.2,Pass
8,9,Maggie,Rogers,Maggie Rogers,93.9,Pass


In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html

selected_ids = [1, 5, 7]

# filter on the "student_id" column rather than the index: merged_df was merged
# on the column, so its index is just the default row position (0, 1, 2, ...)
# and matching the ids against it would select the wrong students
merged_df[merged_df["student_id"].isin(selected_ids)]

Unnamed: 0,student_id,first_name,last_name,full_name,final_grade,letter_grade
1,2,James,Smith,James Smith,85.1,Pass
5,6,George,Washington,George Washington,75.5,Pass
7,8,Serena,Williams,Serena Williams,88.0,Pass


### Dropping Columns

In [None]:
print(merged_df.columns.tolist())

merged_df["other_col"] = 5 # just making a column so we can drop it later

print(merged_df.columns.tolist())

# errors="ignore" skips any listed columns that are not present (here only "other_col" exists)
merged_df.drop(columns=["student_id_x", "student_id_y", "other_col"], inplace=True, errors="ignore")

print(merged_df.columns.tolist())

merged_df.head()

['student_id', 'first_name', 'last_name', 'full_name', 'final_grade', 'letter_grade']
['student_id', 'first_name', 'last_name', 'full_name', 'final_grade', 'letter_grade', 'other_col']
['student_id', 'first_name', 'last_name', 'full_name', 'final_grade', 'letter_grade']


Unnamed: 0,student_id,first_name,last_name,full_name,final_grade,letter_grade
0,1,Anita,Johnson,Anita Johnson,76.7,Pass
1,2,James,Smith,James Smith,85.1,Pass
2,3,Ankit,Shah,Ankit Shah,50.3,Fail
3,4,Jia,Yang,Jia Yang,89.8,Pass
4,5,Howard,Anderson,Howard Anderson,97.4,Pass


### Sorting Rows

In [None]:
merged_df.sort_values(by=["final_grade", "last_name"], ascending=False, inplace=True)
merged_df.head()

Unnamed: 0,student_id,first_name,last_name,full_name,final_grade,letter_grade
4,5,Howard,Anderson,Howard Anderson,97.4,Pass
8,9,Maggie,Rogers,Maggie Rogers,93.9,Pass
9,10,Ryan,Seacrest,Ryan Seacrest,92.5,Pass
3,4,Jia,Yang,Jia Yang,89.8,Pass
7,8,Serena,Williams,Serena Williams,88.0,Pass


### Dataviz (Histogram)

In [None]:
# https://plotly.github.io/plotly.py-docs/generated/plotly.express.histogram.html
fig = px.histogram(x=merged_df["final_grade"], nbins=10)
fig.show()

## Exercise 2 (Monthly Sales)


Given the provided `sales_df` variable representing monthly retail sales, write Python code to read the CSV data and perform each of the following tasks...


A) What is the **structure** of this data? In other words, we have a "row per what"? Describe your answer in words.


B) How many **unique products** are sold (i.e. `7`), and what are their names? Print the list of unique products in alphabetical order (i.e. `['Baseball Cap', 'Brown Boots', 'Button-Down Shirt', 'Khaki Pants', 'Sticker Pack', 'Super Soft Hoodie', 'Vintage Logo Tee']`). Also print the unique dates, and count how many unique dates there are.

C) Print the **total monthly sales**, formatted as USD (i.e. `"$12,000.71"`).


D) Calculate the **total sales for each day**, and create a bar or line chart depicting the sales over time. Optionally, also print which five dates have the most sales.

> ![](https://user-images.githubusercontent.com/1328807/211162483-1418bd1c-7e43-42bc-b2c4-b5c24e242f0a.png)


E) Determine the **total sales for each product**, and create a horizontal bar chart to show the top selling products, with the bars sorted in descending order of their length.


> ![](https://user-images.githubusercontent.com/1328807/211162481-07593a51-57f9-4bfd-ab14-f0878d8bc960.png)


F) Create another column in the dataset that represents the day of the week (i.e. Monday, Tuesday, etc.). Group the data by day of week and aggregate the sum of **total sales on each day of week**. Do there appear to be any cyclical trends / weekly patterns? Which days of week have the highest sales on average? Which days of the week have lowest sales on average?

> NOTE: we will revisit seasonality analysis in more detail next unit!



In [None]:
month = "201803"
sales_df = pd.read_csv(f"https://raw.githubusercontent.com/prof-rossetti/data-analytics-in-python/main/data/unit-2/monthly-sales/sales-{month}.csv")
sales_df.head()

Unnamed: 0,date,product,unit price,units sold,sales price
0,2018-03-01,Button-Down Shirt,65.05,2,130.1
1,2018-03-01,Vintage Logo Tee,15.95,1,15.95
2,2018-03-01,Sticker Pack,4.5,1,4.5
3,2018-03-02,Super Soft Hoodie,75.0,2,150.0
4,2018-03-02,Button-Down Shirt,65.05,7,455.35


In [None]:
print("TOTAL SALES:", to_usd(sales_df["sales price"].sum()))

TOTAL SALES: $12,000.71


In [None]:
days = sales_df["date"].unique()

print(len(days))
print(days)

31
['2018-03-01' '2018-03-02' '2018-03-03' '2018-03-04' '2018-03-05'
 '2018-03-06' '2018-03-07' '2018-03-08' '2018-03-09' '2018-03-10'
 '2018-03-11' '2018-03-12' '2018-03-13' '2018-03-14' '2018-03-15'
 '2018-03-16' '2018-03-17' '2018-03-18' '2018-03-19' '2018-03-20'
 '2018-03-21' '2018-03-22' '2018-03-23' '2018-03-24' '2018-03-25'
 '2018-03-26' '2018-03-27' '2018-03-28' '2018-03-29' '2018-03-30'
 '2018-03-31']


### Grouping and Aggregation

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

sales_by_product = sales_df.groupby("product")["sales price"].sum()
print(type(sales_by_product))
sales_by_product

<class 'pandas.core.series.Series'>


product
Baseball Cap          156.31
Brown Boots           250.00
Button-Down Shirt    6960.35
Khaki Pants          1602.00
Sticker Pack          216.00
Super Soft Hoodie    1875.00
Vintage Logo Tee      941.05
Name: sales price, dtype: float64

In [None]:
# we want a row per date, and to sum the sales price

sales_by_date = sales_df.groupby("date")["sales price"].sum()
print(type(sales_by_date))
sales_by_date

<class 'pandas.core.series.Series'>


date
2018-03-01     150.55
2018-03-02     675.53
2018-03-03     902.63
2018-03-04     209.28
2018-03-05     177.50
2018-03-06     117.40
2018-03-07     371.60
2018-03-08      69.55
2018-03-09     340.38
2018-03-10     765.20
2018-03-11    1067.48
2018-03-12     383.05
2018-03-13     252.00
2018-03-14     449.15
2018-03-15      24.95
2018-03-16     290.90
2018-03-17     641.18
2018-03-18     739.25
2018-03-19     199.45
2018-03-20     327.00
2018-03-21     306.55
2018-03-22     215.60
2018-03-23     495.70
2018-03-24     610.40
2018-03-25     259.40
2018-03-26     228.10
2018-03-27     295.10
2018-03-28     117.40
2018-03-29     298.60
2018-03-30     510.33
2018-03-31     509.50
Name: sales price, dtype: float64

### Pivot Tables

In [None]:
# we want a row per date, and to sum the sales price
# NOTE: the aggregation is passed by name ("sum") rather than as the np.sum callable,
# ... because newer pandas versions emit a FutureWarning for numpy callables here
dates_pivot = (
    pd.pivot_table(sales_df, index=["date"],
                   values=["sales price"],
                   aggfunc={"sales price": "sum"})
    .rename(columns={"sales price": "sales_total"})
    .sort_values(by=["sales_total"], ascending=False) # top-selling dates first
)

print(type(dates_pivot))
dates_pivot.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,sales_total
date,Unnamed: 1_level_1
2018-03-11,1067.48
2018-03-03,902.63
2018-03-10,765.2
2018-03-18,739.25
2018-03-02,675.53


In [None]:
# we want a row per date, and to sum the sales price
# let's sum the units sold for good measure as well!
#dates_pivot = pd.pivot_table(sales_df, index=["date"],
#                          values=["sales price", "units sold"],
#                          aggfunc={"sales price": np.sum,
#                                   "units sold": np.sum
#                                   } # designate the agg function to be used for each original column. can use our own custom functions here as well
#                        )
#
#print(type(dates_pivot))
#dates_pivot.rename(columns={"sales price": "sales_total", "units sold": "units_sold"}, inplace=True)
#dates_pivot.sort_values(by=["sales_total"], ascending=False, inplace=True)
#dates_pivot.head()

In [None]:
# we want a row per product, and to sum the sales price
# let's sum the units sold for good measure as well!
# NOTE: the aggregations are passed by name ("sum") rather than as the np.sum callable,
# ... because newer pandas versions emit a FutureWarning for numpy callables here
products_pivot = (
    pd.pivot_table(sales_df, index=["product"],
                   values=["sales price", "units sold"],
                   aggfunc={"sales price": "sum",
                            "units sold": "sum"})
    .rename(columns={"sales price": "sales_total", "units sold": "units_sold"})
    .sort_values(by=["sales_total"], ascending=False) # top-selling products first
)

print(type(products_pivot))
products_pivot.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,sales_total,units_sold
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Button-Down Shirt,6960.35,107
Super Soft Hoodie,1875.0,25
Khaki Pants,1602.0,18
Vintage Logo Tee,941.05,59
Brown Boots,250.0,2


### Dataviz (Bar)

In [None]:

# put the pivot back in date order, and move the "date" index into a regular column
# for charting purposes (reset_index avoids having "date" as both the index name
# and a column label, which pandas treats as ambiguous in later sort / group operations)
chart_df = dates_pivot.sort_index(ascending=True).reset_index()

px.bar(chart_df, x="date", y="sales_total", title="Sales by Day (March 2018)")


### Dataviz (Horizontal Bar)

In [None]:

# sorting in inverse (ascending) order to get the bars to show up in the right order when charting,
# and moving the "product" index into a regular column for charting purposes
chart_df = products_pivot.sort_values(by=["sales_total"], ascending=True).reset_index()

px.bar(chart_df, y="product", x="sales_total", orientation="h", title="Top Selling Products (March 2018)")


### Aggregations for Basic Cycle / Seasonality Analysis

In [None]:
sales_df.dtypes

date            object
product         object
unit price     float64
units sold       int64
sales price    float64
dtype: object

In [None]:
sales_df["date"] = pd.to_datetime(sales_df["date"])
sales_df.dtypes

date           datetime64[ns]
product                object
unit price            float64
units sold              int64
sales price           float64
dtype: object

In [None]:
# https://docs.python.org/3/library/datetime.html
# https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
# https://docs.python.org/3/library/datetime.html#datetime.datetime.weekday

from datetime import datetime

example_dt = datetime(2030,12,31)
print(type(example_dt))
print(example_dt)

<class 'datetime.datetime'>
2030-12-31 00:00:00


In [None]:
print(example_dt.year)
print(example_dt.month)
print(example_dt.day)
print(example_dt.weekday())
print(example_dt.date())
print(example_dt.strftime("%Y-%m-%d"))

2030
12
31
1
2030-12-31
2030-12-31


In [None]:
def get_year(my_dt):
    """Returns the year of the given datetime (like 2018)"""
    year = my_dt.year
    return year

def get_month(my_dt):
    """Returns the numeric month of the given datetime (like 3 for March)"""
    month = my_dt.month
    return month

def get_weekday(my_dt):
    """Returns the numeric day of the week of the given datetime (0 is Monday)"""
    weekday_number = my_dt.weekday()
    return weekday_number


sales_df["year"] = sales_df["date"].apply(get_year)
sales_df["month"] = sales_df["date"].apply(get_month)
sales_df["weekday"] = sales_df["date"].apply(get_weekday)

sales_df[["date", "year", "month", "weekday"]]

Unnamed: 0,date,year,month,weekday
0,2018-03-01,2018,3,3
1,2018-03-01,2018,3,3
2,2018-03-01,2018,3,3
3,2018-03-02,2018,3,4
4,2018-03-02,2018,3,4
...,...,...,...,...
112,2018-03-31,2018,3,5
113,2018-03-31,2018,3,5
114,2018-03-31,2018,3,5
115,2018-03-31,2018,3,5


In [None]:
# alternative approach:

sales_df["year"] = sales_df["date"].dt.year
sales_df["month"] = sales_df["date"].dt.month
sales_df["weekday"] = sales_df["date"].dt.weekday

sales_df[["date", "year", "month", "weekday"]]

Unnamed: 0,date,year,month,weekday
0,2018-03-01,2018,3,3
1,2018-03-01,2018,3,3
2,2018-03-01,2018,3,3
3,2018-03-02,2018,3,4
4,2018-03-02,2018,3,4
...,...,...,...,...
112,2018-03-31,2018,3,5
113,2018-03-31,2018,3,5
114,2018-03-31,2018,3,5
115,2018-03-31,2018,3,5


In [None]:
sales_df.groupby("weekday")["sales price"].sum()

weekday
0     988.10
1     991.50
2    1244.70
3     759.25
4    2312.84
5    3428.91
6    2275.41
Name: sales price, dtype: float64

In [None]:
# NOTE: sales_df has several rows per date (one per product line), so this is the
# mean sale amount per row on each weekday, not the mean of the daily sales totals
sales_df.groupby("weekday")["sales price"].mean()

weekday
0     76.007692
1     70.821429
2     77.793750
3     54.232143
4    115.642000
5    155.859545
6    126.411667
Name: sales price, dtype: float64

## Exercise 3 (Example GDP Data)

In [None]:
example_df = pd.DataFrame([
    {"year": 1990, "gdp": 100},
    {"year": 1991, "gdp": 105},
    {"year": 1992, "gdp": 110},
    {"year": 1993, "gdp": 115},
    {"year": 1994, "gdp": 110}

])
example_df.head()

Unnamed: 0,year,gdp
0,1990,100
1,1991,105
2,1992,110
3,1993,115
4,1994,110


### Shift, Growth, Percent Change

In [None]:
# it is already sorted, but here we are sorting by year for good measure
example_df.sort_values(by=["year"], ascending=True, inplace=True)
example_df.head()

Unnamed: 0,year,gdp
0,1990,100
1,1991,105
2,1992,110
3,1993,115
4,1994,110


In [None]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html

example_df["gdp"].shift(periods=1) # 1 or -1 depending on order

0      NaN
1    100.0
2    105.0
3    110.0
4    115.0
Name: gdp, dtype: float64

In [None]:
# our own calculation of growth / percent change, using current and previous values:
example_df["gdp_prev"] = example_df["gdp"].shift(periods=1)
example_df["gdp_growth"] = (example_df["gdp"] - example_df["gdp_prev"]) / example_df["gdp_prev"] 

example_df[["year", "gdp", "gdp_growth"]]

Unnamed: 0,year,gdp,gdp_growth
0,1990,100,
1,1991,105,0.05
2,1992,110,0.047619
3,1993,115,0.045455
4,1994,110,-0.043478


In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.pct_change.html

# equivalent, leveraging the pct_change method:
example_df["gdp_growth"] = example_df["gdp"].pct_change(periods=1)

example_df[["year", "gdp", "gdp_growth"]]

Unnamed: 0,year,gdp,gdp_growth
0,1990,100,
1,1991,105,0.05
2,1992,110,0.047619
3,1993,115,0.045455
4,1994,110,-0.043478


With nice percent sign formatting, as requested by students in Wednesday night's class:

In [None]:
example_df["gdp_growth_pct"] = example_df["gdp_growth"].apply(to_pct)
example_df

Unnamed: 0,year,gdp,gdp_prev,gdp_growth,gdp_growth_pct
0,1990,100,,,nan%
1,1991,105,100.0,0.05,5.00%
2,1992,110,105.0,0.047619,4.76%
3,1993,115,110.0,0.045455,4.55%
4,1994,110,115.0,-0.043478,-4.35%


### Cumulative Growth

This approach uses the `cumprod()` / cumulative product function, but FYI the assignment will ask you to use the `product()` function to calculate cumulative growth for a particular period.

In [None]:
example_df["gdp_growth"] + 1 

0         NaN
1    1.050000
2    1.047619
3    1.045455
4    0.956522
Name: gdp_growth, dtype: float64

In [None]:
# loc[row_index, column_name]
example_df.loc[0, "gdp_growth"]

nan

In [None]:
# overwriting initial NaN value to make our math work later
example_df.loc[0, "gdp_growth"] = 0

example_df

Unnamed: 0,year,gdp,gdp_prev,gdp_growth
0,1990,100,,0.0
1,1991,105,100.0,0.05
2,1992,110,105.0,0.047619
3,1993,115,110.0,0.045455
4,1994,110,115.0,-0.043478


In [None]:
example_df["gdp_growth"] + 1

0    1.000000
1    1.050000
2    1.047619
3    1.045455
4    0.956522
Name: gdp_growth, dtype: float64

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.cumprod.html
# see also: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.product.html

example_df["cumulative_growth"] = (example_df["gdp_growth"] + 1).cumprod()

example_df

Unnamed: 0,year,gdp,gdp_prev,gdp_growth,cumulative_growth
0,1990,100,,0.0,1.0
1,1991,105,100.0,0.05,1.05
2,1992,110,105.0,0.047619,1.1
3,1993,115,110.0,0.045455,1.15
4,1994,110,115.0,-0.043478,1.1


## Exercise 4 (Realtime Stock Data)

In [None]:
#
# SETUP CELL (RUN AND DO NOT MODIFY)
#

from getpass import getpass

API_KEY = getpass("Please input your Alphavantage API KEY: ") 

Please input your Alphavantage API KEY: ··········


In [None]:
import pandas as pd
import plotly.express as px

def fetch_stock_data(symbol="NFLX"):
    """
        Fetches the full daily adjusted price history for a given stock from the Alphavantage API,
        displays a line chart of the adjusted closing prices, and returns the data.

        Reads the API key from the global API_KEY variable.

        Param: symbol (str) the stock ticker symbol, like "NFLX"

        Raises: ValueError if the response lacks the "timestamp" or "adjusted_close" columns
            (presumably because the API responded with a message instead of price data,
            e.g. for an invalid key or symbol -- confirm against the API docs).

        Returns: (pandas.DataFrame) the parsed CSV response, including the
            "timestamp" and "adjusted_close" columns.
    """
    from urllib.parse import urlencode

    print("---------------------")
    print("FETCHING DATA...")
    print("SYMBOL:", symbol)

    # url-encode the parameters, so symbols with special characters produce a valid request
    query_string = urlencode({
        "function": "TIME_SERIES_DAILY_ADJUSTED",
        "symbol": symbol,
        "apikey": API_KEY,
        "datatype": "csv",
        "outputsize": "full",
    })
    request_url = f"https://www.alphavantage.co/query?{query_string}"
    df = pd.read_csv(request_url)

    # fail early with a readable message, instead of an opaque charting error below
    expected_columns = ["timestamp", "adjusted_close"]
    missing_columns = [col for col in expected_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(
            f"Unexpected response when fetching data for '{symbol}' "
            f"(missing columns: {missing_columns}). The API may have returned a message "
            f"instead of price data:\n{df.head().to_string()}"
        )

    fig = px.line(df, x="timestamp", y="adjusted_close", title=f"Adjusted Closing Price ({symbol})")
    fig.show()

    return df


In [None]:
nflx_df = fetch_stock_data()
print(nflx_df.head())

---------------------
FETCHING DATA...
SYMBOL: NFLX


    timestamp    open    high      low   close  adjusted_close    volume  \
0  2023-01-11  326.50  328.88  321.350  327.26          327.26   9562739   
1  2023-01-10  311.07  329.35  311.070  327.54          327.54  13072286   
2  2023-01-09  316.83  321.70  313.220  315.17          315.17   6766628   
3  2023-01-06  311.57  316.77  303.690  315.55          315.55   8959839   
4  2023-01-05  307.00  314.18  304.545  309.70          309.70   8328390   

   dividend_amount  split_coefficient  
0              0.0                1.0  
1              0.0                1.0  
2              0.0                1.0  
3              0.0                1.0  
4              0.0                1.0  


### Dataviz (Line)

In [None]:
px.line(nflx_df, x="timestamp", y="adjusted_close", title="NFLX Adjusted Close")

### Dataviz (Scatter w/ Trendlines)

In [None]:
#px.scatter(nflx_df, x="timestamp", y="adjusted_close", title="NFLX Adjusted Close",
#    trendline="ols"
#)

In [None]:
nflx_df.dtypes

timestamp             object
open                 float64
high                 float64
low                  float64
close                float64
adjusted_close       float64
volume                 int64
dividend_amount      float64
split_coefficient    float64
dtype: object

In [None]:
nflx_df["timestamp"] = pd.to_datetime(nflx_df["timestamp"])
nflx_df.dtypes

timestamp            datetime64[ns]
open                        float64
high                        float64
low                         float64
close                       float64
adjusted_close              float64
volume                        int64
dividend_amount             float64
split_coefficient           float64
dtype: object

In [None]:
px.scatter(nflx_df, x="timestamp", y="adjusted_close", title="NFLX Adjusted Close",
    trendline="ols", trendline_color_override="red",
)

In [None]:
px.scatter(nflx_df, x="timestamp", y="adjusted_close", title="NFLX Adjusted Close",
    trendline="lowess", trendline_color_override="red",
)

### Moving Averages

In [None]:
# we must remember to sort before using a shift-based method
nflx_df.sort_values(by=["timestamp"], ascending=True, inplace=True)

# moving average:
# ... https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html#pandas.DataFrame.rolling
nflx_df['ma_50'] = nflx_df['adjusted_close'].rolling(window=50).mean()

# exponential moving average:
# ... https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html
nflx_df['ema_50'] = nflx_df['adjusted_close'].ewm(span=50, min_periods=0, adjust=False, ignore_na=False).mean()

nflx_df[["timestamp", "adjusted_close", "ma_50", "ema_50"]]

Unnamed: 0,timestamp,adjusted_close,ma_50,ema_50
5195,2002-05-23,1.196427,,1.196427
5194,2002-05-24,1.209999,,1.196960
5193,2002-05-28,1.157142,,1.195398
5192,2002-05-29,1.103570,,1.191797
5191,2002-05-30,1.071427,,1.187077
...,...,...,...,...
4,2023-01-05,309.700000,293.5547,288.423142
3,2023-01-06,315.550000,294.0453,289.486940
2,2023-01-09,315.170000,294.3763,290.494119
1,2023-01-10,327.540000,294.9883,291.946899


In [None]:
chart_df = nflx_df[nflx_df["timestamp"] >= "2019-01-01"]

px.line(chart_df, x="timestamp", y=["adjusted_close", "ma_50", "ema_50"], 
        title="NFLX Adjusted Close w/ Moving Averages"
)