In [43]:
import pandas as pd
import numpy as np


In [45]:
df = pd.read_csv('workshops.csv')
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings
0,2021.0,,,,,
1,,June,,,,
2,,,1.0,3.0,gRPC in Go,"$33,019"
3,,,7.0,10.0,Optimizing Python,"$42,238"
4,,,28.0,30.0,python Foundations,"$24,372"
5,,July,,,,
6,,,5.0,8.0,go concurrency,"$46,382"
7,,,21.0,22.0,Writing Secure Go,"$27,038"


### Fill Year & Month
"""
Fix the data frame. At the end, row should have the following columns:
- start: pd.Timestemap
- end: pd.Timestamp
- name: str
- topic: str (python or go)
- earnings: np.float64
"""

In [27]:
df['Year'].fillna(method='ffill', inplace=True)
df['Month'].fillna(method='ffill', inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Year'].fillna(method='ffill', inplace=True)
  df['Year'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Month'].fillna(method='ffill', inplace=True)
  df['Month'].fillna(method='ffill', inplace=True)


Unnamed: 0,Year,Month,Start,End,Name,Earnings
0,2021.0,,,,,
1,2021.0,June,,,,
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019"
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238"
4,2021.0,June,28.0,30.0,python Foundations,"$24,372"
5,2021.0,July,,,,
6,2021.0,July,5.0,8.0,go concurrency,"$46,382"
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038"


In [29]:
df = df[pd.notnull(df['Earnings'])].copy()
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019"
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238"
4,2021.0,June,28.0,30.0,python Foundations,"$24,372"
6,2021.0,July,5.0,8.0,go concurrency,"$46,382"
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038"


In [31]:
def as_date(row, col):
    year = int(row['Year'])
    month = row['Month']
    day = int(row[col])
    ts = f'{month} {day}, {year}'
    return pd.to_datetime(ts, format='%B %d, %Y')


In [33]:
df['start'] = df.apply(as_date, axis=1, args=('Start',))
df['end'] = df.apply(as_date, axis=1, args=('End',))
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings,start,end
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019",2021-06-01,2021-06-03
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238",2021-06-07,2021-06-10
4,2021.0,June,28.0,30.0,python Foundations,"$24,372",2021-06-28,2021-06-30
6,2021.0,July,5.0,8.0,go concurrency,"$46,382",2021-07-05,2021-07-08
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038",2021-07-21,2021-07-22


In [37]:
# Extract topic
def topic(name):
    if 'go' in name:
        return 'go'
    if 'python' in name:
        return 'python'

In [39]:
df['topic'] = df['Name'].str.lower().apply(topic)
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings,start,end,topic
2,2021.0,June,1.0,3.0,gRPC in Go,"$33,019",2021-06-01,2021-06-03,go
3,2021.0,June,7.0,10.0,Optimizing Python,"$42,238",2021-06-07,2021-06-10,python
4,2021.0,June,28.0,30.0,python Foundations,"$24,372",2021-06-28,2021-06-30,python
6,2021.0,July,5.0,8.0,go concurrency,"$46,382",2021-07-05,2021-07-08,go
7,2021.0,July,21.0,22.0,Writing Secure Go,"$27,038",2021-07-21,2021-07-22,go


In [49]:
# df['earnings'] = pd.to_numeric(
#     df['Earnings'].str.replace(r'[$,]', '')
# ).astype(np.float64)
# df

# Remove '$' and ',' from the 'Earnings' column and then convert to numeric
df['earnings'] = pd.to_numeric(
    df['Earnings'].str.replace(r'[$,]', '', regex=True)  # Remove '$' and ','
).astype(np.float64)  # Convert the cleaned string to float
df

Unnamed: 0,Year,Month,Start,End,Name,Earnings,earnings
0,2021.0,,,,,,
1,,June,,,,,
2,,,1.0,3.0,gRPC in Go,"$33,019",33019.0
3,,,7.0,10.0,Optimizing Python,"$42,238",42238.0
4,,,28.0,30.0,python Foundations,"$24,372",24372.0
5,,July,,,,,
6,,,5.0,8.0,go concurrency,"$46,382",46382.0
7,,,21.0,22.0,Writing Secure Go,"$27,038",27038.0


In [51]:
# Cleanup
df = df[['start', 'end', 'Name', 'topic', 'earnings']]

KeyError: "['start', 'end', 'topic'] not in index"

In [None]:
df.rename(columns={'Name': 'name'}, inplace=True)
df