In [12]:
from datetime import datetime

import numpy as np
import pandas as pd

In [2]:
# Salesperson totals, built column by column (Paul appears twice on purpose).
df_sp = pd.DataFrame(
    {
        "Salesperson": ["Paul", "Stacy", "Paul"],
        "Sales Total": [50, 100, 80],
    }
)
df_sp

Unnamed: 0,Salesperson,Sales Total
0,Paul,50
1,Stacy,100
2,Paul,80


# 1. Columns - adding a new Column

In [4]:
# Using df_sp (salesperson sales totals from above): the scalar 10 is
# broadcast to every row of the new column.
df_sp = df_sp.assign(**{"Example Column": 10})
df_sp

Unnamed: 0,Salesperson,Sales Total,Example Column
0,Paul,50,10
1,Stacy,100,10
2,Paul,80,10


# 2. Columns - Deleting a column

In [5]:
# drop() returns a frame without the column; rebind the name to keep the result.
df_sp = df_sp.drop(columns=["Example Column"])
df_sp

Unnamed: 0,Salesperson,Sales Total
0,Paul,50
1,Stacy,100
2,Paul,80


# 3. Columns - create a new column from a calculation

In [6]:
# Element-wise true division of the totals by 5; the result is a float column.
df_sp["Sales Quantity"] = df_sp["Sales Total"].div(5)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity
0,Paul,50,10.0
1,Stacy,100,20.0
2,Paul,80,16.0


# 4. Columns - create a new column with the APPLY() method
- Here we create a new column based on applying a function to existing column data.

In [7]:
def divide_by_10(sales_total):
    """Return ``sales_total`` divided by 10 using true division."""
    divisor = 10
    return sales_total / divisor
# Run divide_by_10 on each value of 'Sales Total' and store the results
# as a new column.
new_quantities = df_sp["Sales Total"].apply(divide_by_10)
df_sp["New Sales Quantity"] = new_quantities
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity
0,Paul,50,10.0,5.0
1,Stacy,100,20.0,10.0
2,Paul,80,16.0,8.0


# 5. Columns - change data type - FLOAT to INTEGER
- We have some columns containing floats and some containing ints after previous column creation with calculations.
- In this case, we know the result of each calculation will be a whole number, so we want to convert those columns from float to int.

In [8]:
# Show the dtype of every column in df_sp.
df_sp.dtypes

Salesperson            object
Sales Total             int64
Sales Quantity        float64
New Sales Quantity    float64
dtype: object

In [9]:
# Cast both quantity columns to int in one call; columns not named in the
# mapping keep their current dtype.
df_sp = df_sp.astype({"Sales Quantity": int, "New Sales Quantity": int})
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity
0,Paul,50,10,5
1,Stacy,100,20,10
2,Paul,80,16,8


# 6. Columns - create a new string column based on logic in APPLY() method

- Here we create a new string column based on sales performance.

In [10]:
def good_bad(sales_total):
    """Label a sales total as 'Good', 'Bad' or 'N/A'.

    Parameters
    ----------
    sales_total : scalar
        Value to classify, normally a number taken from a sales column.

    Returns
    -------
    str
        'Good' if ``sales_total`` is greater than 60, 'Bad' if it is a valid
        value of 60 or less, and 'N/A' if it is missing (None/NaN) or cannot
        be compared with a number.
    """
    try:
        # NaN > 60 is simply False, so without this check a missing value
        # would be labelled 'Bad' rather than 'N/A'.
        if pd.isna(sales_total):
            return 'N/A'
        return 'Good' if sales_total > 60 else 'Bad'
    except (TypeError, ValueError):
        # TypeError: the value cannot be compared with an int (e.g. a string).
        # ValueError: array-like value whose truth value is ambiguous.
        # Anything else (KeyboardInterrupt, SystemExit, ...) propagates.
        return 'N/A'
    
# Label every row by passing its 'Sales Total' through good_bad.
df_sp = df_sp.assign(Performance=df_sp["Sales Total"].apply(good_bad))
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity,Performance
0,Paul,50,10,5,Bad
1,Stacy,100,20,10,Good
2,Paul,80,16,8,Good


# 7. Columns and Dates - create a date column (year and month columns can then be derived from it)

In [13]:
# The same datetime (7 January 2022) is broadcast to every row.
fixed_date = datetime(year=2022, month=1, day=7)
df_sp["Date"] = fixed_date
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity,Performance,Date
0,Paul,50,10,5,Bad,2022-01-07
1,Stacy,100,20,10,Good,2022-01-07
2,Paul,80,16,8,Good,2022-01-07


# 8. Columns - change the names of your columns
- Often, if you've pulled the data from somewhere else to create a dataframe you might want to rename your columns.

In [14]:
# Small frame with placeholder column names "A" and "B".
df_change_cols = pd.DataFrame(
    {
        "A": ["Steve", "Jenny"],
        "B": [100, 120],
    }
)
df_change_cols

Unnamed: 0,A,B
0,Steve,100
1,Jenny,120


In [15]:
# Relabel the columns by position: first -> "Name", second -> "Test Score".
df_change_cols = df_change_cols.set_axis(["Name", "Test Score"], axis="columns")
df_change_cols

Unnamed: 0,Name,Test Score
0,Steve,100
1,Jenny,120


# 9. Data Frame - use the APPLY() method with a LAMBDA function on an entire dataframe

- It's more common to use the apply method on individual columns but you can apply functions to an entire dataframe.
- NOTE! `apply()` returns a new dataframe and doesn't change the existing one - so you must ASSIGN the result to a variable to keep it.

In [17]:
df_nums = pd.DataFrame({"A": [1, 3], "B": [2, 4]})
print(df_nums)  # plain-text view of the frame before the transformation

# apply() passes each column to the lambda and returns a new frame, so the
# result is assigned back to the same name.
df_nums = df_nums.apply(lambda column: column + 5)
df_nums

   A  B
0  1  2
1  3  4


Unnamed: 0,A,B
0,6,7
1,8,9


# 10. NaN - use ISNULL() to find null values

In [18]:
# One column that mixes strings, a missing value (NaN) and an integer.
# numpy is imported as np in the import cell at the top of the notebook.
df_find_nans = pd.DataFrame({"test_col": ['$26 M', '$32 B', np.nan, 0]})
df_find_nans

Unnamed: 0,test_col
0,$26 M
1,$32 B
2,
3,0


In [19]:
# Build a boolean mask that is True where test_col is null, then keep
# only those rows.
null_mask = df_find_nans["test_col"].isnull()
df_find_nans[null_mask]

Unnamed: 0,test_col
2,


# 11. NaN - use na=False to find NaN AND non-strings
- Here we want to check that every string contains a dollar sign plus M or B.
- We have to set `na=False` in the `str.contains` method so that NaN and any non-string values evaluate to False; they are then returned along with any strings that fail the test.

In [20]:
# str.contains treats its pattern as a regular expression by default, and the
# regex '$' means "end of string", which matches every string. regex=False
# makes it look for a literal dollar sign instead.
# na=False turns the result for NaN and non-string values into False, so those
# rows are selected by the negated mask together with strings lacking a '$'.
has_dollar = df_find_nans['test_col'].str.contains('$', regex=False, na=False)
df_find_nans[~has_dollar]

Unnamed: 0,test_col
2,
3,0.0


# 12. NaN - convert missing numerical data to a default value with FILLNA()
- NaN means missing data
- Any blank cells from a spreadsheet will be returned as NaN

In [21]:
# Column data is prepared first; the None in the salaries becomes NaN once
# the frame is built.
employees = ['Bob Jenkins', 'Jane Willis', 'Sally Turner', 'William Jones']
departments = ['Marketing', 'HR', 'IT', 'Marketing']
salaries = [39000, 48000, None, 39000]

df_nans = pd.DataFrame(
    {'employee': employees, 'department': departments, 'salary': salaries}
)
df_nans

Unnamed: 0,employee,department,salary
0,Bob Jenkins,Marketing,39000.0
1,Jane Willis,HR,48000.0
2,Sally Turner,IT,
3,William Jones,Marketing,39000.0


In [23]:
# Replace all NaNs in the salary column with 30000; the mapping restricts
# the fill to that one column.
df_nans = df_nans.fillna({'salary': 30000})
df_nans

Unnamed: 0,employee,department,salary
0,Bob Jenkins,Marketing,39000.0
1,Jane Willis,HR,48000.0
2,Sally Turner,IT,30000.0
3,William Jones,Marketing,39000.0
