In [1]:
import pandas as pd
from datetime import datetime

# Data Frame - Creation with a Dictionary
When using a dictionary to create a dataframe, each key becomes a column name, and the list assigned to that key becomes the column's data.

In [2]:
# Each key is a column name; each list holds that column's values, row by row.
european_cities = {
    "cities": ["Istanbul", "Moscow", "Paris", "London", "Madrid"],
    "population": [15_000_000, 12_000_000, 11_000_000, 9_000_000, 6_000_000],
}
df = pd.DataFrame(data=european_cities)
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# Data Frame - Creation from a list of lists

In [3]:
# Each inner list is one row; the column names are supplied separately.
european_cities_list = [
    ["Istanbul", 15_000_000],
    ["Moscow", 12_000_000],
    ["Paris", 11_000_000],
    ["London", 9_000_000],
    ["Madrid", 6_000_000],
]
df = pd.DataFrame(
    data=european_cities_list,
    columns=["cities", "population"],
)
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# Data Frame - Creation from a list of lists - with unique Index
You can set up the dataframe with your own unique index labels instead of the default auto-incremented numbers.

In [4]:
# One single-value row per city; the city names are used as the row labels.
df_unq_index = pd.DataFrame(
    data=[[15_000_000], [12_000_000], [11_000_000]],
    columns=['population'],
    index=['Istanbul', 'Moscow', 'Paris'],
)
df_unq_index

Unnamed: 0,population
Istanbul,15000000
Moscow,12000000
Paris,11000000


# Data Frame - Creation from a CSV or Excel spreadsheet

- See below for more options working with Excel using openpyxl.
- See 'Drop an Index from a dataframe when exporting' to drop an index when exporting to CSV or XLSX files.

In [5]:
# Save the population dataframe to a csv.
# NOTE: no index=False is passed, so to_csv() also writes the index as an
# extra, unnamed first column.
df.to_csv('populations.csv')
# OPEN from a csv file.
# The unnamed index column from the file is read back as an ordinary column
# named 'Unnamed: 0', so df now carries one more column than before.
df = pd.read_csv('populations.csv')
df

Unnamed: 0.1,Unnamed: 0,cities,population
0,0,Istanbul,15000000
1,1,Moscow,12000000
2,2,Paris,11000000
3,3,London,9000000
4,4,Madrid,6000000


# Data Types - exploring column data types

- Strings are classed as 'object'

In [6]:
# One row per product; mixed Python types let pandas infer a dtype per column.
df_food = pd.DataFrame(
    data=[
        ["Chicken", 3, 14.5, False, datetime(2022, 2, 15)],
        ["Carrots", 2, 23.5, True, datetime(2022, 2, 15)],
        ["Bread", 1, 18, True, datetime(2022, 2, 15)],
    ],
    columns=["Product", "Price", "Discount", "In stock", "Next delivery date"],
)
df_food.dtypes

Product                       object
Price                          int64
Discount                     float64
In stock                        bool
Next delivery date    datetime64[ns]
dtype: object

# Columns - Remove an unnecessary column

- See 'Index' below for how to drop an index when exporting to csv or xlsx files.

In [7]:
# Remove the leftover 'Unnamed: 0' column; `del` modifies df in place.
del df['Unnamed: 0']
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# Sort - apply a sort to existing dataframe

In [8]:
# sort_values() returns a new, sorted dataframe; assign it back to df rather
# than mutating with inplace=True, so the result can also be method-chained.
df = df.sort_values(by=["population"], ascending=True)
df

Unnamed: 0,cities,population
4,Madrid,6000000
3,London,9000000
2,Paris,11000000
1,Moscow,12000000
0,Istanbul,15000000


# Index - reset index numbers after a sort (or any function)

In [9]:
# drop=True discards the old index labels instead of keeping them as a column.
# The renumbered frame is returned and assigned back rather than using inplace=True.
df = df.reset_index(drop=True)
df

Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000


# Index - Drop an Index from a dataframe when exporting
- Before saving a dataframe to another format such as an Excel spreadsheet, you may want to remove the index from the data frame.

In [10]:
df.to_csv('populations2.csv', index=False)

# Find a specific row - based on a search string

We want to find the row or rows where the cities column is "Paris".
- This will return a DataFrame

In [11]:
df[df["cities"] == "Paris"]

Unnamed: 0,cities,population
2,Paris,11000000


# Find rows based on filtering

In [56]:
df[df['population'] > 10000000]

Unnamed: 0,cities,population
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000


# Find a specific row - based on Index Number

Here you can find row data based on index number.

In [12]:
# Returns a <class 'pandas.core.series.Series'>
# .loc looks the row up by its index label (here the label 2), not by its position.
df.loc[2]

cities           Paris
population    11000000
Name: 2, dtype: object

# Find specific cell data - based on Index Number

In [13]:
# Select the row label and the column in a single .loc[row, column] call.
# This avoids chained indexing (.loc[2]["population"]), which first builds a
# whole intermediate row Series just to read one value from it.
df.loc[2, "population"]

11000000

# Find specific cell data - based on a search

- Here we find only the 'Sales Total' results for the 'Salesperson' named 'Paul'.
- The steps are broken down to show the type returned at each stage.

In [14]:
# One row per sale: [salesperson, sales total].
df_sp = pd.DataFrame(
    data=[
        ["Paul", 50],
        ["Stacy", 100],
        ["Paul", 80],
    ],
    columns=["Salesperson", "Sales Total"],
)
df_sp

Unnamed: 0,Salesperson,Sales Total
0,Paul,50
1,Stacy,100
2,Paul,80


In [15]:
# Get all index labels where "Salesperson" == "Paul"
paul_rows = df_sp[df_sp["Salesperson"] == "Paul"].index
print(type(paul_rows))
print(paul_rows)

# Pass the "paul_rows" labels AND the column name to a single 'df.loc[rows, column]'
# call to return a series. This avoids chained indexing (.loc[rows]["col"]),
# which builds an intermediate dataframe first.
paul_sales_totals = df_sp.loc[paul_rows, "Sales Total"]
print(type(paul_sales_totals))
print(paul_sales_totals)

# Convert series to a data frame
pd.DataFrame(paul_sales_totals)

<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([0, 2], dtype='int64')
<class 'pandas.core.series.Series'>
0    50
2    80
Name: Sales Total, dtype: int64


Unnamed: 0,Sales Total
0,50
2,80


# Add a row

In [16]:
# First get latest row number
# NOTE: len(df) is only an unused label while the index is the default
# 0..n-1 sequence; if that label already existed, the .loc assignment below
# would overwrite that row instead of adding a new one.
next_row = len(df) 
print(next_row)
# Create a new row and add data
df.loc[next_row] = ['Berlin',3700000]
df

5


Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000
5,Berlin,3700000


# Add a row - based on previous row data

- Here we take the previous 'Investment Total' figure.
- We round it down to the nearest hundred.
- We calculate 2% of the rounded number.
- We then add the 2% to the previous 'Investment Total' in a newly created row:

In [17]:
import math

# Function to round figure down to the nearest hundred
def round_down_to_hundred(total):
    """Round ``total`` down to the nearest multiple of 100.

    Parameters
    ----------
    total : int or float
        The figure to round down.

    Returns
    -------
    int
        The largest multiple of 100 that is less than or equal to ``total``.
        Any ``total`` below 100 (including negative values) returns 0.
    """
    if total < 100:
        return 0
    # Floor division is exact for integers of any size, whereas the float
    # result of ``total / 100`` loses precision on very large values.
    # Multiplying back by 100 replaces the old string trick of appending "00".
    return int(total // 100) * 100

#Test function works
round_down_to_hundred(23706.0)

23700

In [18]:
# New data frame
df_investment = pd.DataFrame([[45625],[45735]], columns=["Investment Total"])

# Get 'Investment Total' from the most recent (last) row.
# Selecting the column and then .iloc[-1] reads the last value by position in
# one step, instead of chaining .loc[row][column] through an intermediate row.
current_total = df_investment['Investment Total'].iloc[-1]

# Get next row number
next_row = len(df_investment)

# Create the next row adding 2% to the current total rounded down to the nearest hundred
df_investment.loc[next_row] = [current_total + round_down_to_hundred(current_total) * 0.02]

df_investment

Unnamed: 0,Investment Total
0,45625.0
1,45735.0
2,46649.0


# Update a cell - based on Index Number

In [19]:
# Update Madrid population and change it back.
df.at[0,'population'] = 5000000
print(df)
df.at[0,'population'] = 6000000
df

     cities  population
0    Madrid     5000000
1    London     9000000
2     Paris    11000000
3    Moscow    12000000
4  Istanbul    15000000
5    Berlin     3700000


Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000
5,Berlin,3700000


# Columns - adding a new Column

In [20]:
# Using df_sp (Salesperson sales totals from above)
# All columns will have the same data
df_sp['Example Column'] = 10
df_sp

Unnamed: 0,Salesperson,Sales Total,Example Column
0,Paul,50,10
1,Stacy,100,10
2,Paul,80,10


# Columns - Deleting a column

In [21]:
del df_sp['Example Column']
df_sp

Unnamed: 0,Salesperson,Sales Total
0,Paul,50
1,Stacy,100
2,Paul,80


# Columns - create a new column from a calculation

- Here we are going to create a 'Sales Quantity' column based on an assumption that the items being sold are valued at 5 each.

In [22]:
df_sp['Sales Quantity'] = df_sp['Sales Total'] / 5
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity
0,Paul,50,10.0
1,Stacy,100,20.0
2,Paul,80,16.0


# Columns - create a new column with the APPLY() method

- Here we create a new column based on applying a function to existing column data.

In [23]:
def divide_by_10(sales_total):
    """Return one tenth of ``sales_total`` (true division, so the result is a float)."""
    tenth = sales_total / 10
    return tenth
df_sp['New Sales Quantity'] = df_sp['Sales Total'].apply(divide_by_10)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity
0,Paul,50,10.0,5.0
1,Stacy,100,20.0,10.0
2,Paul,80,16.0,8.0


# Columns - change data type - float to integer

- We have some columns containing floats and some containing ints after previous column creation with calculations.

In [24]:
df_sp.dtypes

Salesperson            object
Sales Total             int64
Sales Quantity        float64
New Sales Quantity    float64
dtype: object

- We can change the type of columns.

In [28]:
df_sp['Sales Quantity'] = df_sp['Sales Quantity'].astype(int)
df_sp['New Sales Quantity'] = df_sp['New Sales Quantity'].astype(int)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity
0,Paul,50,10,5
1,Stacy,100,20,10
2,Paul,80,16,8


# Columns - create a new string column based on logic in APPLY() method

- Here we create a new string column based on sales performance.

In [35]:
def good_bad(sales_total, threshold=60):
    """Label a sales total as 'Good' or 'Bad'.

    Parameters
    ----------
    sales_total : number
        The sales figure to classify.
    threshold : number, default 60
        A total strictly greater than this is 'Good'; anything else is 'Bad'.

    Returns
    -------
    str
        'Good', 'Bad', or 'N/A' when ``sales_total`` cannot be compared with
        the threshold.
    """
    try:
        return 'Good' if sales_total > threshold else 'Bad'
    except (TypeError, ValueError):
        # TypeError: the value is not comparable with a number (e.g. None, str).
        # ValueError: the comparison has no single truth value (e.g. array-like).
        # Catching only these lets unrelated errors and KeyboardInterrupt
        # propagate, which a bare `except:` would have swallowed.
        return 'N/A'
    
df_sp['Performance'] = df_sp['Sales Total'].apply(good_bad)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity,Performance
0,Paul,50,10,5,Bad
1,Stacy,100,20,10,Good
2,Paul,80,16,8,Good


# Columns and Dates - create a year and month column from a date

In [44]:
df_sp['Date'] = datetime(2022,1,7)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity,Performance,Date
0,Paul,50,10,5,Bad,2022-01-07
1,Stacy,100,20,10,Good,2022-01-07
2,Paul,80,16,8,Good,2022-01-07


In [54]:
def return_month(x):
    """Return the month name of ``x`` via its ``month_name()`` method.

    Values that have no ``month_name`` method (e.g. None or a plain string)
    are returned unchanged.
    """
    try:
        return x.month_name()
    except AttributeError:
        # Only a missing method means "not a date"; any other error is a real
        # problem and should surface instead of being silently swallowed.
        return x
    
def return_year(x):
    """Return the ``year`` attribute of ``x``.

    Values that have no ``year`` attribute (e.g. None or a plain string) are
    returned unchanged.
    """
    try:
        return x.year
    except AttributeError:
        # Only a missing attribute means "not a date"; any other error is a
        # real problem and should surface instead of being silently swallowed.
        return x

df_sp['Month'] = df_sp['Date'].apply(return_month)
df_sp['Year'] = df_sp['Date'].apply(return_year)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity,Performance,Date,Year,Month
0,Paul,50,10,5,Bad,2022-01-07,2022,January
1,Stacy,100,20,10,Good,2022-01-07,2022,January
2,Paul,80,16,8,Good,2022-01-07,2022,January


# Data Frame - use the APPLY() method on an entire dataframe

- It's more common to use the apply method on individual columns but you can apply functions to an entire dataframe.
- NOTE! This doesn't change the existing dataframe - apply() returns a new one, so you must assign the result to a variable to keep it.

In [40]:
df_nums = pd.DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B'])
print(df_nums)
# apply() hands each column to the function in turn and returns a new
# dataframe, so the result is assigned back to df_nums.
df_nums = df_nums.apply(lambda column: column + 5)
df_nums

   A  B
0  1  2
1  3  4


Unnamed: 0,A,B
0,6,7
1,8,9


# NaN - convert missing numerical data to 0
- NaN means missing data
- Any blank cells from a spreadsheet will be returned as NaN

In [29]:
# Using df_sp from above (Salesperson and Sales Total columns)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity
0,Paul,50,10,5
1,Stacy,100,20,10
2,Paul,80,16,8
