In [1]:
import pandas as pd

In [2]:
european_cities_list = [
    ["Istanbul",15000000],
    ["Moscow",12000000],
    ["Paris",11000000],
    ["London",9000000],
    ["Madrid",6000000]
]
df = pd.DataFrame(european_cities_list, columns=["cities","population"])

# 1. Find a specific row - based on a search string

We want to find a row/ rows where the cities column is "Paris".
- This will return a DataFrame

In [3]:
df[df["cities"] == "Paris"]

Unnamed: 0,cities,population
2,Paris,11000000


# 2. Find rows based on str.contains (similar to SQL like)
- Here we want to find companies that have profit losses in billions (marked with a B).
- We can use str.contains to filter strings.

In [4]:
df_global_sales = pd.DataFrame(
    {
        "company":["CompA","CompB"],
        "sales":["$12.2 B","$94.2 M"],
        "profit":["$3.2 B","$-2.5 B"]
    }
)
df_global_sales[df_global_sales['profit'].str.contains('-') & df_global_sales['profit'].str.contains('B')]

Unnamed: 0,company,sales,profit
1,CompB,$94.2 M,$-2.5 B


# 3. Find rows based on filtering numbers (greater than)

In [5]:
df[df['population'] > 10000000]

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000


#  4. Find a specific row - based on Index Number
- Here you can find row data based on index number.

In [6]:
# Returns a <class 'pandas.core.series.Series'>
df.loc[2]

cities           Paris
population    11000000
Name: 2, dtype: object

# 5. Find specific cell data - based on Index Number

In [7]:
df.loc[2]["population"]

11000000

# 6. Find index numbers - based on a search

- Here we find only the 'Sales Total' results for the 'Salesperson' named 'Paul'.
- The steps are broken down to show how the types returned.

In [8]:
df_sp = pd.DataFrame([["Paul",50],["Stacy",100],["Paul",80]], columns=["Salesperson","Sales Total"])
df_sp

Unnamed: 0,Salesperson,Sales Total
0,Paul,50
1,Stacy,100
2,Paul,80


In [9]:
# Get all index numbers where "Salesperson" == "Paul"
paul_rows = df_sp[df_sp["Salesperson"] == "Paul"].index
print(type(paul_rows))
print(paul_rows)

<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([0, 2], dtype='int64')


# 7. Find specific cell data - from a list of multiple Index Numbers

In [10]:
# Pass "paul_rows" index numbers to 'df.loc[]' to return a series
paul_sales_totals = df_sp.loc[paul_rows]["Sales Total"]
print(type(paul_sales_totals))
print(paul_sales_totals)

<class 'pandas.core.series.Series'>
0    50
2    80
Name: Sales Total, dtype: int64


# 8. Create a Boolean mask (True or False filtering)
- Here, we create a mask that returns True or False for every row in the table.
- Then we can perform a calculation on all True rows.

In [11]:
# Create Boolean mask - returns True or False for every row if it matches 'Paul'
mask = df_sp['Salesperson'] == 'Paul'

mask

0     True
1    False
2     True
Name: Salesperson, dtype: bool

In [12]:
# Calculate the 'Mean' for all rows that return True
df_sp[mask]['Sales Total'].mean() 

65.0

# 9. Series - convert a series to a Dataframe or a List
- Getting specific columns returns a series rather than a dataframe
- You can also convert a Series to a tuple, set or dictionary

In [13]:
series_population = df.loc[[0,1]]['population']
type(series_population)

pandas.core.series.Series

In [14]:
# Convert series to a dataframe
pd.DataFrame(series_population)

Unnamed: 0,population
0,15000000
1,12000000


In [15]:
# Convert a series to a list - returns list of series values
list(series_population)

[15000000, 12000000]

# 10. Add a row

In [16]:
# First get latest row number
next_row = len(df) 
print(next_row)

# Create a new row and add data
df.loc[next_row] = ['Berlin',3700000]
df

5


Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000
5,Berlin,3700000


# 11. Add a row - based on previous row data

- Here we take the previous 'Investment Total' figure.
- We round it down to the nearest hundred.
- We calculate 2% of the rounded number.
- We then add the 2% to the previous 'Investment Total' in a newly created row:

In [18]:
import math

# Function to round figure down to the nearest hundred
def round_down_to_hundred(total):
    if total < 100:
        return 0
    total_reduced = str(math.floor(total/100))
    return int(total_reduced + "00")

#Test function works
round_down_to_hundred(23706.0)

23700

In [19]:
# New data frame
df_investment = pd.DataFrame([[45625],[45735]], columns=["Investment Total"])

# Get 'Investment Total' from most recent row
current_total = df_investment.loc[len(df_investment)-1]['Investment Total']

# Get next row number
next_row = len(df_investment)

# Create the next row adding 2% to the current total rounded down to the nearest hundred
df_investment.loc[next_row] = [current_total + ((round_down_to_hundred(current_total)) * 0.02)]

df_investment

Unnamed: 0,Investment Total
0,45625.0
1,45735.0
2,46649.0


# 12. Update a cell - based on Index Number

In [21]:
# Update Istanbul population and change it back.
df.at[0,'population'] = 5000000
print(df)
df.at[0,'population'] = 6000000
df

     cities  population
0  Istanbul     5000000
1    Moscow    12000000
2     Paris    11000000
3    London     9000000
4    Madrid     6000000
5    Berlin     3700000


Unnamed: 0,cities,population
0,Istanbul,6000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000
5,Berlin,3700000
