In [1]:
import pandas as pd
from datetime import datetime

# Data Frame - Creation with a Dictionary
When using a dictionary to create a dataframe, the key becomes a column. The array assigned to each key is the column data.

In [2]:
# Column-oriented input: each key names a column and its list holds
# that column's values, in row order.
european_cities = dict(
    cities=["Istanbul", "Moscow", "Paris", "London", "Madrid"],
    population=[15_000_000, 12_000_000, 11_000_000, 9_000_000, 6_000_000],
)
df = pd.DataFrame(data=european_cities)
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# Data Frame - Creation from a list of lists

In [3]:
# Row-oriented input: every inner list is one row, and `columns`
# supplies the column names in the same order as the row values.
european_cities_list = [
    ["Istanbul", 15_000_000],
    ["Moscow", 12_000_000],
    ["Paris", 11_000_000],
    ["London", 9_000_000],
    ["Madrid", 6_000_000],
]
df = pd.DataFrame(
    data=european_cities_list,
    columns=["cities", "population"],
)
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# Data Frame - Creation from a list of lists - with unique Index
You can set up the dataframe to have a unique index instead of an auto-incremented number.

In [4]:
# Row labels are supplied explicitly through `index`, so the city names
# take the place of the default 0..n-1 numbering.
df_unq_index = pd.DataFrame(
    data=[[15_000_000], [12_000_000], [11_000_000]],
    columns=['population'],
    index=['Istanbul', 'Moscow', 'Paris'],
)
df_unq_index

Unnamed: 0,population
Istanbul,15000000
Moscow,12000000
Paris,11000000


# Data Frame - Creation from a CSV or Excel spreadsheet

In [5]:
# Write the population frame to disk. `index` is left at its default,
# so the row labels are written as a leading, unnamed column.
df.to_csv(path_or_buf='populations.csv')
# Read the file back in; the saved row labels come back as an
# ordinary column named 'Unnamed: 0'.
df = pd.read_csv(filepath_or_buffer='populations.csv')
df

Unnamed: 0.1,Unnamed: 0,cities,population
0,0,Istanbul,15000000
1,1,Moscow,12000000
2,2,Paris,11000000
3,3,London,9000000
4,4,Madrid,6000000


# Columns - Remove an unnecessary column

- See 'Index' below for how to drop an index when exporting to csv or xlsx files.

In [6]:
# Remove the 'Unnamed: 0' column from df itself (no copy is made).
df.drop(columns=['Unnamed: 0'], inplace=True)
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# Sort - apply a sort to existing dataframe

In [7]:
# Reorder df itself from smallest to largest population
# (ascending order is the default for sort_values).
df.sort_values("population", inplace=True)
df

Unnamed: 0,cities,population
4,Madrid,6000000
3,London,9000000
2,Paris,11000000
1,Moscow,12000000
0,Istanbul,15000000


# Index - reset index numbers after a sort (or any function)

In [8]:
# Renumber the rows 0..n-1 in their current order. drop=True discards
# the old labels instead of keeping them as a new column.
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000


# Index - Drop an Index from a dataframe when exporting
- Before saving a dataframe to another format such as an Excel spreadsheet, you may want to remove the index from the data frame.

In [9]:
# index=False leaves the row labels out of the file, so only the data
# columns are written.
df.to_csv(path_or_buf='populations2.csv', index=False)

# Find a specific row - based on a search string

We want to find the row or rows where the cities column is "Paris".
- This will return a DataFrame

In [10]:
# Build a boolean mask (True where 'cities' equals "Paris") and keep
# only the rows it selects.
df.loc[df["cities"].eq("Paris")]

Unnamed: 0,cities,population
2,Paris,11000000


# Find a specific row - based on Index Number

Here you can find row data based on index number.

In [11]:
# Label-based lookup of one whole row. The result is a
# pandas.core.series.Series whose index holds the column names.
df.loc[2, :]

cities           Paris
population    11000000
Name: 2, dtype: object

# Find specific cell data - based on Index Number

In [12]:
# Give the row label and the column name to a single .loc call.
# The chained form df.loc[2]["population"] first builds a whole row
# Series just to read one value from it, and is unsafe for assignment.
df.loc[2, "population"]

11000000

# Find specific cell data - based on a search

In [13]:
# Small sales table in which one salesperson ("Paul") appears twice.
df_sp = pd.DataFrame(
    data=[
        ["Paul", 50],
        ["Stacy", 100],
        ["Paul", 80],
    ],
    columns=["Salesperson", "Sales Total"],
)
df_sp

Unnamed: 0,Salesperson,Sales Total
0,Paul,50
1,Stacy,100
2,Paul,80


In [14]:
# Get the index labels of every row where "Salesperson" == "Paul"
paul_rows = df_sp[df_sp["Salesperson"] == "Paul"].index
print(type(paul_rows))
print(paul_rows)

# Select rows and column in ONE .loc call, which returns a Series.
# The chained form df_sp.loc[paul_rows]["Sales Total"] would first build
# an intermediate DataFrame and then index into it.
paul_sales_totals = df_sp.loc[paul_rows, "Sales Total"]
print(type(paul_sales_totals))
print(paul_sales_totals)

# Convert the Series to a one-column DataFrame
paul_sales_totals.to_frame()

<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([0, 2], dtype='int64')
<class 'pandas.core.series.Series'>
0    50
2    80
Name: Sales Total, dtype: int64


Unnamed: 0,Sales Total
0,50
2,80


# Add a row

In [15]:
# On a default 0..n-1 index the row count is also the first unused
# label, so it can serve as the label of the new row.
current_row = df.shape[0]
print(current_row)
# Assigning to a label that does not exist yet appends a row.
df.loc[current_row] = ['Berlin', 3700000]
df

5


Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000
5,Berlin,3700000


# Add a row - based on previous row data

- Here we take the previous 'Investment Total' figure.
- We round it down to the nearest hundred.
- We calculate 2% of the rounded number.
- We then add the 2% to the previous 'Investment Total' in a newly created row:

In [40]:
import math

# Function to round figure down to the nearest hundred
def round_down_to_hundred(total: float) -> int:
    """Round ``total`` down to the nearest multiple of 100.

    Args:
        total: The figure to round (int or float).

    Returns:
        The largest multiple of 100 that does not exceed ``total``, as an
        ``int``. Any value below 100 -- negative values included -- gives 0.
    """
    if total < 100:
        return 0
    # floor(total / 100) is the number of whole hundreds; scale it back up
    # arithmetically instead of gluing "00" onto its string form.
    return math.floor(total / 100) * 100

# Quick check that the function works (expected result: 23700)
round_down_to_hundred(23706.0)

23700

In [41]:
# New data frame
df_investment = pd.DataFrame([[45625],[45735]], columns=["Investment Total"])

# Get 'Investment Total' from the most recent row: pick the column, then
# its last entry by position. This avoids the chained row-then-column
# lookup and does not depend on the row labels being 0..n-1.
current_total = df_investment['Investment Total'].iloc[-1]

# Label for the next row (equals the row count on the default index)
next_row = len(df_investment)

# Create the next row: previous total plus 2% of the previous total
# rounded down to the nearest hundred
df_investment.loc[next_row] = [current_total + round_down_to_hundred(current_total) * 0.02]

df_investment

Unnamed: 0,Investment Total
0,45625.0
1,45735.0
2,46649.0


# Update a cell - based on Index Number

In [16]:
# Overwrite one cell (row label 0, column 'population'), print the
# frame to show the change, then put the previous figure back.
df.at[0, 'population'] = 5_000_000
print(df)
df.at[0, 'population'] = 6_000_000
df

     cities  population
0    Madrid     5000000
1    London     9000000
2     Paris    11000000
3    Moscow    12000000
4  Istanbul    15000000
5    Berlin     3700000


Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000
5,Berlin,3700000
