# Pandas

> pip install pandas

In [35]:
import pandas as pd
import numpy as np

## 1. Create DataFrame

### 1. From Dictionary

In [36]:
data = {
    "ID": [1,2,3,4,5],
    "name": ["Thomas", "Ingo", "Sara", "Julia", "Lena"]
}

# Create DataFrame
df = pd.DataFrame(data)

# Show top 5x records
df.head()

Unnamed: 0,ID,name
0,1,Thomas
1,2,Ingo
2,3,Sara
3,4,Julia
4,5,Lena


In [37]:
data = {
    "ID": [1,2,3,4,5],
    "name": ["Thomas", "Ingo", "Sara", "Julia", "Lena"],
    "city" : "Berlin" # same value for all rows
}

# Create DataFrame
df = pd.DataFrame(data)

# Show top 5x records
df.head()

Unnamed: 0,ID,name,city
0,1,Thomas,Berlin
1,2,Ingo,Berlin
2,3,Sara,Berlin
3,4,Julia,Berlin
4,5,Lena,Berlin


In [38]:
data = {
    "ID": [1,2,3,4,5],
    "name": ["Thomas", "Ingo", "Sara", "Julia", "Lena"],
    "city" : "Berlin", # same value for all rows
    "score": np.array([3,5,2,7,1]) # also Numpy array can be used
}

# Create DataFrame
df = pd.DataFrame(data)

# Show top 5x records
df.head()

Unnamed: 0,ID,name,city,score
0,1,Thomas,Berlin,3
1,2,Ingo,Berlin,5
2,3,Sara,Berlin,2
3,4,Julia,Berlin,7
4,5,Lena,Berlin,1


In [39]:
# Re-Arrange the columns during creating the DataFrame
data = {
    "ID": [1,2,3,4,5],
    "name": ["Thomas", "Ingo", "Sara", "Julia", "Lena"],
    "city" : "Berlin", # same value for all rows
    "score": np.array([3,5,2,7,1]) # also Numpy array can be used
}

# Create DataFrame
df = pd.DataFrame(data, columns = ["city", "ID", "score", "name"])


# Show top 5x records
df.head()

Unnamed: 0,city,ID,score,name
0,Berlin,1,3,Thomas
1,Berlin,2,5,Ingo
2,Berlin,3,2,Sara
3,Berlin,4,7,Julia
4,Berlin,5,1,Lena


In [40]:
# Get Only certain specific columns from my data source
data = {
    "ID": [1,2,3,4,5],
    "name": ["Thomas", "Ingo", "Sara", "Julia", "Lena"],
    "city" : "Berlin", # same value for all rows
    "score": np.array([3,5,2,7,1]) # also Numpy array can be used
}

# Create DataFrame
df = pd.DataFrame(data, columns = ["city",  "name"])

# Show top 5x records
df.head()

Unnamed: 0,city,name
0,Berlin,Thomas
1,Berlin,Ingo
2,Berlin,Sara
3,Berlin,Julia
4,Berlin,Lena


### From List

In [41]:
my_list = [
    {"ID": 1, "name": "Thomas", "city": "Berlin"},
    {"ID": 2, "name": "Ingo", "city": "Berlin"},
    {"ID": 3, "name": "Lena", "city": "Berlin"},
    
]

# Create DataFrame
df = pd.DataFrame(my_list)

# Show top 5x records
df.head()

Unnamed: 0,ID,name,city
0,1,Thomas,Berlin
1,2,Ingo,Berlin
2,3,Lena,Berlin


In [42]:
my_list = [
    [1, "Thomas", "Berlin"],
    [2, "Ingo", "Berlin"],
    [3, "Lena", "Berlin"],
]

# Create DataFrame
df = pd.DataFrame(my_list, columns = ["ID", "name", "city"])  # Give Manual Column Names

# Show top 5x records
df.head()

Unnamed: 0,ID,name,city
0,1,Thomas,Berlin
1,2,Ingo,Berlin
2,3,Lena,Berlin


### From JSON

In [43]:
data = {
    "ID" : {
    "0": 100,
    "1": 101,
    "2": 102,
    },
    "name" : {
    "0": "Thomas",
    "1": "Ingo",
    "2": "Lena",
    },
}

# Create DataFrame
df = pd.DataFrame(data) 

# Show top 5x records
df.head()

Unnamed: 0,ID,name
0,100,Thomas
1,101,Ingo
2,102,Lena


## Indexing

In [44]:
data = {
    "ID": [1,2,3,4,5],
    "name": ["Thomas", "Ingo", "Sara", "Julia", "Lena"]
}


# Create DataFrame
df = pd.DataFrame(data, index = ["a", "b", "c", "d", "e"]) # use specific/custom indexes

# Show top 5x records
df.head()

Unnamed: 0,ID,name
a,1,Thomas
b,2,Ingo
c,3,Sara
d,4,Julia
e,5,Lena


In [45]:
# After creating the DataFrame --> Specify the index columns
df2 = df.set_index("name")
df2.head()

Unnamed: 0_level_0,ID
name,Unnamed: 1_level_1
Thomas,1
Ingo,2
Sara,3
Julia,4
Lena,5


# Create DataFrame from Extern

In [46]:
df = pd.read_csv("my_data.csv")
# df = pd.read_csv("my_data.csv", delimiter= ",")
# df = pd.read_csv("my_data.csv", delimiter= ";")
# df = pd.read_csv("my_data.csv", delimiter= "\t")
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558.0,USB-C Charging Cable,2.0,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
1,,,,,,
2,176559.0,Bose SoundSport Headphones,1.0,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
3,176560.0,Google Phone,1.0,600.0,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
4,176560.0,Wired Headphones,1.0,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"


In [47]:
# Use specific column from csv file as index-column
df = pd.read_csv("my_data.csv", index_col = 0 ) # 0 : is the order of the column
df = pd.read_csv("my_data.csv", index_col = "Product" ) # "Product" is the name of the column
df.head()

Unnamed: 0_level_0,Order ID,Quantity Ordered,Price Each,Order Date,Purchase Address
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
USB-C Charging Cable,176558.0,2.0,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
,,,,,
Bose SoundSport Headphones,176559.0,1.0,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
Google Phone,176560.0,1.0,600.0,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
Wired Headphones,176560.0,1.0,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"


In [48]:
# After creating the DataFrame --> Specify the index columns
df2 = df.set_index("Order Date")
df2.head()

Unnamed: 0_level_0,Order ID,Quantity Ordered,Price Each,Purchase Address
Order Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
04/19/19 08:46,176558.0,2.0,11.95,"917 1st St, Dallas, TX 75001"
,,,,
04/07/19 22:30,176559.0,1.0,99.99,"682 Chestnut St, Boston, MA 02215"
04/12/19 14:38,176560.0,1.0,600.0,"669 Spruce St, Los Angeles, CA 90001"
04/12/19 14:38,176560.0,1.0,11.99,"669 Spruce St, Los Angeles, CA 90001"


In [49]:
# Without Header > The first row will be as normal data
df = pd.read_csv("my_data.csv", header= None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
1,176558,USB-C Charging Cable,2,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
2,,,,,,
3,176559,Bose SoundSport Headphones,1,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
4,176560,Google Phone,1,600,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"


In [50]:
# Specify the line number of the header -> or start reading the data from specific row number
df = pd.read_csv("my_data.csv", header= 0)
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558.0,USB-C Charging Cable,2.0,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
1,,,,,,
2,176559.0,Bose SoundSport Headphones,1.0,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
3,176560.0,Google Phone,1.0,600.0,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
4,176560.0,Wired Headphones,1.0,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"


In [51]:
# Use Specific Column Names for my DataFrame
df = pd.read_csv("my_data.csv", names = ("A", "B", "C", "D", "E", "F"))

df.head()

Unnamed: 0,A,B,C,D,E,F
0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
1,176558,USB-C Charging Cable,2,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
2,,,,,,
3,176559,Bose SoundSport Headphones,1,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
4,176560,Google Phone,1,600,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"


In [52]:
# Ready only some columns
df = pd.read_csv("my_data.csv", usecols=["Order ID", "Product", "Price Each"])
df.head()

Unnamed: 0,Order ID,Product,Price Each
0,176558.0,USB-C Charging Cable,11.95
1,,,
2,176559.0,Bose SoundSport Headphones,99.99
3,176560.0,Google Phone,600.0
4,176560.0,Wired Headphones,11.99


# DataFrame Information

In [53]:
df = pd.read_csv("my_data.csv")

## Show Rows

In [54]:
df.head() # Show Top 5x Records
df.head(2) # Show Top 2x Records

df.tail() # Show the last 5x records
df.tail(2) # Show the last 2x records


df.sample()# random 1x record
df.sample(4) # random 4x records

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
75839,149134,USB-C Charging Cable,1,11.95,01/14/19 16:43,"821 West St, Los Angeles, CA 90001"
139030,280625,AAA Batteries (4-pack),3,2.99,11/14/19 11:50,"28 10th St, Seattle, WA 98101"
68969,142588,AA Batteries (4-pack),1,3.84,01/31/19 16:27,"916 5th St, San Francisco, CA 94016"
178388,251243,Apple Airpods Headphones,1,150.0,10/01/19 00:09,"854 2nd St, Los Angeles, CA 90001"


## Show Infos

In [55]:
# Normally .describe() is for numerical columns
df.describe() # 

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
count,186305,186305,186305,186305.0,186305,186305
unique,178438,20,10,24.0,142396,140788
top,Order ID,USB-C Charging Cable,1,11.95,Order Date,Purchase Address
freq,355,21903,168552,21903.0,355,355


In [56]:
# .info() -> Columns Names, DataTypes, Null Values Count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186850 entries, 0 to 186849
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Order ID          186305 non-null  object
 1   Product           186305 non-null  object
 2   Quantity Ordered  186305 non-null  object
 3   Price Each        186305 non-null  object
 4   Order Date        186305 non-null  object
 5   Purchase Address  186305 non-null  object
dtypes: object(6)
memory usage: 8.6+ MB


In [57]:
# dtypes --> read only Data Types
df.dtypes

Order ID            object
Product             object
Quantity Ordered    object
Price Each          object
Order Date          object
Purchase Address    object
dtype: object

In [58]:
# Shape (row_count, column_count)
df.shape 


# df.shape[0] # count rows
# df.shape[1] # count columns

(186850, 6)

In [59]:
# size: count of items (rows * columns) ----> Achtung:  for non NAN/Null Value
df.size

1121100

## Show Columns

In [60]:
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,176558.0,USB-C Charging Cable,2.0,11.95,04/19/19 08:46,"917 1st St, Dallas, TX 75001"
1,,,,,,
2,176559.0,Bose SoundSport Headphones,1.0,99.99,04/07/19 22:30,"682 Chestnut St, Boston, MA 02215"
3,176560.0,Google Phone,1.0,600.0,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"
4,176560.0,Wired Headphones,1.0,11.99,04/12/19 14:38,"669 Spruce St, Los Angeles, CA 90001"


In [61]:
df["Product"]
df["Price Each"]

# Alternative -> only for columns names without spaces
df.Product

0               USB-C Charging Cable
1                                NaN
2         Bose SoundSport Headphones
3                       Google Phone
4                   Wired Headphones
                     ...            
186845        AAA Batteries (4-pack)
186846                        iPhone
186847                        iPhone
186848        34in Ultrawide Monitor
186849          USB-C Charging Cable
Name: Product, Length: 186850, dtype: object

In [62]:
# Show Multi-Columns
df[ ["Product", "Price Each"]   ]  # List in List

Unnamed: 0,Product,Price Each
0,USB-C Charging Cable,11.95
1,,
2,Bose SoundSport Headphones,99.99
3,Google Phone,600
4,Wired Headphones,11.99
...,...,...
186845,AAA Batteries (4-pack),2.99
186846,iPhone,700
186847,iPhone,700
186848,34in Ultrawide Monitor,379.99


## Info about Columns

In [63]:
# Unique Values
df["Product"].unique() # Get Unique Values as a array(list)
len(df["Product"].unique()) # 21x Using python len() Count of all unique values

df["Product"].nunique() # 20x Return the number of unique values without NAN/Null value


# Counts
df["Product"].value_counts() # Return Series containing counts of unique values
df["Product"].count() # Count of rows (Not NAN/Null)




186305

In [64]:
# Checks
df["Product"].isnull() # for each row-> gives True or false

# Count of Null Rows -> Sum() only the True values
df["Product"].isnull().sum() # 545x rows are null   , denn sum() -> sum only the True values (for each True = 1)


545

In [65]:
# Check 
df["Product"].is_unique # False
df["Order ID"].is_unique # False , because of repeated Header

False

## Get Unique values for each column dynamically

In [66]:
# Loop over DataFrame -> Loop Over Columns

for column in df:
    print(column, "-->", df[column].unique(), "\n")

Order ID --> ['176558' nan '176559' ... '259355' '259356' '259357'] 

Product --> ['USB-C Charging Cable' nan 'Bose SoundSport Headphones' 'Google Phone'
 'Wired Headphones' 'Macbook Pro Laptop' 'Lightning Charging Cable'
 '27in 4K Gaming Monitor' 'AA Batteries (4-pack)'
 'Apple Airpods Headphones' 'AAA Batteries (4-pack)' 'iPhone'
 'Flatscreen TV' '27in FHD Monitor' '20in Monitor' 'LG Dryer'
 'ThinkPad Laptop' 'Vareebadd Phone' 'LG Washing Machine'
 '34in Ultrawide Monitor' 'Product'] 

Quantity Ordered --> ['2' nan '1' '3' '5' 'Quantity Ordered' '4' '7' '6' '8' '9'] 

Price Each --> ['11.95' nan '99.99' '600' '11.99' '1700' '14.95' '389.99' '3.84' '150'
 '2.99' '700' '300' '149.99' '109.99' '600.0' '999.99' '400' '379.99'
 'Price Each' '700.0' '1700.0' '150.0' '300.0' '400.0'] 

Order Date --> ['04/19/19 08:46' nan '04/07/19 22:30' ... '09/23/19 07:39'
 '09/19/19 17:30' '09/30/19 00:18'] 

Purchase Address --> ['917 1st St, Dallas, TX 75001' nan '682 Chestnut St, Boston, MA 02215'
 .

# Filter Data

In [67]:
# Create DataFrame
data = {
    "ID": [1,2,3,4,5],
    "name": ["Thomas", "Ingo", "Sara", "Julia", "Lena"],
    "city" : "Berlin" # same value for all rows
}

# Create DataFrame
df = pd.DataFrame(data)

# Show top 5x records
df.head()

Unnamed: 0,ID,name,city
0,1,Thomas,Berlin
1,2,Ingo,Berlin
2,3,Sara,Berlin
3,4,Julia,Berlin
4,5,Lena,Berlin


In [68]:
df["name"] == "Thomas" # Gives a Serie of True/False for each Row



0     True
1    False
2    False
3    False
4    False
Name: name, dtype: bool

In [69]:
# Store the Serie in a variable called bool_serie
bool_serie = df["name"] == "Thomas" # Gives a Serie of True/False for each Row
df[  bool_serie   ] # Get only the True records from the bool_serie

# Alternative  df[ condition ]
df[  df["name"] == "Thomas"   ]
df[  df["city"] == "Berlin"   ]

# Better alternative (better for multiple conditions)
df.loc[ df["name"] == "Thomas"  ]

df.loc[ (df["name"] == "Thomas") & (df["city"] == "Berlin") ] # and several conditions
df.loc[ (df["name"] == "Thomas") & (df["city"] == "Hamburg") ] # and several conditions
df.loc[ (df["name"] == "Thomas") | (df["city"] == "Hamburg") ] # or several conditions
df.loc[ (df["name"] == "Thomas") | (df["city"] != "Hamburg") ] # or several conditions

Unnamed: 0,ID,name,city
0,1,Thomas,Berlin
1,2,Ingo,Berlin
2,3,Sara,Berlin
3,4,Julia,Berlin
4,5,Lena,Berlin


# Loops Over DataFrame

In [70]:
# Loop over DataFrame (Columns) 
for column in df:
    print(column)

ID
name
city


In [71]:
# Loop Over DataFrame (Rows)

for index, row in df.iterrows():  # row is a serie for each ROW in the DataFrame
    print(index)
    print(row) # contains : Column Name and Value
    print()

0
ID           1
name    Thomas
city    Berlin
Name: 0, dtype: object

1
ID           2
name      Ingo
city    Berlin
Name: 1, dtype: object

2
ID           3
name      Sara
city    Berlin
Name: 2, dtype: object

3
ID           4
name     Julia
city    Berlin
Name: 3, dtype: object

4
ID           5
name      Lena
city    Berlin
Name: 4, dtype: object



In [72]:
# Loop Over DataFrame (Rows)
for index, row in df.iterrows():  # row is a serie for each ROW in the DataFrame
    print(index)
    print(row.values) # only the values like [1 'Thomas' 'Berlin']
    print()

0
[1 'Thomas' 'Berlin']

1
[2 'Ingo' 'Berlin']

2
[3 'Sara' 'Berlin']

3
[4 'Julia' 'Berlin']

4
[5 'Lena' 'Berlin']



# Sorting

In [73]:
# Sort the DataFrame and Store the Sorted back again
df = df.sort_values("name")

df = df.sort_values("name", ascending= False)


# Sort based on several columns
df = df.sort_values(["city", "name"], ascending= False)

In [74]:
# Sort by Index
df = df.sort_index()

In [75]:
df.head()

Unnamed: 0,ID,name,city
0,1,Thomas,Berlin
1,2,Ingo,Berlin
2,3,Sara,Berlin
3,4,Julia,Berlin
4,5,Lena,Berlin


# Concatenate Several DataFrames

In [76]:
data1 = {
    "ID": [1,2],
    "name": ["Thomas", "Ingo"],
    "city" : "Berlin" # same value for all rows
}

data2 = {
    "ID": [3,4],
    "name": ["Julia", "Lena"],
    "city" : "Aachen" # same value for all rows
}

data3 = {
    "ID": [5,6,7],
    "name": ["Frank", "Mattias", "Alex"],
    "city" : "Frankfurt" # same value for all rows
}

# Create DataFrames
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
df3 = pd.DataFrame(data3)


In [77]:
# Concatenate the DataFrames together (over each other)
df = pd.concat( [ df1, df2, df3 ] )

df.head(6)

Unnamed: 0,ID,name,city
0,1,Thomas,Berlin
1,2,Ingo,Berlin
0,3,Julia,Aachen
1,4,Lena,Aachen
0,5,Frank,Frankfurt
1,6,Mattias,Frankfurt


In [78]:
# Concatenate the DataFrames together (next to each other)
df = pd.concat( [ df1, df2, df3 ], axis = 1 )

df.head(6)

Unnamed: 0,ID,name,city,ID.1,name.1,city.1,ID.2,name.2,city.2
0,1.0,Thomas,Berlin,3.0,Julia,Aachen,5,Frank,Frankfurt
1,2.0,Ingo,Berlin,4.0,Lena,Aachen,6,Mattias,Frankfurt
2,,,,,,,7,Alex,Frankfurt


# Data Types

In [79]:
data = {
    "ID": ["1","2", "3"],
    "name": ["Thomas", "Ingo", "Julia"],
    "price" : ["12.2", "13.3", "14.4"]
}


df = pd.DataFrame(data)

df.head()
df.info() # -> Problem because column ID and price are strings

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      3 non-null      object
 1   name    3 non-null      object
 2   price   3 non-null      object
dtypes: object(3)
memory usage: 200.0+ bytes


In [80]:
# Convert the Data Types
df["ID"] =  df["ID"].astype(int)
df["price"] =  df["price"].astype(float)


# Alternative
#~~~~~~~~~~~~~
df["ID"] = pd.to_numeric(df["ID"])
df["price"] = pd.to_numeric(df["price"])

# data_frame["Order Date"] = pd.to_datetime(data_frame["Order Date"])

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      3 non-null      int32  
 1   name    3 non-null      object 
 2   price   3 non-null      float64
dtypes: float64(1), int32(1), object(1)
memory usage: 188.0+ bytes


In [82]:
df.describe()

Unnamed: 0,ID,price
count,3.0,3.0
mean,2.0,13.3
std,1.0,1.1
min,1.0,12.2
25%,1.5,12.75
50%,2.0,13.3
75%,2.5,13.85
max,3.0,14.4


# Save to CSV File

In [83]:
df.to_csv("output.csv")
df.to_csv("output.csv", index = False) # without index
df.to_csv("output.csv", index = False, sep = ";") # with custom separator instead of comma
df.to_csv("output.csv", index = False, sep = "|") # with custom separator instead of comma

# Drop

To delete rows or columns

In [84]:
data = {
    "ID": [1,2,3,4,5,6,7],
    "name" : ["Thomas", "Ingo", "Sara", "Lena", "Julia", "Frank", "Matthias"]
}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,ID,name
0,1,Thomas
1,2,Ingo
2,3,Sara
3,4,Lena
4,5,Julia


In [85]:
# Drop columns
df.drop(columns = ["ID"])
df.drop(columns = ["ID", "name"])

0
1
2
3
4
5
6


In [86]:
# Drop Rows
df.drop(index = [1, 3])

Unnamed: 0,ID,name
0,1,Thomas
2,3,Sara
4,5,Julia
5,6,Frank
6,7,Matthias


In [87]:
# ignore: if the index is not there DON'T throw and ERROR 
#df.drop(index = [1, 2 ,3, 900])  # -> ERROR 
df.drop(index = [1, 2 ,3, 900], errors = "ignore")

Unnamed: 0,ID,name
0,1,Thomas
4,5,Julia
5,6,Frank
6,7,Matthias


# DropNA

delete NA Rows/Columns
1. axis = "rows", "columns" 
2. axis =   0 , 1

delete NA Rows/Columns
1. how = "any" , "all" 


In [88]:
data = {
    "ID": [1,np.nan,3,np.nan,5,6,np.nan],
    "name" : [np.nan, "Ingo", np.nan, "Lena", "Julia", "Frank",np.nan],
    "city" : np.nan
}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,ID,name,city
0,1.0,,
1,,Ingo,
2,3.0,,
3,,Lena,
4,5.0,Julia,


In [89]:
df.dropna()

Unnamed: 0,ID,name,city


In [90]:
df.dropna(axis = "columns") # axis = 1
df.dropna(axis = 1) # axis = 1

0
1
2
3
4
5
6


In [91]:
df.dropna(axis = "columns", how = "all") #- > will delete the column city

Unnamed: 0,ID,name
0,1.0,
1,,Ingo
2,3.0,
3,,Lena
4,5.0,Julia
5,6.0,Frank
6,,


In [92]:
df.dropna(axis = "rows", how = "any")

Unnamed: 0,ID,name,city


In [93]:
# subset() --> consider only specific columns
df.dropna(axis = "rows", how = "any", subset = ["name", "ID"])

Unnamed: 0,ID,name,city
4,5.0,Julia,
5,6.0,Frank,


# FillNA

In [94]:
data = {
    "ID": [1, np.nan, 2, np.nan, 3,4,5],
    "name": ["Thomas", np.nan, "Ingo",np.nan , "Sara", "Julia", "Thomas"],
    "score" : [25,np.nan , np.nan, 14, 64,21,14]
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,ID,name,score
0,1.0,Thomas,25.0
1,,,
2,2.0,Ingo,
3,,,14.0
4,3.0,Sara,64.0


In [95]:
# fill with specific value
df2 = df.fillna(0)
df2.head()


Unnamed: 0,ID,name,score
0,1.0,Thomas,25.0
1,0.0,0,0.0
2,2.0,Ingo,0.0
3,0.0,0,14.0
4,3.0,Sara,64.0


In [96]:
# fill each column with specific value
df2 = df.fillna({
    "name": "NOT FOUND",
    "score" : 0
})

df2.head()

Unnamed: 0,ID,name,score
0,1.0,Thomas,25.0
1,,NOT FOUND,0.0
2,2.0,Ingo,0.0
3,,NOT FOUND,14.0
4,3.0,Sara,64.0


In [97]:
df.head()

Unnamed: 0,ID,name,score
0,1.0,Thomas,25.0
1,,,
2,2.0,Ingo,
3,,,14.0
4,3.0,Sara,64.0


In [98]:
df2 = df.fillna(method= "ffill")  ## ffill: forward fill
df2 = df.fillna(method= "bfill")  ## bfill: backward fill


df2 = df.fillna(method= "ffill", axis = "columns")  # from column to next column
df2 = df.fillna(method= "bfill", axis = "columns")  # from column to previous column


df2 = df.fillna(method= "ffill", limit = 1)  ## limit: how many items to be filled


df2.head()

Unnamed: 0,ID,name,score
0,1.0,Thomas,25.0
1,1.0,Thomas,25.0
2,2.0,Ingo,
3,2.0,Ingo,14.0
4,3.0,Sara,64.0


In [99]:
# Example of filling values with different approaches
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
data = {
    "ID": [1, np.nan, 2, np.nan, 3,4,5],
    "name": ["Thomas", np.nan, "Ingo",np.nan , "Sara", "Julia","Thomas"],
    "score" : [25,np.nan , np.nan, 14, 64,21,14]
}

df = pd.DataFrame(data)

# !Achtung: Es macht kein Sinn für ID mit Interporate..aber wir wollen gerne Interpolate hier einfach sehen
df["ID"] = df["ID"].interpolate()

# Get the most freq name
most_freq_name = df["name"].mode()[0]   

# Get the mean (average) of a score
mean_score = df["score"].mean() # 27.6

df2 = df.fillna({
    "name": most_freq_name,
    "score" : mean_score
})


df2.head()

Unnamed: 0,ID,name,score
0,1.0,Thomas,25.0
1,1.5,Thomas,27.6
2,2.0,Ingo,27.6
3,2.5,Thomas,14.0
4,3.0,Sara,64.0


# Interpolate

In [100]:
df2 = df.interpolate()
df2.head()

Unnamed: 0,ID,name,score
0,1.0,Thomas,25.0
1,1.5,,21.333333
2,2.0,Ingo,17.666667
3,2.5,,14.0
4,3.0,Sara,64.0


# Duplicate

In [101]:
data = {
    "ID": [1,2, 2, 4, 4],
    "name" : ["Thomas", "Ingo", "Ingo", "Lena", "Lena"]
    
}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,ID,name
0,1,Thomas
1,2,Ingo
2,2,Ingo
3,4,Lena
4,4,Lena


In [102]:
# Get Bool Serie (True, False) for each row
df.duplicated()
df.duplicated(keep = "first") # Default
df.duplicated(keep = "last")

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [103]:
# Get the Duplicated ROWS
df[ df.duplicated() ]

Unnamed: 0,ID,name
2,2,Ingo
4,4,Lena


# Drop_duplicates()

In [104]:
data = {
    "ID": [1,2, 2, 4, 4],
    "name" : ["Thomas", "Ingo", "Ingo", "Lena", "Lena"]
    
}

df = pd.DataFrame(data)

df.head()



Unnamed: 0,ID,name
0,1,Thomas
1,2,Ingo
2,2,Ingo
3,4,Lena
4,4,Lena


In [105]:
df.drop_duplicates()
df.drop_duplicates(keep = "first")
df.drop_duplicates(keep = "last")
df.drop_duplicates(subset = ["name", "ID"]) # consider only the duplicated rows in the specific columns

Unnamed: 0,ID,name
0,1,Thomas
1,2,Ingo
3,4,Lena


# Pandas Series String

In [106]:
df = pd.DataFrame({
    "ID": [1,2,3,4],
    "name" :[ "thomas meier", "ali Meier    ", "     ingo mEier Möller", " Steffi"  ],
    "score": [80,51,63,42]
})

df.head()

Unnamed: 0,ID,name,score
0,1,thomas meier,80
1,2,ali Meier,51
2,3,ingo mEier Möller,63
3,4,Steffi,42


In [107]:
df.name.str.title()
df.name.str.upper()
df.name.str.lower()
df.name.str.strip()
df.name.str.capitalize()

# startswith()
df.name.str.startswith("t")
df.loc[ df.name.str.startswith("t") ]

# replace
df.name.str.replace("Me", "KK")
df.name.str.replace(" ", "_") # replace each single white space with '_' 

# split
df.name.str.split() #-> one single column, we will have a list of words
df.name.str.split(expand = True) #-> each splitted word will get an own column

# contains
df.name.str.contains("m") # only small m
df.loc[df.name.str.contains("m")]


df.name.str.contains("m|M") # m small or M capital
df.loc[df.name.str.contains("m|M")]

# len
df.name.str.len() > 10
df.loc[df.name.str.len() > 10 ]

Unnamed: 0,ID,name,score
0,1,thomas meier,80
1,2,ali Meier,51
2,3,ingo mEier Möller,63


# Correct the Name of the columns and replace white space with '_'


In [108]:
# Exercise

df = pd.DataFrame({
    "ID": [1,2,3,4],
    "emp name" :[ "thomas meier", "ali Meier    ", "     ingo mEier Möller", " Steffi"  ],
    "emp score": [80,51,63,42]
})
df.head()

Unnamed: 0,ID,emp name,emp score
0,1,thomas meier,80
1,2,ali Meier,51
2,3,ingo mEier Möller,63
3,4,Steffi,42


In [109]:
# your code
df.columns



Index(['ID', 'emp name', 'emp score'], dtype='object')

# Merging (Joining) Several DataFrames

In [110]:
cities = {
    "city_code" : ["FFM", "AA", "BO", "STD"],
    "city_name" : ["Frankfurt am Main", "Aachen", "Bonn", "Stuttgart"]
}

data_emp = {
    "id": [1,2,3,4],
    "name": ["Thomas", "Ingo", "Sara", "Julia"],
    "city_code" : ["FFM", "AA", "BO", "MU"]
}

df_cities = pd.DataFrame(cities)
df_emp = pd.DataFrame(data_emp)

df_cities.head()
df_emp.head()

Unnamed: 0,id,name,city_code
0,1,Thomas,FFM
1,2,Ingo,AA
2,3,Sara,BO
3,4,Julia,MU


In [111]:
# Merge the two tables
df_emp.merge(df_cities, how = "inner", left_on= "city_code", right_on= "city_code")

Unnamed: 0,id,name,city_code,city_name
0,1,Thomas,FFM,Frankfurt am Main
1,2,Ingo,AA,Aachen
2,3,Sara,BO,Bonn


In [112]:
df_emp.merge(df_cities, how = "left", left_on= "city_code", right_on= "city_code")

Unnamed: 0,id,name,city_code,city_name
0,1,Thomas,FFM,Frankfurt am Main
1,2,Ingo,AA,Aachen
2,3,Sara,BO,Bonn
3,4,Julia,MU,


In [113]:
df_emp.merge(df_cities, how = "right", left_on= "city_code", right_on= "city_code")

Unnamed: 0,id,name,city_code,city_name
0,1.0,Thomas,FFM,Frankfurt am Main
1,2.0,Ingo,AA,Aachen
2,3.0,Sara,BO,Bonn
3,,,STD,Stuttgart


# Inplace

in Pandas --> you can't change the DataFrame easily..

Two options are available
1. Do make changes directly on the same container --> inplace Changes. it needs to explizity till pandas to change the original DataFrame using inplace Parameter

2. Save the changes into another variable or on the same variable name

In [114]:
cities = {
    "city_code" : ["FFM", "AA", "BO", "STD"],
    "city_name" : ["Frankfurt am Main", "Aachen", "Bonn", "Stuttgart"],
    "wert" : 5
}

df = pd.DataFrame(cities)

# Shows only a View --> without any changes
df.drop(columns= ["city_code"])

# 1. Option, using inplace
df.drop(columns= ["city_code"], inplace= True)


# 2. Option, save back to the same variable name
df = df.drop(columns= ["wert"])

df.head()

Unnamed: 0,city_name
0,Frankfurt am Main
1,Aachen
2,Bonn
3,Stuttgart


# Group By

In [115]:
participants = [
    (10, "Thomas", 40, "Berlin", 70),
    (11, "Ingo", 40, "Berlin", 80),
    (12, "Sara", 40, "Berlin", 90),
    (13, "Lena", 40, "Frankfurt", 71),
    (14, "Julia", 40, "Frankfurt", 72),
    (15, "Frank", 40, "Frankfurt", 73),
    (16, "Matthias", 40, "Aachen", 75),
    (17, "Ali", 40, "Aachen", 76),
    (18, "Ahmed", 40, "Aachen", 77),
    (19, "Sabine", 40, "Stuttgart", 78),
    (20, "Steffi", 40, "Stuttgart", 79),
]

df = pd.DataFrame(participants, columns = ["ID", "name", "age", "city", "score"])

# Set the index
df = df.set_index("ID")

df.head(20)



Unnamed: 0_level_0,name,age,city,score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,Thomas,40,Berlin,70
11,Ingo,40,Berlin,80
12,Sara,40,Berlin,90
13,Lena,40,Frankfurt,71
14,Julia,40,Frankfurt,72
15,Frank,40,Frankfurt,73
16,Matthias,40,Aachen,75
17,Ali,40,Aachen,76
18,Ahmed,40,Aachen,77
19,Sabine,40,Stuttgart,78


In [116]:
# Create Groups based on a column
groups = df.groupby("city") # creates three groups
print(groups)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C982C10520>


In [117]:
# Loop over the groups

for group_name, rows in groups:
    print("Group Name: " , group_name)
    print("Group Rows: ")
    print(rows, "\n")



Group Name:  Aachen
Group Rows: 
        name  age    city  score
ID                              
16  Matthias   40  Aachen     75
17       Ali   40  Aachen     76
18     Ahmed   40  Aachen     77 

Group Name:  Berlin
Group Rows: 
      name  age    city  score
ID                            
10  Thomas   40  Berlin     70
11    Ingo   40  Berlin     80
12    Sara   40  Berlin     90 

Group Name:  Frankfurt
Group Rows: 
     name  age       city  score
ID                              
13   Lena   40  Frankfurt     71
14  Julia   40  Frankfurt     72
15  Frank   40  Frankfurt     73 

Group Name:  Stuttgart
Group Rows: 
      name  age       city  score
ID                               
19  Sabine   40  Stuttgart     78
20  Steffi   40  Stuttgart     79 



In [118]:
# Get the counf or rows for each group
groups.size()

city
Aachen       3
Berlin       3
Frankfurt    3
Stuttgart    2
dtype: int64

In [119]:
# Get a certain group
group_aachen = groups.get_group("Aachen")
group_aachen

Unnamed: 0_level_0,name,age,city,score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
16,Matthias,40,Aachen,75
17,Ali,40,Aachen,76
18,Ahmed,40,Aachen,77


In [120]:
# What’s new in 1.5.0 (September 19, 2022): https://pandas.pydata.org/docs/whatsnew/v1.5.0.html
# Get the mean value of all numeric columns in each group
# mean_values = groups.mean()  # comes a Future Warning -> FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated
mean_values = groups.mean(numeric_only = True) # without future warning
mean_values

Unnamed: 0_level_0,age,score
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Aachen,40.0,76.0
Berlin,40.0,80.0
Frankfurt,40.0,72.0
Stuttgart,40.0,78.5


In [121]:
# Do aggregeation functions on each column separately
results = groups.agg({
            "age" : "mean",
            "score" : "sum"
            })

results


Unnamed: 0_level_0,age,score
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Aachen,40.0,228
Berlin,40.0,240
Frankfurt,40.0,216
Stuttgart,40.0,157


In [122]:
# FutureWarning: ['name'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.
# results = groups.agg(["sum", "mean", "max"])

results = groups["age"].agg(["sum", "mean", "max"])

results = groups[ ["age", "score"] ].agg(["sum", "mean", "max"])
results 

Unnamed: 0_level_0,age,age,age,score,score,score
Unnamed: 0_level_1,sum,mean,max,sum,mean,max
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Aachen,120,40.0,40,228,76.0,77
Berlin,120,40.0,40,240,80.0,90
Frankfurt,120,40.0,40,216,72.0,73
Stuttgart,80,40.0,40,157,78.5,79


In [123]:
# @Daniel Kübler
def success_status(scores):
    result = scores.mean()
    return "bestanden"  if result >= 75 else "durchgfallen"


results = groups["score"].agg(["mean", success_status])


results = groups["score"].agg(["mean", "max", success_status])
results

Unnamed: 0_level_0,mean,max,success_status
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aachen,76.0,77,bestanden
Berlin,80.0,90,bestanden
Frankfurt,72.0,73,durchgfallen
Stuttgart,78.5,79,bestanden


# Apply

apply functions on rows/columns

In [124]:
df = pd.DataFrame({
    "numbers1" : [10, 20],
    "numbers2" : [100, 200],
            })


def increment(x):
    return x + 50

df2 = df.apply(increment) # give only the reference of the function

df2.head()

Unnamed: 0,numbers1,numbers2
0,60,150
1,70,250


In [125]:
# Alternative : using a lambda function
df3 = df.apply(lambda x: x + 50)
df3.head()

Unnamed: 0,numbers1,numbers2
0,60,150
1,70,250


In [126]:
# Alternative using numpy functions
df4 = df.apply(np.sum, axis = 0) # axis = 0 (sum over the columns)
df4 = df.apply(np.sum, axis = 1) # axis = 1 (sum over the rows)
df4

0    110
1    220
dtype: int64

# Applymap

it applys functions on the elements it self (element-wise)

In [127]:
df = pd.DataFrame({
    "name": ["Thomas", "Ingo", "Sara"],
    "city": ["berLin", "hamBurG", "stuttGART"],
})

df_upper = df.applymap(str.upper)

df_upper.head()


Unnamed: 0,name,city
0,THOMAS,BERLIN
1,INGO,HAMBURG
2,SARA,STUTTGART


# DateTime in Pandas

MM/DD

In [128]:
# Create DaTaFrame
df = pd.DataFrame({
    "date":["03/15/2023","a/16/2023","03/17/2023"],
    "score": [30,35,21]
})




df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3 non-null      object
 1   score   3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [129]:
# Convert to real DateTime column
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# df["date"] =  pd.to_datetime(df["date"])
# df["date"] =  pd.to_datetime(df["date"], dayfirst= True) # DD MM --> Make the first arg is the day
# df["date"] =  pd.to_datetime(df["date"], format = "%Y-%m-%d %H:%M:%S") 

# Conversion Errors
#~~~~~~~~~~~~~~~~~~~~
# df["date"] =  pd.to_datetime(df["date"], errors = "ignore") # If there is an error -> Ignore for the cell
df["date"] =  pd.to_datetime(df["date"], errors = "coerce") # Force an invalid Value to NAT


df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2 non-null      datetime64[ns]
 1   score   3 non-null      int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 176.0 bytes


## Datetime Attributes

In [157]:
# Create DaTaFrame
df = pd.DataFrame({
    "date":["03/15/2023","03/16/2023","03/17/2023"],
    "score": [30,35,21]
})

# Convert
df["date"] =  pd.to_datetime(df["date"])

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3 non-null      datetime64[ns]
 1   score   3 non-null      int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 176.0 bytes


Unnamed: 0,date,score
0,2023-03-15,30
1,2023-03-16,35
2,2023-03-17,21


In [158]:
# Create new columns via parsing from DateTime Column
df["year"] =  df["date"].dt.year
df["month"] =  df["date"].dt.month
df["day"] =  df["date"].dt.day

df["week"] =  df["date"].dt.week
df["quarter"] =  df["date"].dt.quarter
df["weekday"] =  df["date"].dt.weekday
df["is_leap_year"] =  df["date"].dt.is_leap_year


df["month_name"] =  df["date"].dt.month_name()
df["day_name"] =  df["date"].dt.day_name()


df.head()

  df["week"] =  df["date"].dt.week


Unnamed: 0,date,score,year,month,day,week,quarter,weekday,is_leap_year,month_name,day_name
0,2023-03-15,30,2023,3,15,11,1,2,False,March,Wednesday
1,2023-03-16,35,2023,3,16,11,1,3,False,March,Thursday
2,2023-03-17,21,2023,3,17,11,1,4,False,March,Friday


In [159]:
# Map new data to DateTime

date_mapping = {
    0 : "Montag",
    1: "Dienstag",
    2: "Mittwoch",
    3: "Donnerstag",
    4: "Freitag",
    5: "Samstag",
    6: "Sonntag"    
}

# Map the values from Pandas to my dictionary
df["day_of_week_de"]= df["date"].dt.day_of_week.map(date_mapping)

df.head()

Unnamed: 0,date,score,year,month,day,week,quarter,weekday,is_leap_year,month_name,day_name,day_of_week_de
0,2023-03-15,30,2023,3,15,11,1,2,False,March,Wednesday,Mittwoch
1,2023-03-16,35,2023,3,16,11,1,3,False,March,Thursday,Donnerstag
2,2023-03-17,21,2023,3,17,11,1,4,False,March,Friday,Freitag


## DateTime functions

In [168]:
df["date"].min()
df["date"].max()
df["date"].mean()
df["date"].median()

# Search for records where the date is between
df.loc[ df["date"].between("2023-03-10", "2023-03-16") ]


Unnamed: 0,date,score,year,month,day,week,quarter,weekday,is_leap_year,month_name,day_name,day_of_week_de
0,2023-03-15,30,2023,3,15,11,1,2,False,March,Wednesday,Mittwoch
1,2023-03-16,35,2023,3,16,11,1,3,False,March,Thursday,Donnerstag


## Read DateTime directly by creating the DataFrame (ex. CSV File)

In [153]:
# Bad Way --> During creating DataFrame -> Parse + convert the DateTime Columns to the correct data type
df = pd.read_csv("./my_data.csv", parse_dates= ["Order Date"])

# ! Achtung: Dataset has the duplicated header -> therefore the Datetime is not converted
# Alternative
#~~~~~~~~~~~~~~~
df = pd.read_csv("./my_data.csv")
df.drop_duplicates(inplace= True) # Drop Duplicated Header also
df["Order Date"] = pd.to_datetime(df["Order Date"], errors = "coerce")


In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 185688 entries, 0 to 186849
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Order ID          185687 non-null  object        
 1   Product           185687 non-null  object        
 2   Quantity Ordered  185687 non-null  object        
 3   Price Each        185687 non-null  object        
 4   Order Date        185686 non-null  datetime64[ns]
 5   Purchase Address  185687 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 9.9+ MB
