# Pandas: Selektieren, Slicen, Filtern

### Beispiel-Datensätze: `seaborn` Paket

`seaborn` ist ein wichtiges package, mit dem man vor allem Daten visualisiert.
Wir werden im weiteren Kursverlauf auch darauf weiter eingehen. Für den Moment wollen wir uns aber nur bei den Beispieldatensätzen bedienen, die es mitliefert.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# seaborn hat einige Datensätze zur Auswahl, hier die Liste
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [3]:
# Laden eines Datensatzes:
peng_df = sns.load_dataset("penguins")
peng_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [4]:
# Datensatz besteht aus 344 Zeilen.
# Wollen wir nur einen Teil davon sehen, nutzen wir
# .head(x) (zeigt nur die ersten x Zeilen an)
# Beispiel mit 8 Zeilen:
peng_df.head(8)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male


In [5]:
# Standard (default value) sind übrigens 5 Zeilen:
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [6]:
# oder .tail(x) (zeigt nur die unteren x Zeilen an)
# auch hier ist 5 Standardwert:
peng_df.tail()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female
343,Gentoo,Biscoe,49.9,16.1,213.0,5400.0,Male


In [7]:
# Dieses Mal gibt es mehr zu beschreiben:
peng_df.describe()
# NaNs fließen nicht in die Berechnung ein (deswegen count: 342)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [8]:
# Welche Form hat mein DataFrame? (Zeilen, Spalten)
peng_df.shape

(344, 7)

In [9]:
# Wie viele Einträge gibt es insgesamt in df?
# --> Einträge = Zeilen * Spalten
peng_df.size

2408

In [10]:
# Welche Dimension hat mein DataFrame?
peng_df.ndim

2

In [12]:
# Welche Datentypen stecken in den Spalten?
peng_df.dtypes

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

### DataFrame Indexing und Bearbeitung

In [15]:
# Spalten können als Index oder Instanzen-Attribut abgerufen werden
print(peng_df['species'])
print()
print(peng_df.species)
# Wir werden im Weiteren die letztere Schreibweise ignorieren,
# da sie weniger Flexibilität bietet.

0      Adelie
1      Adelie
2      Adelie
3      Adelie
4      Adelie
        ...  
339    Gentoo
340    Gentoo
341    Gentoo
342    Gentoo
343    Gentoo
Name: species, Length: 344, dtype: object

0      Adelie
1      Adelie
2      Adelie
3      Adelie
4      Adelie
        ...  
339    Gentoo
340    Gentoo
341    Gentoo
342    Gentoo
343    Gentoo
Name: species, Length: 344, dtype: object


In [16]:
# Welchen Datentyp hat unsere Spalte?
type(peng_df['species'])

pandas.core.series.Series

In [17]:
# Wie sieht der df momentan aus?
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [23]:
# Einfachste Art Spalte hinzuzufügen mit Konstante:
peng_df['just_for_fun'] = 666

In [24]:
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,alive,just_for_fun
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0,666
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1,666
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,666
3,Adelie,Torgersen,,,,,,1,666
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1,666


In [None]:
# Mit len kriegt man die Anzahl der Zeilen:
len(peng_df)

344

In [20]:
np.random.randint(0, 2, len(peng_df))

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,

In [None]:
# Wenn wir für alle Zeilen unterschiedliche Werte eintragen wollen,
# müssen wir so viele Werte wie Zeilen liefern
peng_df['alive'] = np.random.randint(0, 2, len(peng_df))

In [22]:
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,alive
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0
3,Adelie,Torgersen,,,,,,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1


In [None]:
# Wollen wir die Spalte an eine bestimmte Stelle einfügen, dann nutzen wir insert.
# Dazu an anderer Stelle mehr.

In [25]:
# Checken, welche Werte in der alive-Spalte vorkommen:
peng_df['alive'].unique()

array([0, 1])

In [30]:
# Jetzt seid ihr dran: Wie prüfe ich, welche Spezies / Inseln / Geschlechter vorkommen?
print(peng_df['species'].unique())
print(peng_df['island'].unique())
print(peng_df['sex'].unique())

['Adelie' 'Chinstrap' 'Gentoo']
['Torgersen' 'Biscoe' 'Dream']
['Male' 'Female' nan]


In [28]:
# Will man nur wissen, wie viele einzigartige Werte vorkommen, dann hilft
# nunique:
peng_df['island'].nunique()

3

In [None]:
# NaNs automatisch weggelassen (bei Bedarf dropna=False)
peng_df.sex.nunique()

2

In [None]:
# Möchte man wissen, wie viele Einträge pro Label vorkommen (absolute Zahlen), 
# dann hilft value_counts():
peng_df['sex'].value_counts()

Male      168
Female    165
Name: sex, dtype: int64

In [None]:
# Bei numerischen Daten meist nicht so sinnvoll:
peng_df['flipper_length_mm'].value_counts()

190.0    22
195.0    17
187.0    16
193.0    15
210.0    14
191.0    13
215.0    12
197.0    10
196.0    10
185.0     9
220.0     8
198.0     8
208.0     8
216.0     8
212.0     7
186.0     7
181.0     7
189.0     7
230.0     7
192.0     7
184.0     7
199.0     6
213.0     6
188.0     6
214.0     6
217.0     6
222.0     6
201.0     6
219.0     5
209.0     5
218.0     5
221.0     5
203.0     5
194.0     5
180.0     5
178.0     4
225.0     4
228.0     4
202.0     4
200.0     4
182.0     3
224.0     3
205.0     3
229.0     2
183.0     2
207.0     2
223.0     2
211.0     2
231.0     1
206.0     1
174.0     1
172.0     1
179.0     1
176.0     1
226.0     1
Name: flipper_length_mm, dtype: int64

In [36]:
peng_df['island'].value_counts()

Biscoe       168
Dream        124
Torgersen     52
Name: island, dtype: int64

In [38]:
# Es wird noch besser: value_counts kann direkt die Anteile ausrechnen!
peng_df['species'].value_counts(normalize=True)

Adelie       0.441860
Gentoo       0.360465
Chinstrap    0.197674
Name: species, dtype: float64

In [45]:
# Abruf von mehreren Spalten mit einem Befehl
# erfordert eine Liste innerhalb der []:
selected_df = peng_df[['species', 'island', 'body_mass_g']]
# Expliziter: selected_df = peng_df.loc[:, ['species', 'island', 'body_mass_g']]
selected_df.head()

Unnamed: 0,species,island,body_mass_g
0,Adelie,Torgersen,3750.0
1,Adelie,Torgersen,3800.0
2,Adelie,Torgersen,3250.0
3,Adelie,Torgersen,
4,Adelie,Torgersen,3450.0


In [46]:
# Abruf von mehreren Spalten mit begrenzter Anzahl an Zeilen
# und nur jede zweite Zeile
# Achtung: Ende ist inklusiv!
peng_df.loc[:20:2, ['species', 'island', 'body_mass_g']]

Unnamed: 0,species,island,body_mass_g
0,Adelie,Torgersen,3750.0
2,Adelie,Torgersen,3250.0
4,Adelie,Torgersen,3450.0
6,Adelie,Torgersen,3625.0
8,Adelie,Torgersen,3475.0
10,Adelie,Torgersen,3300.0
12,Adelie,Torgersen,3200.0
14,Adelie,Torgersen,4400.0
16,Adelie,Torgersen,3450.0
18,Adelie,Torgersen,3325.0


In [None]:
# Weil Felix das unbedingt wollte (kann bei bekannten spezifischen Indizes super nützlich sein):
peng_df.loc[[100, 200, 300], ['species', 'island', 'body_mass_g']]

Unnamed: 0,species,island,body_mass_g
100,Adelie,Biscoe,3725.0
200,Chinstrap,Dream,3250.0
300,Gentoo,Biscoe,4625.0


In [None]:
# Normalerweise ist der Endindex beim Programmieren exklusiv, etwa bei einer Liste

In [48]:
peng_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,alive,just_for_fun
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0,666
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1,666
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,666
3,Adelie,Torgersen,,,,,,1,666
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1,666
...,...,...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,,0,666
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,1,666
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,0,666
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,1,666


In [51]:
# Mit .iloc ist es auch in der Tat exklusiv!
peng_df.iloc[50:60, 1:6]

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
50,Biscoe,39.6,17.7,186.0,3500.0
51,Biscoe,40.1,18.9,188.0,4300.0
52,Biscoe,35.0,17.9,190.0,3450.0
53,Biscoe,42.0,19.5,200.0,4050.0
54,Biscoe,34.5,18.1,187.0,2900.0
55,Biscoe,41.4,18.6,191.0,3700.0
56,Biscoe,39.0,17.5,186.0,3550.0
57,Biscoe,40.6,18.8,193.0,3800.0
58,Biscoe,36.5,16.6,181.0,2850.0
59,Biscoe,37.6,19.1,194.0,3750.0


In [53]:
# Alle numerischen Spalten holen (umständlich):
peng_df[['bill_length_mm', 'bill_depth_mm', 
         'flipper_length_mm', 'body_mass_g', 'alive', 'just_for_fun']]

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,alive,just_for_fun
0,39.1,18.7,181.0,3750.0,0,666
1,39.5,17.4,186.0,3800.0,1,666
2,40.3,18.0,195.0,3250.0,0,666
3,,,,,1,666
4,36.7,19.3,193.0,3450.0,1,666
...,...,...,...,...,...,...
339,,,,,0,666
340,46.8,14.3,215.0,4850.0,1,666
341,50.4,15.7,222.0,5750.0,0,666
342,45.2,14.8,212.0,5200.0,1,666


In [None]:
# Mit select_dtypes können wir bequem nur den Teil des DataFrames herausgreifen,
# der Objekte des gewählten Typs enthält!
# Number gibt uns alles Numerische her:
peng_df.select_dtypes('number')

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,alive,just_for_fun
0,39.1,18.7,181.0,3750.0,0,666
1,39.5,17.4,186.0,3800.0,1,666
2,40.3,18.0,195.0,3250.0,0,666
3,,,,,1,666
4,36.7,19.3,193.0,3450.0,1,666
...,...,...,...,...,...,...
339,,,,,0,666
340,46.8,14.3,215.0,4850.0,1,666
341,50.4,15.7,222.0,5750.0,0,666
342,45.2,14.8,212.0,5200.0,1,666


In [56]:
# Spezifischer nur Floats:
peng_df.select_dtypes('float')

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
3,,,,
4,36.7,19.3,193.0,3450.0
...,...,...,...,...
339,,,,
340,46.8,14.3,215.0,4850.0
341,50.4,15.7,222.0,5750.0
342,45.2,14.8,212.0,5200.0


In [57]:
# Nur Ints:
peng_df.select_dtypes('int')

Unnamed: 0,alive,just_for_fun
0,0,666
1,1,666
2,0,666
3,1,666
4,1,666
...,...,...
339,0,666
340,1,666
341,0,666
342,1,666


In [58]:
# Bsp. object führt hier zu allen Spalten, die Strings enthalten (auch object ist weiter gefasst!):
peng_df.select_dtypes('object')

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,Male
1,Adelie,Torgersen,Female
2,Adelie,Torgersen,Female
3,Adelie,Torgersen,
4,Adelie,Torgersen,Female
...,...,...,...
339,Gentoo,Biscoe,
340,Gentoo,Biscoe,Female
341,Gentoo,Biscoe,Male
342,Gentoo,Biscoe,Female


In [None]:
# Geht auch mit Datetime-Objekten, vielleicht später mehr dazu.

In [None]:
# Verkettung ist leicht möglich:
peng_df.select_dtypes('object')\
       .describe()
# Bei längerer Verkettung ist \ üblich.

Unnamed: 0,species,island,sex
count,344,344,333
unique,3,3,2
top,Adelie,Biscoe,Male
freq,152,168,168


#### Übungsaufgabe mit DataFrame
Lade den Titanic-Datensatz und löse die nachfolgenden Aufgabenstellungen<br>
Zeit: 20 Minuten<br><br>
Tipp: Wenn du bei einer Aufgabe einfach nicht weiter weißt, geh erstmal zur nächsten.
<hr>


In [None]:
# Führe diesen Code aus, um den Titanic-Datensatz zu laden:
titanic_df = sns.load_dataset("titanic")

In [None]:
# Schaue dir die obersten zehn Einträge an:
titanic_df.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [None]:
# Wie viele Zeilen und Spalten umfasst der Datensatz?
titanic_df.shape

(891, 15)

In [None]:
# Wie viele Elemente / Daten enthält der Datensatz?
titanic_df.size

13365

In [None]:
# Lass dir eine statistische Zusammenfassung 
# der Altersverteilung auf der Titanic anzeigen:
titanic_df['age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

In [None]:
# Wie viele Menschen (absolute Zahlen) sind gestorben oder haben überlebt? 
# Zeige mit Pandas-Mitteln:
titanic_df['alive'].value_counts()

no     549
yes    342
Name: alive, dtype: int64

In [None]:
# Wie hoch waren die Anteile der verschiedenen Klassen auf der Titanic? 
# Zeige mit Pandas-Mitteln:
titanic_df['class'].value_counts(normalize=True) #  * 100

Third     0.551066
First     0.242424
Second    0.206510
Name: class, dtype: float64

In [None]:
# Welche Städte (mit Namen) gab es, an denen Passagiere zugestiegen sind?
titanic_df['embark_town'].unique()

array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)

In [None]:
# Was hat ein Ticket im Schnitt gekostet? 
# Was hat das teuerste, was das günstigste gekostet?
print(titanic_df['fare'].mean())
print(titanic_df['fare'].min())
print(titanic_df['fare'].max())

32.204207968574636
0.0
512.3292


In [None]:
# Wähle einen Ausschnitt aus dem DataFrame aus, der Folgendes umfasst: 
# Spalten 'survived', 'sex', 'adult_male' und 'class' sowie die ersten 250 Zeilen.
titanic_df.loc[:250, ['survived', 'sex', 'adult_male', 'class']]

Unnamed: 0,survived,sex,adult_male,class
0,0,male,True,Third
1,1,female,False,First
2,1,female,False,Third
3,1,female,False,First
4,0,male,True,Third
...,...,...,...,...
246,0,female,False,Third
247,1,female,False,Second
248,1,male,True,First
249,0,male,True,Second


In [None]:
# Szenario: Für die weitere Analyse brauchst du keine Strings.
# Wie wählst du nur die numerischen Spalten des Dataframes aus?
titanic_df.select_dtypes('number')

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.2500
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.9250
3,1,1,35.0,1,0,53.1000
4,0,3,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000
887,1,1,19.0,0,0,30.0000
888,0,3,,1,2,23.4500
889,1,1,26.0,0,0,30.0000


In [None]:
### Ende der Übung!

### Löschen und Umbenennen


In [63]:
# Zurück zu den Pinguinen:
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,alive,just_for_fun
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0,666
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1,666
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,666
3,Adelie,Torgersen,,,,,,1,666
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1,666


In [None]:
# Spalten löschen mit del:
del peng_df['just_for_fun']  
# Mehrfaches Ausführen erzeugt KeyError!

KeyError: 'just_for_fun'

In [65]:
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,alive
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0
3,Adelie,Torgersen,,,,,,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1


In [None]:
# Typischer: Zeilen und/oder Spalten löschen mit .drop()
# Warum klappt das nicht einfach mit Spaltennamen?
peng_df.drop('alive', axis=1, inplace=True)
# Wir müssen die Achse festlegen, sonst sucht Pandas nach 'alive' in den Zeilen!

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [69]:
# Aber: df unverändert nach .drop()
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,alive
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0
3,Adelie,Torgersen,,,,,,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1


In [None]:
# Was wir oben sahen, ist eine Kopie des DataFrames.
# Damit der ursprünglich DF damit überschrieben wird, müssen wir inplace auf True setzen:
peng_df.drop('alive', axis=1, inplace=True)

# Alternative:
# peng_df = peng_df.drop('alive', axis=1)

In [71]:
# Jetzt passt es:
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [72]:
# Wegwerfspalten erstellen:
peng_df['Wegwerfspalte1'] = -999
peng_df['Wegwerfspalte2'] = -999
peng_df['Wegwerfspalte3'] = -999
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Wegwerfspalte1,Wegwerfspalte2,Wegwerfspalte3
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,-999,-999,-999
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,-999,-999,-999
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,-999,-999,-999
3,Adelie,Torgersen,,,,,,-999,-999,-999
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,-999,-999,-999


In [None]:
# Mit Listen möglich:
peng_df.drop(['Wegwerfspalte1', 'Wegwerfspalte2', 'Wegwerfspalte3'],
             axis=1, inplace=True)

peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [74]:
# Wegwerfspalten erstellen:
peng_df['Wegwerfspalte1'] = -999
peng_df['Wegwerfspalte2'] = -999
peng_df['Wegwerfspalte3'] = -999
peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Wegwerfspalte1,Wegwerfspalte2,Wegwerfspalte3
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,-999,-999,-999
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,-999,-999,-999
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,-999,-999,-999
3,Adelie,Torgersen,,,,,,-999,-999,-999
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,-999,-999,-999


In [75]:
# Ohne axis auch über columns-Parameter möglich:
peng_df.drop(columns=['Wegwerfspalte1', 'Wegwerfspalte2', 'Wegwerfspalte3'],
             inplace=True)

peng_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [78]:
# Die erste fünf Indexeinträge wegschmeißen über axis 0:
peng_df.drop([0, 1, 2, 3, 4])

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [80]:
# Die ersten fünf Zeilen wegschmeißen mit Parameter index und dann ohne axis
# + Spalten:
peng_df.drop(index=[0, 1, 2, 3, 4],
             columns=['species', 'sex'])

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
5,Torgersen,39.3,20.6,190.0,3650.0
6,Torgersen,38.9,17.8,181.0,3625.0
7,Torgersen,39.2,19.6,195.0,4675.0
8,Torgersen,34.1,18.1,193.0,3475.0
9,Torgersen,42.0,20.2,190.0,4250.0
...,...,...,...,...,...
339,Biscoe,,,,
340,Biscoe,46.8,14.3,215.0,4850.0
341,Biscoe,50.4,15.7,222.0,5750.0
342,Biscoe,45.2,14.8,212.0,5200.0


In [83]:
# Zeilen / Spalten umbenennen
translation_dict = {'bill_length_mm': 'Schnabellaenge_mm',
                    'bill_depth_mm': 'Schnabelbreite_mm',
                    'flipper_length_mm': 'Fluegellaenge_mm'}

peng_df.rename(columns=translation_dict, inplace=True)

In [84]:
peng_df.head()

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [None]:
# Geht auch mit Indices. Nur dann index= ... ausfüllen

### Filtern von DataFrames

##### Mit Methode `filter` (good to know)
Über Zeilen oder Spalten mit einem Filter gehen

In [86]:
# Filter von Spalten mit Namensbestandteilen
peng_df.filter(like='Schnabel', axis=1)

Unnamed: 0,Schnabellaenge_mm,Schnabelbreite_mm
0,39.1,18.7
1,39.5,17.4
2,40.3,18.0
3,,
4,36.7,19.3
...,...,...
339,,
340,46.8,14.3
341,50.4,15.7
342,45.2,14.8


In [87]:
# KI-generierte falsche IBANs:
data = {
    'Name': [
        'John Doe', 'Alice Müller', 'Max Mustermann', 'Maria Schmidt', 'Lukas Weber', 
        'Anna Bauer', 'Paul Klein', 'Julia Richter', 'Stefan Maier', 'Katrin Hoffmann',
        'Timo Schmitt', 'Laura Fischer', 'David Braun', 'Hannah Wagner', 'Sebastian Klein',
        'Felix Schuster', 'Eva Lange', 'Simon Meyer', 'Clara Zimmermann', 'Nico Becker'
    ],
    'Country': [
        'Germany', 'Austria', 'Germany', 'Luxembourg', 'Austria', 
        'Germany', 'Luxembourg', 'Austria', 'Germany', 'Luxembourg',
        'Germany', 'Austria', 'Germany', 'Austria', 'Luxembourg',
        'Germany', 'Austria', 'Germany', 'Luxembourg', 'Austria'
    ],
    'IBAN': [
        'DE89370400440532013000',  # Germany
        'AT611904300234573201',  # Austria
        'DE12500105170648489890',  # Germany
        'LU280019400644750000',  # Luxembourg
        'AT611904300234573202',  # Austria
        'DE25500105175665137000',  # Germany
        'LU280019400644750001',  # Luxembourg
        'AT611904300234573203',  # Austria
        'DE12500105170648489891',  # Germany
        'LU280019400644750002',  # Luxembourg
        'DE89500105175665137100',  # Germany
        'AT611904300234573204',  # Austria
        'DE17500105170648489892',  # Germany
        'AT611904300234573205',  # Austria
        'LU280019400644750003',  # Luxembourg
        'DE15500105175665137200',  # Germany
        'AT611904300234573206',  # Austria
        'DE89500105170648489893',  # Germany
        'LU280019400644750004',  # Luxembourg
        'AT611904300234573207'   # Austria
    ]
}

# Construct the DataFrame
accounts_df = pd.DataFrame(data)

print(accounts_df)

                Name     Country                    IBAN
0           John Doe     Germany  DE89370400440532013000
1       Alice Müller     Austria    AT611904300234573201
2     Max Mustermann     Germany  DE12500105170648489890
3      Maria Schmidt  Luxembourg    LU280019400644750000
4        Lukas Weber     Austria    AT611904300234573202
5         Anna Bauer     Germany  DE25500105175665137000
6         Paul Klein  Luxembourg    LU280019400644750001
7      Julia Richter     Austria    AT611904300234573203
8       Stefan Maier     Germany  DE12500105170648489891
9    Katrin Hoffmann  Luxembourg    LU280019400644750002
10      Timo Schmitt     Germany  DE89500105175665137100
11     Laura Fischer     Austria    AT611904300234573204
12       David Braun     Germany  DE17500105170648489892
13     Hannah Wagner     Austria    AT611904300234573205
14   Sebastian Klein  Luxembourg    LU280019400644750003
15    Felix Schuster     Germany  DE15500105175665137200
16         Eva Lange     Austri

In [88]:
# Schauen wir uns später genauer an, ernennt aber eine Spalte zum Index:
accounts_df.set_index('IBAN', inplace=True)
accounts_df.head()

Unnamed: 0_level_0,Name,Country
IBAN,Unnamed: 1_level_1,Unnamed: 2_level_1
DE89370400440532013000,John Doe,Germany
AT611904300234573201,Alice Müller,Austria
DE12500105170648489890,Max Mustermann,Germany
LU280019400644750000,Maria Schmidt,Luxembourg
AT611904300234573202,Lukas Weber,Austria


In [97]:
# Nun filtern wir alle deutschen IBANs heraus!
accounts_df.filter(like='DE', axis=0)

Unnamed: 0_level_0,Name,Country
IBAN,Unnamed: 1_level_1,Unnamed: 2_level_1
DE89370400440532013000,John Doe,Germany
DE12500105170648489890,Max Mustermann,Germany
DE25500105175665137000,Anna Bauer,Germany
DE12500105170648489891,Stefan Maier,Germany
DE89500105175665137100,Timo Schmitt,Germany
DE17500105170648489892,David Braun,Germany
DE15500105175665137200,Felix Schuster,Germany
DE89500105170648489893,Simon Meyer,Germany


--> Filter wird nur auf die Spalten- oder Zeilen-Label verwendet, nicht auf die Inhalte der Tabelle.

In [None]:
# Hier könnte Ihre Mini-Pause stattfinden! (sofern die Zeit es erlaubt)

#### Über Bedingungen: Conditional Slicing mit Wahrheitsmasken (important to know)

In [99]:
# Wir wollen den Ausschnitt des DataFrames für die Spezies 'Adelie' haben:
# Mit dieser Wahrheitsmaske werden wir unser Ziel erreichen.
# Bei den Zeilen, in denen Adelies vorkommen steht ein True:
peng_df['species'] == 'Adelie'

0       True
1       True
2       True
3       True
4       True
       ...  
339    False
340    False
341    False
342    False
343    False
Name: species, Length: 344, dtype: bool

In [101]:
# Damit slicen wir nun den gesamten DataFrame!
adelies_only = peng_df[peng_df['species'] == 'Adelie']
adelies_only

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
147,Adelie,Dream,36.6,18.4,184.0,3475.0,Female
148,Adelie,Dream,36.0,17.8,195.0,3450.0,Female
149,Adelie,Dream,37.8,18.1,193.0,3750.0,Male
150,Adelie,Dream,36.0,17.1,187.0,3700.0,Female


In [102]:
# Wie testen wir, ob wirklich nur Adelies vorkommen?
adelies_only['species'].unique()

array(['Adelie'], dtype=object)

In [103]:
# Wir wollen alle Informationen haben, aber nur für
# Pinguine mit Schnabellänge > 40
peng_df['Schnabellaenge_mm'] > 40

0      False
1      False
2       True
3      False
4      False
       ...  
339    False
340     True
341     True
342     True
343     True
Name: Schnabellaenge_mm, Length: 344, dtype: bool

In [104]:
peng_df[peng_df['Schnabellaenge_mm'] > 40]

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,
12,Adelie,Torgersen,41.1,17.6,182.0,3200.0,Female
17,Adelie,Torgersen,42.5,20.7,197.0,4500.0,Male
19,Adelie,Torgersen,46.0,21.5,194.0,4200.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [105]:
# Wir wollen nur die species wissen, von Pinguinen mit 
# einer body_mass < 3000
peng_df[peng_df['body_mass_g'] < 3000]

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
47,Adelie,Dream,37.5,18.9,179.0,2975.0,
54,Adelie,Biscoe,34.5,18.1,187.0,2900.0,Female
58,Adelie,Biscoe,36.5,16.6,181.0,2850.0,Female
64,Adelie,Biscoe,36.4,17.1,184.0,2850.0,Female
98,Adelie,Dream,33.1,16.1,178.0,2900.0,Female
104,Adelie,Biscoe,37.9,18.6,193.0,2925.0,Female
116,Adelie,Torgersen,38.6,17.0,188.0,2900.0,Female
174,Chinstrap,Dream,43.2,16.6,187.0,2900.0,Female
190,Chinstrap,Dream,46.9,16.6,192.0,2700.0,Female


In [106]:
# Kurz über Flügellängen schauen:
peng_df['Fluegellaenge_mm'].describe()

count    342.000000
mean     200.915205
std       14.061714
min      172.000000
25%      190.000000
50%      197.000000
75%      213.000000
max      231.000000
Name: Fluegellaenge_mm, dtype: float64

In [107]:
# Uns interessieren nur Pinguine mit Flügellänge 190 - 214
peng_df['Fluegellaenge_mm'].between(190, 214)  # Standard: Anfang und Ende sind inklusive

0      False
1      False
2       True
3      False
4       True
       ...  
339    False
340    False
341    False
342     True
343     True
Name: Fluegellaenge_mm, Length: 344, dtype: bool

In [109]:
# Slicen:
mid_pengs = peng_df[peng_df['Fluegellaenge_mm'].between(190, 214)]
mid_pengs

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
...,...,...,...,...,...,...,...
329,Gentoo,Biscoe,48.1,15.1,209.0,5500.0,Male
332,Gentoo,Biscoe,43.5,15.2,213.0,4650.0,Female
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [110]:
mid_pengs.sort_values(by='Fluegellaenge_mm')

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
74,Adelie,Torgersen,35.5,17.5,190.0,3700.0,Female
71,Adelie,Torgersen,39.7,18.4,190.0,3900.0,Male
170,Chinstrap,Dream,46.4,18.6,190.0,3450.0,Female
166,Chinstrap,Dream,45.9,17.1,190.0,3575.0,Female
52,Adelie,Biscoe,35.0,17.9,190.0,3450.0,Female
...,...,...,...,...,...,...,...
314,Gentoo,Biscoe,44.5,14.7,214.0,4850.0,Female
230,Gentoo,Biscoe,40.9,13.7,214.0,4650.0,Female
232,Gentoo,Biscoe,45.5,13.7,214.0,4650.0,Female
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female


In [112]:
sorted_peng = mid_pengs.sort_values(by='Fluegellaenge_mm', ascending=False)
sorted_peng

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
282,Gentoo,Biscoe,45.7,13.9,214.0,4400.0,Female
314,Gentoo,Biscoe,44.5,14.7,214.0,4850.0,Female
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
232,Gentoo,Biscoe,45.5,13.7,214.0,4650.0,Female
230,Gentoo,Biscoe,40.9,13.7,214.0,4650.0,Female
...,...,...,...,...,...,...,...
166,Chinstrap,Dream,45.9,17.1,190.0,3575.0,Female
68,Adelie,Torgersen,35.9,16.6,190.0,3050.0,Female
74,Adelie,Torgersen,35.5,17.5,190.0,3700.0,Female
70,Adelie,Torgersen,33.5,19.0,190.0,3600.0,Female


In [113]:
# Kurz ein weiteres Dataset für weitere Beispiele:
health_df = sns.load_dataset('healthexp')
health_df.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [117]:
# Für Deutschland und Frankreich (oder):
health_df[(health_df['Country'] == 'Germany') | (health_df['Country'] == 'France')]

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
6,1971,Germany,298.251,70.8
10,1972,Germany,337.364,71.0
13,1973,Germany,384.541,71.3
...,...,...,...,...
258,2018,France,5099.306,82.8
263,2019,Germany,6407.928,81.3
264,2019,France,5167.839,82.9
269,2020,Germany,6938.983,81.1


In [119]:
# Für Deutschland und mit Jahr ab 2000:
germany_from_2000 = health_df[(health_df['Country'] == 'Germany') & (health_df['Year'] >= 2000)]
germany_from_2000

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
149,2000,Germany,2895.533,78.2
155,2001,Germany,3009.368,78.5
161,2002,Germany,3239.77,78.5
167,2003,Germany,3329.374,78.6
173,2004,Germany,3391.521,79.2
179,2005,Germany,3429.955,79.4
185,2006,Germany,3567.061,79.8
191,2007,Germany,3750.787,80.1
197,2008,Germany,3955.136,80.2
203,2009,Germany,4158.266,80.3


In [122]:
# Negation (das not im Slicing) ~
# Hier alle mit Ausnahme von Deutschland:
not_germany = health_df[~(health_df['Country'] == 'Germany')]
not_germany

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9
5,1971,Canada,313.391,72.8
...,...,...,...,...
268,2020,Canada,5828.324,81.7
270,2020,France,5468.418,82.3
271,2020,Great Britain,5018.700,80.4
272,2020,Japan,4665.641,84.7


In [123]:
not_germany['Country'].unique()

array(['France', 'Great Britain', 'Japan', 'USA', 'Canada'], dtype=object)

In [124]:
'Germany' in not_germany['Country'].unique()

False

In [125]:
# Für eins der Länder: 'Great Britain', 'Japan', 'USA', 'Canada'
# Zunächst einmal Wahrheitsmaske zeigen:
countries = ['Great Britain', 'Japan', 'USA', 'Canada']

In [127]:
health_df.head()

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9


In [126]:
health_df['Country'].isin(countries)

0      False
1      False
2       True
3       True
4       True
       ...  
269    False
270    False
271     True
272     True
273     True
Name: Country, Length: 274, dtype: bool

In [128]:
# Das eigentliche Slicen:
health_df[health_df['Country'].isin(countries)]

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9
5,1971,Canada,313.391,72.8
7,1971,Great Britain,134.172,71.9
...,...,...,...,...
267,2019,USA,10855.517,78.8
268,2020,Canada,5828.324,81.7
271,2020,Great Britain,5018.700,80.4
272,2020,Japan,4665.641,84.7


In [None]:
# Bisschen Refactoring:
is_germany = health_df['Country'] == 'Germany'
starting_from_2000 = health_df['Year'] >= 2000

health_df[is_germany & starting_from_2000]

#### Sortieren mit `sort_values` und `sort_index`

In [132]:
peng_df

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [130]:
peng_df.sort_index(ascending=False)

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
343,Gentoo,Biscoe,49.9,16.1,213.0,5400.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
339,Gentoo,Biscoe,,,,,
...,...,...,...,...,...,...,...
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
3,Adelie,Torgersen,,,,,
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female


In [None]:
# Standardverhalten von Sortieren ist ascending=True, also aufsteigend.
# Wollen wir das anders, müssen wir explizit ascending auf False setzen:
peng_df.sort_values('Schnabellaenge_mm', ascending=False)

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
253,Gentoo,Biscoe,59.6,17.0,230.0,6050.0,Male
169,Chinstrap,Dream,58.0,17.8,181.0,3700.0,Female
321,Gentoo,Biscoe,55.9,17.0,228.0,5600.0,Male
215,Chinstrap,Dream,55.8,19.8,207.0,4000.0,Male
335,Gentoo,Biscoe,55.1,16.0,230.0,5850.0,Male
...,...,...,...,...,...,...,...
70,Adelie,Torgersen,33.5,19.0,190.0,3600.0,Female
98,Adelie,Dream,33.1,16.1,178.0,2900.0,Female
142,Adelie,Dream,32.1,15.5,188.0,3050.0,Female
3,Adelie,Torgersen,,,,,


In [None]:
# Wenn wir einen Index zurücksetzen wollen, wie den hier:
accounts_df

Unnamed: 0_level_0,Name,Country
IBAN,Unnamed: 1_level_1,Unnamed: 2_level_1
DE89370400440532013000,John Doe,Germany
AT611904300234573201,Alice Müller,Austria
DE12500105170648489890,Max Mustermann,Germany
LU280019400644750000,Maria Schmidt,Luxembourg
AT611904300234573202,Lukas Weber,Austria
DE25500105175665137000,Anna Bauer,Germany
LU280019400644750001,Paul Klein,Luxembourg
AT611904300234573203,Julia Richter,Austria
DE12500105170648489891,Stefan Maier,Germany
LU280019400644750002,Katrin Hoffmann,Luxembourg


In [None]:
# Können wir mit reset_index wieder einen neuen numerischen Index erzeugen lassen:
accounts_df.reset_index()

Unnamed: 0,IBAN,Name,Country
0,DE89370400440532013000,John Doe,Germany
1,AT611904300234573201,Alice Müller,Austria
2,DE12500105170648489890,Max Mustermann,Germany
3,LU280019400644750000,Maria Schmidt,Luxembourg
4,AT611904300234573202,Lukas Weber,Austria
5,DE25500105175665137000,Anna Bauer,Germany
6,LU280019400644750001,Paul Klein,Luxembourg
7,AT611904300234573203,Julia Richter,Austria
8,DE12500105170648489891,Stefan Maier,Germany
9,LU280019400644750002,Katrin Hoffmann,Luxembourg


In [None]:
# Wenn wir den alten Index dabei wegschmeißen wollen + inplace auf True setzen,
# wenn das Ergebnis "bleiben" soll:
accounts_df.reset_index(drop=True)

Unnamed: 0,Name,Country
0,John Doe,Germany
1,Alice Müller,Austria
2,Max Mustermann,Germany
3,Maria Schmidt,Luxembourg
4,Lukas Weber,Austria
5,Anna Bauer,Germany
6,Paul Klein,Luxembourg
7,Julia Richter,Austria
8,Stefan Maier,Germany
9,Katrin Hoffmann,Luxembourg


In [136]:
accounts_df

Unnamed: 0_level_0,Name,Country
IBAN,Unnamed: 1_level_1,Unnamed: 2_level_1
DE89370400440532013000,John Doe,Germany
AT611904300234573201,Alice Müller,Austria
DE12500105170648489890,Max Mustermann,Germany
LU280019400644750000,Maria Schmidt,Luxembourg
AT611904300234573202,Lukas Weber,Austria
DE25500105175665137000,Anna Bauer,Germany
LU280019400644750001,Paul Klein,Luxembourg
AT611904300234573203,Julia Richter,Austria
DE12500105170648489891,Stefan Maier,Germany
LU280019400644750002,Katrin Hoffmann,Luxembourg


In [137]:
accounts_df.reset_index(inplace=True)

In [138]:
accounts_df

Unnamed: 0,IBAN,Name,Country
0,DE89370400440532013000,John Doe,Germany
1,AT611904300234573201,Alice Müller,Austria
2,DE12500105170648489890,Max Mustermann,Germany
3,LU280019400644750000,Maria Schmidt,Luxembourg
4,AT611904300234573202,Lukas Weber,Austria
5,DE25500105175665137000,Anna Bauer,Germany
6,LU280019400644750001,Paul Klein,Luxembourg
7,AT611904300234573203,Julia Richter,Austria
8,DE12500105170648489891,Stefan Maier,Germany
9,LU280019400644750002,Katrin Hoffmann,Luxembourg


In [140]:
peng_df.drop([0, 1, 2], inplace=True)

In [141]:
peng_df

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [143]:
peng_df.reset_index(drop=True, inplace=True)

In [146]:
peng_df

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
0,Adelie,Torgersen,,,,,
1,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
2,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
3,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female
4,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
...,...,...,...,...,...,...,...
336,Gentoo,Biscoe,,,,,
337,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
338,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
339,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [148]:
# Sortieren nach mehreren Inhalten:
peng_df.sort_values(["Fluegellaenge_mm", "Schnabellaenge_mm"]).head(10)

Unnamed: 0,species,island,Schnabellaenge_mm,Schnabelbreite_mm,Fluegellaenge_mm,body_mass_g,sex
25,Adelie,Biscoe,37.9,18.6,172.0,3150.0,Female
17,Adelie,Biscoe,37.8,18.3,174.0,3400.0,Female
119,Adelie,Torgersen,40.2,17.0,176.0,3450.0,Female
95,Adelie,Dream,33.1,16.1,178.0,2900.0,Female
28,Adelie,Dream,37.2,18.1,178.0,3900.0,Male
27,Adelie,Dream,39.5,16.7,178.0,3250.0,Female
155,Chinstrap,Dream,46.1,18.2,178.0,3250.0,Female
44,Adelie,Dream,37.5,18.9,179.0,2975.0,
18,Adelie,Biscoe,37.7,18.7,180.0,3600.0,Male
8,Adelie,Torgersen,37.8,17.3,180.0,3700.0,


In [None]:
# Ihr seid Helden! Bald werden sich die Dinge immer mehr wiederholen!