## Fehlerbehandlung mit Pandas

Es gibt verschiedene Funktionen, die Fehler (z. B. fehlende Werte) erkennen können. Danach können wir die Fehler beheben.

Viele dieser Funktionen kommen von Numpy

In [1]:
import numpy as np
import pandas as pd

In [4]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
pd.isnull(np.nan)  # alternative: pd.isna() -> same function

True

In [6]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
pd.notnull(np.nan)  # alternative: pd.notna() -> same function

False

Diese Funktionen funktionieren auch auf DataFrames und Serien

In [8]:
# Small example Series with one missing value (np.nan; the np.NaN alias was removed in NumPy 2.0).
serie = pd.Series([1, np.nan, 3])

In [9]:
pd.isnull(serie)

0    False
1     True
2    False
dtype: bool

In [10]:
serie.notnull()

0     True
1    False
2     True
dtype: bool

In [51]:
# Load the population table, indexed by country name.
# A forward slash is used in the path: it works on every OS, whereas the original
# "Data\PopulationData.csv" relied on "\P" being an unrecognised escape sequence
# (newer Python versions warn about that) and only resolved on Windows.
df = pd.read_csv(
    "Data/PopulationData.csv",
    delimiter=";",
    thousands=",",
    decimal=".",
    index_col="Country (or dependency)",
).drop(columns=["#"])  # drop the "#" column without mutating in place

In [16]:
df

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,,N.A.,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,,N.A.,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,,N.A.,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,,N.A.,N.A.,0 %,0.00 %


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Population(2020)  235 non-null    int64  
 1   YearlyChange      235 non-null    object 
 2   NetChange         235 non-null    int64  
 3   Density(P/Km²)    235 non-null    int64  
 4   Land Area(Km²)    235 non-null    int64  
 5   Migrants(net)     201 non-null    float64
 6   Fert.Rate         235 non-null    object 
 7   Med.Age           235 non-null    object 
 8   UrbanPop %        235 non-null    object 
 9   WorldShare        235 non-null    object 
dtypes: float64(1), int64(4), object(5)
memory usage: 20.2+ KB


## Fehler erkennen/entfernen

Hier sehen wir, dass in der Spalte Migrants(net) einige leere Werte (NaN) stehen.

Mithilfe von fillna() können diese Werte ersetzt und mithilfe von dropna() entfernt werden.

In [18]:
df.isnull()

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,False,False,False,False,False,False,False,False,False,False
India,False,False,False,False,False,False,False,False,False,False
United States,False,False,False,False,False,False,False,False,False,False
Indonesia,False,False,False,False,False,False,False,False,False,False
Pakistan,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
Montserrat,False,False,False,False,False,True,False,False,False,False
Falkland Islands,False,False,False,False,False,True,False,False,False,False
Niue,False,False,False,False,False,True,False,False,False,False
Tokelau,False,False,False,False,False,True,False,False,False,False


In [21]:
df["Migrants(net)"].isnull()

Country (or dependency)
China               False
India               False
United States       False
Indonesia           False
Pakistan            False
                    ...  
Montserrat           True
Falkland Islands     True
Niue                 True
Tokelau              True
Vatican State        True
Name: Migrants(net), Length: 235, dtype: bool

In [22]:
df[df["Migrants(net)"].isnull()]

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Isle of Man,85033,0.53 %,449,149,570,,N.A.,N.A.,53 %,0.00 %
Andorra,77265,0.16 %,123,164,470,,N.A.,N.A.,88 %,0.00 %
Dominica,71986,0.25 %,178,96,750,,N.A.,N.A.,74 %,0.00 %
Cayman Islands,65722,1.19 %,774,274,240,,N.A.,N.A.,97 %,0.00 %
Bermuda,62278,-0.36 %,-228,1246,50,,N.A.,N.A.,97 %,0.00 %
Marshall Islands,59190,0.68 %,399,329,180,,N.A.,N.A.,70 %,0.00 %
Northern Mariana Islands,57559,0.60 %,343,125,460,,N.A.,N.A.,88 %,0.00 %
Greenland,56770,0.17 %,98,0,410450,,N.A.,N.A.,87 %,0.00 %
American Samoa,55191,-0.22 %,-121,276,200,,N.A.,N.A.,88 %,0.00 %
Saint Kitts & Nevis,53199,0.71 %,376,205,260,,N.A.,N.A.,33 %,0.00 %


In [23]:
df.dropna()

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Aruba,106766,0.43 %,452,593,180,201.0,1.9,41,44 %,0.00 %
Tonga,105695,1.15 %,1201,147,720,-800.0,3.6,22,24 %,0.00 %
U.S. Virgin Islands,104425,-0.15 %,-153,298,350,-451.0,2.0,43,96 %,0.00 %
Seychelles,98347,0.62 %,608,214,460,-200.0,2.5,34,56 %,0.00 %


Hier fallen jetzt alle Datensätze weg, die irgendwo NaN enthalten

Wir können stattdessen auch Spalten löschen: Mit axis=1 werden alle Spalten entfernt, die mindestens ein NaN enthalten.

In [27]:
df.dropna(axis=1)

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
China,1439323776,0.39 %,5540090,153,9388211,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,N.A.,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,N.A.,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,N.A.,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,N.A.,N.A.,0 %,0.00 %


In [34]:
df.dropna(how="all")  # how="any" (default): drop a row with at least one NaN; how="all": drop a row only if all of its values are NaN

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,,N.A.,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,,N.A.,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,,N.A.,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,,N.A.,N.A.,0 %,0.00 %


In [35]:
df.dropna(thresh=2)  # thresh=2: keep only rows that have at least 2 non-NaN values

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,,N.A.,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,,N.A.,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,,N.A.,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,,N.A.,N.A.,0 %,0.00 %


## Fehlerhafte Werte ersetzen

Es gibt mehrere Möglichkeiten, Fehler zu korrigieren; welche davon sinnvoll ist, hängt vom Datensatz ab.

### fillna

Mit fillna können NaN Werte einfach korrigiert werden

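Es muss entweder ein fester Ersatzwert oder eine Füllmethode angegeben werden (in neueren pandas-Versionen stehen die Füllmethoden direkt als ffill() und bfill() zur Verfügung):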

- ffill: Füllt mit dem letzten validen vorherigen Datensatz auf

- bfill: Füllt mit dem nächsten Datensatz auf

In [41]:
# Forward-fill: every NaN takes the last valid value above it in the same column.
# fillna(method="ffill") is deprecated in current pandas; DataFrame.ffill() is the replacement.
df.ffill()

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,0.0,N.A.,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,0.0,N.A.,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,0.0,N.A.,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,0.0,N.A.,N.A.,0 %,0.00 %


In [52]:
# Persist the forward-fill for the one column that has NaN values.
# Series.ffill() replaces the deprecated fillna(method="ffill").
df["Migrants(net)"] = df["Migrants(net)"].ffill()

In [44]:
df

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,0.0,N.A.,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,0.0,N.A.,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,0.0,N.A.,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,0.0,N.A.,N.A.,0 %,0.00 %


## Fehlerhafte andere Werte ersetzen

Oftmals werden Fehler durch Formatierung verursacht (z.B. Prozentzeichen). Auch diese Fehler können behoben werden, hier muss allerdings manuell gehandelt werden.

In [47]:
df["Fert.Rate"] == "N.A."

Country (or dependency)
China               False
India               False
United States       False
Indonesia           False
Pakistan            False
                    ...  
Montserrat           True
Falkland Islands     True
Niue                 True
Tokelau              True
Vatican State        True
Name: Fert.Rate, Length: 235, dtype: bool

In [48]:
df[df["Fert.Rate"] == "N.A."]

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Isle of Man,85033,0.53 %,449,149,570,0.0,N.A.,N.A.,53 %,0.00 %
Andorra,77265,0.16 %,123,164,470,0.0,N.A.,N.A.,88 %,0.00 %
Dominica,71986,0.25 %,178,96,750,0.0,N.A.,N.A.,74 %,0.00 %
Cayman Islands,65722,1.19 %,774,274,240,0.0,N.A.,N.A.,97 %,0.00 %
Bermuda,62278,-0.36 %,-228,1246,50,0.0,N.A.,N.A.,97 %,0.00 %
Marshall Islands,59190,0.68 %,399,329,180,0.0,N.A.,N.A.,70 %,0.00 %
Northern Mariana Islands,57559,0.60 %,343,125,460,0.0,N.A.,N.A.,88 %,0.00 %
Greenland,56770,0.17 %,98,0,410450,0.0,N.A.,N.A.,87 %,0.00 %
American Samoa,55191,-0.22 %,-121,276,200,0.0,N.A.,N.A.,88 %,0.00 %
Saint Kitts & Nevis,53199,0.71 %,376,205,260,0.0,N.A.,N.A.,33 %,0.00 %


In [53]:
df.loc[df["Fert.Rate"] == "N.A.", "Fert.Rate"] = 0

In [54]:
df

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,0.0,0,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,0.0,0,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,0.0,0,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,0.0,0,N.A.,0 %,0.00 %


## Typkonvertierung

Nach der Fehlerbehebung sind oftmals die Typen der entsprechenden Spalten falsch

Hier muss eine Typkonvertierung erfolgen, und zwar mittels:

- convert_dtypes(): Automatisch für alle Spalten; gibt ein neues DataFrame zurück (das Ergebnis muss also zugewiesen werden), funktioniert nicht immer, wirft keine Fehler

- astype(Typ): Konvertiert eine spezifische Serie und wirft einen Fehler, wenn die Konvertierung fehlschlägt

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Population(2020)  235 non-null    int64  
 1   YearlyChange      235 non-null    object 
 2   NetChange         235 non-null    int64  
 3   Density(P/Km²)    235 non-null    int64  
 4   Land Area(Km²)    235 non-null    int64  
 5   Migrants(net)     235 non-null    float64
 6   Fert.Rate         235 non-null    object 
 7   Med.Age           235 non-null    object 
 8   UrbanPop %        235 non-null    object 
 9   WorldShare        235 non-null    object 
dtypes: float64(1), int64(4), object(5)
memory usage: 20.2+ KB


In [57]:
# convert_dtypes() returns a new DataFrame; the result is only displayed here and
# is not assigned, so df itself keeps its dtypes.
df.convert_dtypes()

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,0,0,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,0,0,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,0,0,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,0,0,N.A.,0 %,0.00 %


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Population(2020)  235 non-null    int64  
 1   YearlyChange      235 non-null    object 
 2   NetChange         235 non-null    int64  
 3   Density(P/Km²)    235 non-null    int64  
 4   Land Area(Km²)    235 non-null    int64  
 5   Migrants(net)     235 non-null    float64
 6   Fert.Rate         235 non-null    object 
 7   Med.Age           235 non-null    object 
 8   UrbanPop %        235 non-null    object 
 9   WorldShare        235 non-null    object 
dtypes: float64(1), int64(4), object(5)
memory usage: 20.2+ KB


In [59]:
df["Fert.Rate"].astype(np.float64)

Country (or dependency)
China               1.7
India               2.2
United States       1.8
Indonesia           2.3
Pakistan            3.6
                   ... 
Montserrat          0.0
Falkland Islands    0.0
Niue                0.0
Tokelau             0.0
Vatican State       0.0
Name: Fert.Rate, Length: 235, dtype: float64

In [60]:
df["Fert.Rate"] = df["Fert.Rate"].astype(np.float64)

In [61]:
df

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,0.0,0.0,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,0.0,0.0,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,0.0,0.0,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,0.0,0.0,N.A.,0 %,0.00 %


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Population(2020)  235 non-null    int64  
 1   YearlyChange      235 non-null    object 
 2   NetChange         235 non-null    int64  
 3   Density(P/Km²)    235 non-null    int64  
 4   Land Area(Km²)    235 non-null    int64  
 5   Migrants(net)     235 non-null    float64
 6   Fert.Rate         235 non-null    float64
 7   Med.Age           235 non-null    object 
 8   UrbanPop %        235 non-null    object 
 9   WorldShare        235 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 28.3+ KB


In [64]:
df.loc[df["Med.Age"] == "N.A.", "Med.Age"] = 0

In [65]:
df

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,0.0,0.0,0,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,0.0,0.0,0,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,0.0,0.0,0,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,0.0,0.0,0,0 %,0.00 %


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Population(2020)  235 non-null    int64  
 1   YearlyChange      235 non-null    object 
 2   NetChange         235 non-null    int64  
 3   Density(P/Km²)    235 non-null    int64  
 4   Land Area(Km²)    235 non-null    int64  
 5   Migrants(net)     235 non-null    float64
 6   Fert.Rate         235 non-null    float64
 7   Med.Age           235 non-null    object 
 8   UrbanPop %        235 non-null    object 
 9   WorldShare        235 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 28.3+ KB


In [67]:
# The converted copy is only displayed, not assigned; df itself is left unchanged.
df.convert_dtypes()

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,0,0.0,0,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,0,0.0,0,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,0,0.0,0,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,0,0.0,0,0 %,0.00 %


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Population(2020)  235 non-null    int64  
 1   YearlyChange      235 non-null    object 
 2   NetChange         235 non-null    int64  
 3   Density(P/Km²)    235 non-null    int64  
 4   Land Area(Km²)    235 non-null    int64  
 5   Migrants(net)     235 non-null    float64
 6   Fert.Rate         235 non-null    float64
 7   Med.Age           235 non-null    object 
 8   UrbanPop %        235 non-null    object 
 9   WorldShare        235 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 28.3+ KB


In [69]:
df["Med.Age"] = df["Med.Age"].astype(np.int8)

## Austauschen von einzelnen Zeichen

Hier muss auf Zeichenkettenersetzung zurückgegriffen werden: replace() mit regex=True ersetzt auch Teile einer Zeichenkette (z. B. das angehängte " %").

In [75]:
# regex=True is required so that " %" is replaced as a substring of each value
# (without it, replace() only matches whole values); astype can be chained directly.
df["YearlyChange"] = df["YearlyChange"].replace(" %", "", regex=True).astype(np.float64)

In [76]:
df

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06,3,50,100,0.0,0.0,0,10 %,0.00 %
Falkland Islands,3480,3.05,103,0,12170,0.0,0.0,0,66 %,0.00 %
Niue,1626,0.68,11,6,260,0.0,0.0,0,46 %,0.00 %
Tokelau,1357,1.27,17,136,10,0.0,0.0,0,0 %,0.00 %


Eigene Funktion für %:

In [77]:
def fixPct(df, column, type):
    """Strip the " %" suffix from one column and cast the column to ``type``.

    The DataFrame is modified in place; nothing is returned.

    Parameters:
        df: DataFrame holding the column.
        column: Label of the column to clean.
        type: Target dtype passed on to ``astype`` (e.g. ``np.float64``).
    """
    without_suffix = df[column].replace(" %", "", regex=True)
    converted = without_suffix.astype(type)
    df[column] = converted

In [83]:
# "UrbanPop %" presumably still contains "N.A." placeholders at this point: the cell
# that replaces them carries an earlier execution count (82) but sits further down in
# the notebook. Replace them here first so the int8 cast inside fixPct does not fail
# when the notebook is run top to bottom on a fresh kernel.
df.loc[df["UrbanPop %"] == "N.A.", "UrbanPop %"] = 0
fixPct(df, "WorldShare", np.float64)
fixPct(df, "UrbanPop %", np.int8)

In [84]:
df

Unnamed: 0_level_0,Population(2020),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,Med.Age,UrbanPop %,WorldShare
Country (or dependency),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39,5540090,153,9388211,-348399.0,1.7,38,61,18.47
India,1380004385,0.99,13586631,464,2973190,-532687.0,2.2,28,35,17.70
United States,331002651,0.59,1937734,36,9147420,954806.0,1.8,38,83,4.25
Indonesia,273523615,1.07,2898047,151,1811570,-98955.0,2.3,30,56,3.51
Pakistan,220892340,2.00,4327022,287,770880,-233379.0,3.6,23,35,2.83
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06,3,50,100,0.0,0.0,0,10,0.00
Falkland Islands,3480,3.05,103,0,12170,0.0,0.0,0,66,0.00
Niue,1626,0.68,11,6,260,0.0,0.0,0,46,0.00
Tokelau,1357,1.27,17,136,10,0.0,0.0,0,0,0.00


In [82]:
# NOTE(review): this cell carries execution count 82, i.e. it was run before the
# fixPct(...) cell (83) that appears above it. The "N.A." placeholders have to be
# replaced before "UrbanPop %" is cast to int8; once the column is int8 this
# comparison presumably matches nothing.
df.loc[df["UrbanPop %"] == "N.A.", "UrbanPop %"] = 0

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Population(2020)  235 non-null    int64  
 1   YearlyChange      235 non-null    float64
 2   NetChange         235 non-null    int64  
 3   Density(P/Km²)    235 non-null    int64  
 4   Land Area(Km²)    235 non-null    int64  
 5   Migrants(net)     235 non-null    float64
 6   Fert.Rate         235 non-null    float64
 7   Med.Age           235 non-null    int8   
 8   UrbanPop %        235 non-null    int8   
 9   WorldShare        235 non-null    float64
dtypes: float64(4), int64(4), int8(2)
memory usage: 25.1+ KB


In [86]:
def fixNA(df, col):
    """Overwrite every "N.A." entry of one column with 0.

    The DataFrame is modified in place; nothing is returned.

    Parameters:
        df: DataFrame holding the column.
        col: Label of the column to clean.
    """
    is_placeholder = df[col] == "N.A."
    df.loc[is_placeholder, col] = 0