In [107]:
import numpy as np
import pandas as pd

In [108]:
data = pd.read_csv("Data/PopulationData.csv", delimiter=";", thousands=",", decimal=".")

In [109]:
data.rename(columns={ "Country (or dependency)": "Country", "Population(2020)": "Pop", "Density(P/Km²)": "Density", "Land Area(Km²)": "Area", "Migrants(net)": "Migrants", "UrbanPop %": "UrbanPop" }, inplace=True)

In [110]:
data.set_index("Country", inplace=True)

In [111]:
data.drop(columns=["#"], inplace=True)

In [112]:
data

Unnamed: 0_level_0,Pop,YearlyChange,NetChange,Density,Area,Migrants,Fert.Rate,Med.Age,UrbanPop,WorldShare
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06 %,3,50,100,,N.A.,N.A.,10 %,0.00 %
Falkland Islands,3480,3.05 %,103,0,12170,,N.A.,N.A.,66 %,0.00 %
Niue,1626,0.68 %,11,6,260,,N.A.,N.A.,46 %,0.00 %
Tokelau,1357,1.27 %,17,136,10,,N.A.,N.A.,0 %,0.00 %


In [113]:
data.ffill(inplace=True)

In [114]:
def fixNA(columnName, newValue):
    data.loc[data[columnName] == "N.A.", columnName] = newValue

In [115]:
def fixPct(columnName):
    data[columnName] = data[columnName].replace(" %", "", regex=True)

In [116]:
def fixType(columnName, targetType):
    data[columnName] = data[columnName].astype(targetType)

In [117]:
for col in data.columns:
    fixNA(col, 0)
    fixPct(col)

In [118]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Pop           235 non-null    int64  
 1   YearlyChange  235 non-null    object 
 2   NetChange     235 non-null    int64  
 3   Density       235 non-null    int64  
 4   Area          235 non-null    int64  
 5   Migrants      235 non-null    float64
 6   Fert.Rate     235 non-null    object 
 7   Med.Age       235 non-null    object 
 8   UrbanPop      235 non-null    object 
 9   WorldShare    235 non-null    object 
dtypes: float64(1), int64(4), object(5)
memory usage: 28.3+ KB


In [119]:
fixType("Pop", np.int32)

In [120]:
fixType("YearlyChange", np.float16)

In [121]:
fixType("NetChange", np.int32)
fixType("Area", np.int32)
fixType("Migrants", np.int32)

In [122]:
fixType("Density", np.int16)

In [123]:
fixType("Fert.Rate", np.float16)
fixType("UrbanPop", np.float16)
fixType("WorldShare", np.float16)

In [124]:
fixType("Med.Age", np.int8)

In [125]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235 entries, China to Vatican State
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Pop           235 non-null    int32  
 1   YearlyChange  235 non-null    float16
 2   NetChange     235 non-null    int32  
 3   Density       235 non-null    int16  
 4   Area          235 non-null    int32  
 5   Migrants      235 non-null    int32  
 6   Fert.Rate     235 non-null    float16
 7   Med.Age       235 non-null    int8   
 8   UrbanPop      235 non-null    float16
 9   WorldShare    235 non-null    float16
dtypes: float16(4), int16(1), int32(4), int8(1)
memory usage: 16.1+ KB


Gute Datentypen haben jetzt 40% weniger Speicherbedarf

Weniger Speicher -> mehr Geschwindigkeit

In [127]:
data.to_csv("Data/PopulationDataFixed.csv")

In [95]:
def determineBestType(colName):
    try:    
        highLow = [data[colName].max(), data[colName].min()]
    except: return str
    print(highLow)
    intTypes = [np.int16, np.int32, np.int64, int]
    floatTypes = [np.float16, np.float32, np.float64, float]
    if data[colName].dtype in intTypes:  # Wenn die Spalte eine Integer Spalte ist
        for t in intTypes:
            try:
                np.array([highLow]).astype(t)
                return t
            except: pass

    if data[colName].dtype in floatTypes:  # Wenn die Spalte eine Float Spalte ist
        for t in floatTypes:  # Suche vom kleinsten zum größten Typen denjenigen in den das Maximum passen würde
            try:
                np.array([highLow]).astype(t)
                return t
            except: pass

In [96]:
for col in data.columns:
    t = determineBestType(col)
    print(col, t)
    # fixType(col, t)

[1439323776, 801]
Pop <class 'numpy.int16'>
['3.84', '-0.03']
YearlyChange None
[13586631, -383840]
NetChange <class 'numpy.int16'>
[26337, 0]
Density <class 'numpy.int16'>
[16376870, 0]
Area <class 'numpy.int16'>
[954806.0, -653249.0]
Migrants <class 'numpy.float16'>
Fert.Rate <class 'str'>
Med.Age <class 'str'>
UrbanPop <class 'str'>
['4.25', '0.00']
WorldShare None


  np.array([highLow]).astype(t)


In [97]:
data

Unnamed: 0_level_0,Pop,YearlyChange,NetChange,Density,Area,Migrants,Fert.Rate,Med.Age,UrbanPop,WorldShare
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.39,5540090,153,9388211,-348399.0,1.7,38,61,18.47
India,1380004385,0.99,13586631,464,2973190,-532687.0,2.2,28,35,17.70
United States,331002651,0.59,1937734,36,9147420,954806.0,1.8,38,83,4.25
Indonesia,273523615,1.07,2898047,151,1811570,-98955.0,2.3,30,56,3.51
Pakistan,220892340,2.00,4327022,287,770880,-233379.0,3.6,23,35,2.83
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.06,3,50,100,0.0,0,0,10,0.00
Falkland Islands,3480,3.05,103,0,12170,0.0,0,0,66,0.00
Niue,1626,0.68,11,6,260,0.0,0,0,46,0.00
Tokelau,1357,1.27,17,136,10,0.0,0,0,0,0.00


In [105]:
urbanPopPct = data["UrbanPop"] / 100
data["UrbanPopAbsolute"] = data["Pop"] * urbanPopPct
data["RuralPopAbsolute"] = data["Pop"] * (1 - urbanPopPct)

In [126]:
data

Unnamed: 0_level_0,Pop,YearlyChange,NetChange,Density,Area,Migrants,Fert.Rate,Med.Age,UrbanPop,WorldShare
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
China,1439323776,0.389893,5540090,153,9388211,-348399,1.700195,38,61.0,18.468750
India,1380004385,0.990234,13586631,464,2973190,-532687,2.199219,28,35.0,17.703125
United States,331002651,0.589844,1937734,36,9147420,954806,1.799805,38,83.0,4.250000
Indonesia,273523615,1.070312,2898047,151,1811570,-98955,2.300781,30,56.0,3.509766
Pakistan,220892340,2.000000,4327022,287,770880,-233379,3.599609,23,35.0,2.830078
...,...,...,...,...,...,...,...,...,...,...
Montserrat,4992,0.059998,3,50,100,0,0.000000,0,10.0,0.000000
Falkland Islands,3480,3.050781,103,0,12170,0,0.000000,0,66.0,0.000000
Niue,1626,0.680176,11,6,260,0,0.000000,0,46.0,0.000000
Tokelau,1357,1.269531,17,136,10,0,0.000000,0,0.0,0.000000
