In [1]:
#
# Beispiele für die Benutzung von NumPy
# Bei Bedarf die Python-Module wie wget und tabulate mittels conda oder pip installieren
import numpy as np
import wget
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

In [2]:
from tabulate import tabulate
# scipy.stats.itemfreq was deprecated in SciPy 1.0 and removed in SciPy 1.3.
# Fall back to an equivalent helper so the notebook still runs on current SciPy.
try:
    from scipy.stats import itemfreq
except ImportError:
    def itemfreq(a):
        """Return a 2-D array of the unique items of `a` and their counts.

        Column 0 holds the sorted unique items, column 1 how often each one
        occurs. For string input the counts are stored as strings as well,
        so that both columns fit into a single array.
        """
        items, counts = np.unique(a, return_counts=True)
        if items.dtype.kind in "US":
            # Explicit conversion avoids relying on implicit int -> str promotion.
            counts = counts.astype(str)
        return np.column_stack((items, counts))

Hier wird ein Datenset von Autos heruntergeladen:
https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

In [3]:
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

In [4]:
import os

# Download the data set only when it is not present yet. As the recorded
# output 'auto (1).csv' shows, wget.download does not overwrite an existing
# file but stores a renamed copy, so re-running the cell would pile up
# duplicates while "auto.csv" itself stays untouched.
if not os.path.exists("auto.csv"):
    wget.download(DATA_URL, "auto.csv")
# usecols is used to load only selected columns
# make: 2
# fuel: 3
# num-of-doors: 5
# price: 25

'auto (1).csv'

In [15]:
# Field description for the four columns of interest.
# NOTE(review): `dt` is not passed to loadtxt below, so every column is read as text.
dt = [
    ("brand", "S8"),
    ("fuel", "S10"),
    ("doors", "S10"),
    ("price", float),
]

# Read only the columns make (2), fuel (3), num-of-doors (5) and price (25).
all_autos = np.loadtxt(
    "auto.csv",
    dtype=str,
    delimiter=",",
    usecols=(2, 3, 5, 25),
)

all_autos.shape

(205, 4)

In [16]:
# Pull all prices out of the table as one column (still text at this point).
price_column = 3
all_autos_prices = all_autos[:, price_column]

all_autos_prices

array(['13495', '16500', '16500', '13950', '17450', '15250', '17710',
       '18920', '23875', '?', '16430', '16925', '20970', '21105', '24565',
       '30760', '41315', '36880', '5151', '6295', '6575', '5572', '6377',
       '7957', '6229', '6692', '7609', '8558', '8921', '12964', '6479',
       '6855', '5399', '6529', '7129', '7295', '7295', '7895', '9095',
       '8845', '10295', '12945', '10345', '6785', '?', '?', '11048',
       '32250', '35550', '36000', '5195', '6095', '6795', '6695', '7395',
       '10945', '11845', '13645', '15645', '8845', '8495', '10595',
       '10245', '10795', '11245', '18280', '18344', '25552', '28248',
       '28176', '31600', '34184', '35056', '40960', '45400', '16503',
       '5389', '6189', '6669', '7689', '9959', '8499', '12629', '14869',
       '14489', '6989', '8189', '9279', '9279', '5499', '7099', '6649',
       '6849', '7349', '7299', '7799', '7499', '7999', '8249', '8949',
       '9549', '13499', '14399', '13499', '17199', '19699', '18399',
  

In [17]:
# Some rows contain a "?" instead of a price. Replace it with "0" before the
# column is converted to float; 0.0 then marks a missing price.
# np.char.replace is the public spelling of np.core.defchararray.replace
# (np.core is a private namespace and deprecated in NumPy 2.x).
all_autos_prices = np.char.replace(all_autos[:, 3], "?", "0").astype("float")

# all_autos is a string array, so the floats are stored back as text (e.g. '13495.0').
all_autos[:, 3] = all_autos_prices

In [11]:
# Display the price vector after the conversion to float.
all_autos_prices

array([13495., 16500., 16500., 13950., 17450., 15250., 17710., 18920.,
       23875.,     0., 16430., 16925., 20970., 21105., 24565., 30760.,
       41315., 36880.,  5151.,  6295.,  6575.,  5572.,  6377.,  7957.,
        6229.,  6692.,  7609.,  8558.,  8921., 12964.,  6479.,  6855.,
        5399.,  6529.,  7129.,  7295.,  7295.,  7895.,  9095.,  8845.,
       10295., 12945., 10345.,  6785.,     0.,     0., 11048., 32250.,
       35550., 36000.,  5195.,  6095.,  6795.,  6695.,  7395., 10945.,
       11845., 13645., 15645.,  8845.,  8495., 10595., 10245., 10795.,
       11245., 18280., 18344., 25552., 28248., 28176., 31600., 34184.,
       35056., 40960., 45400., 16503.,  5389.,  6189.,  6669.,  7689.,
        9959.,  8499., 12629., 14869., 14489.,  6989.,  8189.,  9279.,
        9279.,  5499.,  7099.,  6649.,  6849.,  7349.,  7299.,  7799.,
        7499.,  7999.,  8249.,  8949.,  9549., 13499., 14399., 13499.,
       17199., 19699., 18399., 11900., 13200., 12440., 13860., 15580.,
      

In [18]:
# Rows whose price equals 0 are removed from the data set; the table and the
# price vector are filtered with the same indices so they stay aligned.
no_prices = np.nonzero(all_autos_prices == 0.0)
all_autos, all_autos_prices = (
    np.delete(table, no_prices, axis=0)
    for table in (all_autos, all_autos_prices)
)

In [25]:
# Mean, maximum and minimum of the numeric price vector. The matching rows
# come from all_autos, whose entries are all strings.
average_price = all_autos_prices.mean()
most_expensive = all_autos[all_autos_prices.argmax()]
cheapest = all_autos[all_autos_prices.argmin()]

print(f"Durschnittspreis ${average_price}")
print(f"Maximum ${all_autos_prices.max()} {most_expensive}")
print(f"Minimum ${all_autos_prices.min()} {cheapest}")

Durschnittspreis $13207.129353233831
Maximum $45400.0 ['mercedes-benz' 'gas' 'two' '45400.0']
Minimum $5118.0 ['subaru' 'gas' 'two' '5118.0']


In [26]:
# Factor used below to convert US-dollar prices into euros.
dollar_euro_conversion_rate = 0.88
print("Umkonvertierung in Euro")

Umkonvertierung in Euro


In [28]:
# Fill column 3 with the prices converted to euros.
# The values are derived from all_autos_prices (which stays in dollars) rather
# than from the column itself, so re-running this cell does not apply the
# conversion rate a second time. Rounding to cents keeps the text form short
# enough for the string array.
all_autos[:, 3] = np.round(all_autos_prices * dollar_euro_conversion_rate, 2)

In [31]:
# Render the complete table as plain text with German column titles.
column_titles = ["Marke", "Treibstoff", "Anzahl Türe", "Preis in Euro"]
print(tabulate(all_autos, headers=column_titles))

Marke          Treibstoff    Anzahl Türe      Preis in Euro
-------------  ------------  -------------  ---------------
alfa-romero    gas           two                   11875.6
alfa-romero    gas           two                   14520
alfa-romero    gas           two                   14520
audi           gas           four                  12276
audi           gas           four                  15356
audi           gas           two                   13420
audi           gas           four                  15584.8
audi           gas           four                  16649.6
audi           gas           four                  21010
bmw            gas           two                   14458.4
bmw            gas           four                  14894
bmw            gas           two                   18453.6
bmw            gas           four                  18572.4
bmw            gas           four                  21617.2
bmw            gas           four                  27068.8
bmw      

In [32]:
# Count the rows whose brand column reads "volkswagen".
is_vw = all_autos[:, 0] == "volkswagen"
nb_vw = np.count_nonzero(is_vw)
print(f"Anzahl von VW Autos: {nb_vw}")

Anzahl von VW Autos: 12


In [33]:
# Number of cars per brand.
# scipy.stats.itemfreq was deprecated in SciPy 1.0 and removed in SciPy 1.3;
# np.unique with return_counts=True delivers the same information.
brands, counts = np.unique(all_autos[:, 0], return_counts=True)
# Same layout itemfreq produced: one row per brand, [brand, count], all as strings.
stats = np.column_stack((brands, counts.astype(str)))

In [34]:
# Display the brand/count table.
stats

array([['alfa-romero', '3'],
       ['audi', '6'],
       ['bmw', '8'],
       ['chevrolet', '3'],
       ['dodge', '9'],
       ['honda', '13'],
       ['isuzu', '2'],
       ['jaguar', '3'],
       ['mazda', '17'],
       ['mercedes-benz', '8'],
       ['mercury', '1'],
       ['mitsubishi', '13'],
       ['nissan', '18'],
       ['peugot', '11'],
       ['plymouth', '7'],
       ['porsche', '4'],
       ['renault', '2'],
       ['saab', '6'],
       ['subaru', '12'],
       ['toyota', '32'],
       ['volkswagen', '12'],
       ['volvo', '11']], dtype='<U21')

Optional: Sortierung nach Anzahl der Fahrzeuge pro Marke 
stats = stats[stats[:,1].astype(int).argsort()]

In [35]:
# Turn the two columns of `stats` into a record array with named fields;
# the count column is converted to integers on the way.
brand_column, count_column = stats[:, 0], stats[:, 1]
d = np.rec.fromarrays(
    [brand_column, count_column],
    names=['brand', 'numbers'],
    formats=['<U21', 'int'],
)

In [36]:
# Display the record array.
d

rec.array([('alfa-romero',  3), ('audi',  6), ('bmw',  8),
           ('chevrolet',  3), ('dodge',  9), ('honda', 13), ('isuzu',  2),
           ('jaguar',  3), ('mazda', 17), ('mercedes-benz',  8),
           ('mercury',  1), ('mitsubishi', 13), ('nissan', 18),
           ('peugot', 11), ('plymouth',  7), ('porsche',  4),
           ('renault',  2), ('saab',  6), ('subaru', 12), ('toyota', 32),
           ('volkswagen', 12), ('volvo', 11)],
          dtype=[('brand', '<U21'), ('numbers', '<i4')])

In [37]:
# Reorder the records by ascending number of cars per brand.
ascending = np.argsort(d["numbers"].astype(int))
d = d[ascending]

In [38]:
# The first letter of each brand name is written in upper case.
capitalized_brands = np.char.capitalize(d["brand"])
d["brand"] = capitalized_brands

In [None]:
# Bar chart of the number of cars per brand, drawn with Matplotlib.
bar_positions = range(len(d))
plt.xticks(bar_positions, d["brand"], rotation="vertical")
plt.bar(bar_positions, d["numbers"])
plt.xlabel("Marke")
plt.ylabel("Anzahl")
plt.title("Anzahl der Fahrzeuge")
plt.show()