# Wer hat sich einen Bonus verdient?

## Datensatz importieren

In [1255]:
import pandas as pd
# NOTE(review): vstack is not referenced in any cell visible here; this looks like an
# accidental IDE auto-import. Confirm it is unused elsewhere before removing it.
from numpy.ma.extras import vstack

# Load the semicolon-separated CSV; the relative path is resolved against the
# notebook's current working directory.
dataset = pd.read_csv('bildungsziele_bonus.csv', sep=';')

## Aufteilung in Features und Labels

In [1256]:
# Split the frame positionally: the last column is the regression target,
# every column before it is a feature.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [1257]:
# Print the raw feature frame (every dataset column except the last) to check the split.
print(X)

  Betriebliche Bildungsziele Schulische Bildungsziele Bildungsziele der ÜK  \
0                   erfuellt                 erfuellt         uebertroffen   
1                   erfuellt                 erfuellt             erfuellt   
2                   erfuellt           nicht erfuellt         uebertroffen   
3                   erfuellt                 erfuellt       nicht erfuellt   
4                   erfuellt             uebertroffen       knapp erfuellt   
5                   erfuellt             uebertroffen             erfuellt   

  Fachkompetenz Methodenkompetenz Sozialkompetenz Selbstkompetenz  \
0  uebertroffen          erfuellt        erfuellt        erfuellt   
1      erfuellt          erfuellt        erfuellt    uebertroffen   
2      erfuellt    knapp erfuellt        erfuellt  knapp erfuellt   
3      erfuellt    knapp erfuellt        erfuellt        erfuellt   
4      erfuellt    knapp erfuellt    uebertroffen        erfuellt   
5      erfuellt          erfuellt    ue

In [1258]:
# Print the target series (the last dataset column).
print(y)

0    0.8
1    0.9
2    0.2
3    0.7
4    0.4
5    0.6
Name: Bonus, dtype: float64


## Umgang mit kategorialen Daten (Bildungsziele)

In [1259]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# One-hot encode the seven leading (categorical) columns. All remaining columns are
# passed through unchanged and end up to the right of the one-hot block.
#
# sparse_threshold=0 forces a dense result. Without it the transformer may return a
# scipy sparse matrix when the output is mostly zeros, and np.array() on a sparse
# matrix yields a useless 0-d object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), list(range(7)))],
    remainder='passthrough',
    sparse_threshold=0,
)

# Encode from the untouched `dataset` rather than from `X` itself. With `X = f(X)`,
# re-running this cell would silently one-hot encode the first seven columns of the
# already encoded array. Reading the same feature slice from `dataset` makes the cell
# idempotent.
X = np.array(ct.fit_transform(dataset.iloc[:, :-1]))

In [1260]:
# Show the encoded feature matrix: one-hot columns first, passthrough columns after them.
print(f'Matrix of features: \n{X}')

Matrix of features: 
[[1.   1.   0.   0.   0.   0.   0.   1.   0.   1.   1.   0.   1.   0.
  1.   0.   0.   5.   5.5  5.   4.75 5.5  5.   5.5  6.   5.75]
 [1.   1.   0.   0.   1.   0.   0.   0.   1.   0.   1.   0.   1.   0.
  0.   0.   1.   4.5  5.5  5.5  5.   5.5  5.5  6.   5.5  6.  ]
 [1.   0.   1.   0.   0.   0.   0.   1.   1.   0.   0.   1.   1.   0.
  0.   1.   0.   4.   4.5  3.5  4.5  5.   4.5  4.5  4.5  5.5 ]
 [1.   1.   0.   0.   0.   0.   1.   0.   1.   0.   0.   1.   1.   0.
  1.   0.   0.   5.   5.   5.   4.5  4.   4.5  5.   5.5  5.  ]
 [1.   0.   0.   1.   0.   1.   0.   0.   1.   0.   0.   1.   0.   1.
  1.   0.   0.   5.   4.   4.5  5.   4.5  5.   4.5  6.   4.5 ]
 [1.   0.   0.   1.   1.   0.   0.   0.   1.   0.   1.   0.   0.   1.
  1.   0.   0.   5.   5.   5.   5.   6.   5.5  5.   5.5  5.25]]


## Aufteilung in Trainings- und Testset

In [1261]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing. random_state pins the shuffle so the same
# rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [1262]:
# Training rows right after the split (scaling is applied in a later cell).
print(X_train)

[[1.   0.   0.   1.   0.   1.   0.   0.   1.   0.   0.   1.   0.   1.
  1.   0.   0.   5.   4.   4.5  5.   4.5  5.   4.5  6.   4.5 ]
 [1.   1.   0.   0.   0.   0.   0.   1.   0.   1.   1.   0.   1.   0.
  1.   0.   0.   5.   5.5  5.   4.75 5.5  5.   5.5  6.   5.75]
 [1.   1.   0.   0.   0.   0.   1.   0.   1.   0.   0.   1.   1.   0.
  1.   0.   0.   5.   5.   5.   4.5  4.   4.5  5.   5.5  5.  ]
 [1.   0.   0.   1.   1.   0.   0.   0.   1.   0.   1.   0.   0.   1.
  1.   0.   0.   5.   5.   5.   5.   6.   5.5  5.   5.5  5.25]]


In [1263]:
# Held-out rows right after the split (scaling is applied in a later cell).
print(X_test)

[[1.  0.  1.  0.  0.  0.  0.  1.  1.  0.  0.  1.  1.  0.  0.  1.  0.  4.
  4.5 3.5 4.5 5.  4.5 4.5 4.5 5.5]
 [1.  1.  0.  0.  1.  0.  0.  0.  1.  0.  1.  0.  1.  0.  0.  0.  1.  4.5
  5.5 5.5 5.  5.5 5.5 6.  5.5 6. ]]


In [1264]:
# Training targets; the Series index shows which original rows were drawn.
print(y_train)

4    0.4
0    0.8
3    0.7
5    0.6
Name: Bonus, dtype: float64


In [1265]:
# Held-out targets; the Series index shows which original rows were drawn.
print(y_test)

2    0.2
1    0.9
Name: Bonus, dtype: float64


## Feature Scaling

In [1266]:
from sklearn.preprocessing import StandardScaler

# Only the passthrough (numeric) columns are standardised; the one-hot columns stay 0/1.
# The ColumnTransformer places the one-hot block first, so the numeric block starts
# right after the last one-hot column. The offset is derived from the fitted encoder
# instead of being hard-coded: it is 17 for the current data, but it shifts whenever a
# category appears in or disappears from the CSV.
# Assumes the encoder keeps one output column per category (default OneHotEncoder options).
n_onehot_cols = sum(len(cats) for cats in ct.named_transformers_['encoder'].categories_)

sc = StandardScaler()
# Fit the scaler on the training rows only; the test rows reuse the training mean and
# standard deviation so that no test information leaks into the fit.
X_train[:, n_onehot_cols:] = sc.fit_transform(X_train[:, n_onehot_cols:])
X_test[:, n_onehot_cols:] = sc.transform(X_test[:, n_onehot_cols:])

In [1267]:
# Training rows after scaling: the columns from index 17 onward were standardised in the
# previous cell, the one-hot columns before them are unchanged.
print(X_train)

[[ 1.    0.    0.    1.    0.    1.    0.    0.    1.    0.    0.    1.
   0.    1.    1.    0.    0.    0.   -1.61 -1.73  0.9  -0.63  0.   -1.41
   1.   -1.39]
 [ 1.    1.    0.    0.    0.    0.    0.    1.    0.    1.    1.    0.
   1.    0.    1.    0.    0.    0.    1.15  0.58 -0.3   0.63  0.    1.41
   1.    1.39]
 [ 1.    1.    0.    0.    0.    0.    1.    0.    1.    0.    0.    1.
   1.    0.    1.    0.    0.    0.    0.23  0.58 -1.51 -1.26 -1.41  0.
  -1.   -0.28]
 [ 1.    0.    0.    1.    1.    0.    0.    0.    1.    0.    1.    0.
   0.    1.    1.    0.    0.    0.    0.23  0.58  0.9   1.26  1.41  0.
  -1.    0.28]]


In [1268]:
# Held-out rows after scaling with the statistics fitted on the training rows.
print(X_test)

[[ 1.    0.    1.    0.    0.    0.    0.    1.    1.    0.    0.    1.
   1.    0.    0.    1.    0.   -1.   -0.69 -6.35 -1.51  0.   -1.41 -1.41
  -5.    0.83]
 [ 1.    1.    0.    0.    1.    0.    0.    0.    1.    0.    1.    0.
   1.    0.    0.    0.    1.   -0.5   1.15  2.89  0.9   0.63  1.41  2.83
  -1.    1.94]]


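## Simple Linear Regression

*Hinweis: Das Modell wird hier auf allen Spalten von `X_train` trainiert. Es handelt sich damit faktisch um eine multiple lineare Regression, identisch zum Abschnitt «Multiple Linear Regression» weiter unten.*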

### Training

In [1269]:
from sklearn.linear_model import LinearRegression

# Least-squares fit on the complete training matrix, i.e. on every column of X_train,
# not on a single feature.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

### Testing

In [1270]:
# Predict the target for the held-out rows with the model fitted in the previous cell.
y_pred = regressor.predict(X_test)

### Testing Results

In [1271]:
# Predicted values for the held-out rows.
print(y_pred)

[0.51 0.84]


In [1272]:
# Actual values for the held-out rows, for comparison with the predictions printed above.
print(y_test)

2    0.2
1    0.9
Name: Bonus, dtype: float64


## Multiple Linear Regression

### Training

In [1273]:
from sklearn.linear_model import LinearRegression

# Rebinds `regressor` to a fresh LinearRegression fitted on the same X_train / y_train
# that the earlier linear-regression cell used.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

### Prediction

In [1274]:
# Check the container type of y_test: it is still a pandas Series at this point.
print(type(y_test))

<class 'pandas.core.series.Series'>


In [1275]:
import numpy as np

# Convert the pandas Series into a plain NumPy array so it can be placed next to the
# predictions in a single 2-D array.
y_test_arr = y_test.to_numpy()

y_pred = regressor.predict(X_test)

# Note: this changes NumPy's print precision globally for all following cells.
np.set_printoptions(precision=2)

# One row per test sample: column 0 is the prediction, column 1 the actual value.
print(np.column_stack((y_pred, y_test_arr)))

[[0.51 0.2 ]
 [0.84 0.9 ]]


In [1276]:
# Confirm that the conversion produced a NumPy array.
print(type(y_test_arr))

<class 'numpy.ndarray'>


## Polynomial Regression

### Training

In [1277]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the training matrix into polynomial terms up to degree 3. The fitted
# transformer is kept in `poly_reg` because the test rows must go through the same
# expansion before prediction.
poly_reg = PolynomialFeatures(degree=3).fit(X_train)
X_poly = poly_reg.transform(X_train)

# Linear model on the expanded features. This rebinds `regressor`, so the plain linear
# model fitted in the earlier cells is no longer reachable under that name.
regressor = LinearRegression()
regressor.fit(X=X_poly, y=y_train)

In [1278]:
# Apply the already fitted polynomial expansion to the test rows (transform only, no
# refit), then predict with the model trained on the expanded features.
y_test_pred = regressor.predict(poly_reg.transform(X_test))

In [1279]:
# Compare the polynomial model's predictions with the actual values of the held-out rows.
print("Prediction: ", y_test_pred)
print("Actual: ", y_test.to_numpy())

Prediction:  [0.41 0.85]
Actual:  [0.2 0.9]
