In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from sklearn.impute import SimpleImputer

from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier


from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV

In [2]:
df = pd.read_csv('Müllmengen_1.csv', sep = ';')

In [3]:
df.head()

Unnamed: 0,Monat,KW,Jahr,Datum,Hof,Schicht,Tour,Tonnage,Abfallart
0,1.0,1.0,2019.0,02.01.19,VMF,1.0,1.0,559,BIO
1,1.0,1.0,2019.0,02.01.19,VMF,1.0,4.0,323,BIO
2,1.0,1.0,2019.0,02.01.19,VMF,1.0,5.0,568,BIO
3,1.0,1.0,2019.0,02.01.19,VMF,1.0,6.0,548,BIO
4,1.0,1.0,2019.0,02.01.19,VMF,1.0,7.0,784,BIO


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388185 entries, 0 to 388184
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Monat      381661 non-null  float64
 1   KW         381661 non-null  float64
 2   Jahr       381661 non-null  float64
 3   Datum      381661 non-null  object 
 4   Hof        381661 non-null  object 
 5   Schicht    381661 non-null  float64
 6   Tour       381661 non-null  float64
 7   Tonnage    381661 non-null  object 
 8   Abfallart  381661 non-null  object 
dtypes: float64(5), object(4)
memory usage: 26.7+ MB


In [5]:
df.describe()

Unnamed: 0,Monat,KW,Jahr,Schicht,Tour
count,381661.0,381661.0,381661.0,381661.0,381661.0
mean,6.619073,27.008128,2021.049664,1.018522,143.118574
std,3.40801,14.880305,1.405633,0.134828,529.435676
min,1.0,1.0,2019.0,1.0,1.0
25%,4.0,14.0,2020.0,1.0,11.0
50%,7.0,27.0,2021.0,1.0,23.0
75%,10.0,40.0,2022.0,1.0,41.0
max,12.0,53.0,2023.0,2.0,5513.0


In [6]:
df_weather = pd.read_csv('daily_weather_data.csv')

In [7]:
df_weather.head()

Unnamed: 0,Date,Temperature_Max (°C),Rain_Sum (mm),Snowfall_Sum (cm),Wind_Speed_Max (km/h),Daylight_Duration (s),Temperature_Max (°C) 3-Day Avg,Rain_Sum (mm) 3-Day Avg,Snowfall_Sum (cm) 3-Day Avg,Wind_Speed_Max (km/h) 3-Day Avg,Daylight_Duration (s) 3-Day Avg
0,2018-12-28,7.1,0.4,0.0,20.2,27666.13,7.1,0.4,0.0,20.2,27666.13
1,2018-12-29,6.5,2.9,0.0,21.9,27712.32,6.8,1.65,0.0,21.05,27689.225
2,2018-12-30,7.6,4.6,0.0,30.6,27765.29,7.066667,2.633333,0.0,24.233333,27714.58
3,2018-12-31,7.3,0.0,0.0,17.6,27825.61,7.133333,2.5,0.0,23.366667,27767.74
4,2019-01-01,7.5,1.9,0.0,36.8,27893.2,7.466667,2.166667,0.0,28.333333,27828.033333


In [8]:
df_politics = pd.read_csv('WahlDaten.csv')

In [9]:
df_politics.head()

Unnamed: 0,Datum,SPD,Grünen,CDU,Linke,AfD,FDP,Sonstige
0,2019-01-01,0.15,0.23,0.17,0.18,0.13,0.07,0.07
1,2019-01-02,0.15,0.23,0.17,0.18,0.13,0.07,0.07
2,2019-01-03,0.15,0.23,0.17,0.18,0.13,0.07,0.07
3,2019-01-04,0.15,0.23,0.17,0.18,0.13,0.07,0.07
4,2019-01-05,0.15,0.23,0.17,0.18,0.13,0.07,0.07


In [10]:
df_feiertage = pd.read_csv('feiertage_berlin_2019_2023.csv')

In [11]:
df_feiertage.head()

Unnamed: 0,Datum,Feiertag
0,2019-01-01,1
1,2019-01-02,0
2,2019-01-03,0
3,2019-01-04,0
4,2019-01-05,0


In [None]:
df_feiertage = pd.read_csv('wochentage_numerisch.csv')

In [12]:
# Sicherstellen, dass das Datum als datetime-Objekt vorliegt
df['Datum'] = pd.to_datetime(df['Datum'])
df_weather['Datum'] = pd.to_datetime(df_weather['Date'])
df_politics['Datum'] = pd.to_datetime(df_politics['Datum'])
df_feiertage['Datum'] = pd.to_datetime(df_feiertage['Datum'])

  df['Datum'] = pd.to_datetime(df['Datum'])


In [13]:
# Merge mit Wetterdaten
df = df.merge(df_weather, on='Datum', how='left')
df.drop('Date', axis = 1, inplace = True)

# Merge mit Politikdaten
df = df.merge(df_politics, on='Datum', how='left')

# Merge mit Feiertagen
df = df.merge(df_feiertage, on='Datum', how='left')

In [14]:
df.head()

Unnamed: 0,Monat,KW,Jahr,Datum,Hof,Schicht,Tour,Tonnage,Abfallart,Temperature_Max (°C),...,Wind_Speed_Max (km/h) 3-Day Avg,Daylight_Duration (s) 3-Day Avg,SPD,Grünen,CDU,Linke,AfD,FDP,Sonstige,Feiertag
0,1.0,1.0,2019.0,2019-02-01,VMF,1.0,1.0,559,BIO,1.7,...,21.2,32373.56,0.15,0.23,0.17,0.18,0.13,0.07,0.07,0.0
1,1.0,1.0,2019.0,2019-02-01,VMF,1.0,4.0,323,BIO,1.7,...,21.2,32373.56,0.15,0.23,0.17,0.18,0.13,0.07,0.07,0.0
2,1.0,1.0,2019.0,2019-02-01,VMF,1.0,5.0,568,BIO,1.7,...,21.2,32373.56,0.15,0.23,0.17,0.18,0.13,0.07,0.07,0.0
3,1.0,1.0,2019.0,2019-02-01,VMF,1.0,6.0,548,BIO,1.7,...,21.2,32373.56,0.15,0.23,0.17,0.18,0.13,0.07,0.07,0.0
4,1.0,1.0,2019.0,2019-02-01,VMF,1.0,7.0,784,BIO,1.7,...,21.2,32373.56,0.15,0.23,0.17,0.18,0.13,0.07,0.07,0.0
