# Explorative Datenanalyse (EDA)
Dieses Notebook gibt einen ersten Überblick über die erzeugten Features (features_lagged.csv).

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [2]:
# Folder holding the processed data; the path is relative to the working directory.
DATA_PROCESSED = Path("../data/processed")
INPUT = DATA_PROCESSED.joinpath("features_lagged.csv")

print(f"Lade {INPUT} ...")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk (see the pandas read_csv documentation).
df = pd.read_csv(INPUT, low_memory=False)

📂 Lade data\processed\features_lagged.csv ...


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
print("\n== Shape ==", df.shape, sep="\n")


== Shape ==
(43708, 91)


In [4]:
# Preview the first five rows; n=5 spells out head()'s default.
print("\n== Erste 5 Zeilen ==")
print(df.head(n=5))


== Erste 5 Zeilen ==
   match_date match_time_sort Division   HomeTeam    AwayTeam  FTHome  FTAway  \
0  2000-07-28        00:00:00       F1  Marseille      Troyes       3       1   
1  2000-07-28        00:00:00       F1   Paris SG  Strasbourg       3       1   
2  2000-07-29        00:00:00       F1    Auxerre       Sedan       0       1   
3  2000-07-29        00:00:00       F1   Bordeaux        Metz       1       1   
4  2000-07-29        00:00:00       F1   Guingamp  St Etienne       2       2   

   HomeElo  AwayElo Division_1  ... away_gf_last3 away_ga_last3  \
0  1686.34  1586.57         F1  ...           NaN           NaN   
1  1714.89  1642.51         F1  ...           NaN           NaN   
2  1635.58  1624.22         F1  ...           NaN           NaN   
3  1734.34  1673.11         F1  ...           NaN           NaN   
4  1578.51  1620.74         F1  ...           NaN           NaN   

  away_pts_last3 away_elo_trend3 away_gf_last5  away_ga_last5  away_pts_last5  \
0      

In [5]:
# Print every column label as one plain Python list.
print("\n== Spaltenübersicht ==")
print(list(df.columns))


== Spaltenübersicht ==
['match_date', 'match_time_sort', 'Division', 'HomeTeam', 'AwayTeam', 'FTHome', 'FTAway', 'HomeElo', 'AwayElo', 'Division_1', 'MatchDate', 'MatchTime', 'HomeTeam_1', 'AwayTeam_1', 'cfmd_match_id', 'FTHome_1', 'FTAway_1', 'y_outcome', 'y_home_win', 'HTHome', 'HTAway', 'HTResult', 'HomeElo_1', 'AwayElo_1', 'elo_diff', 'elo_ratio', 'Form3Home', 'Form3Away', 'form3_diff', 'Form5Home', 'Form5Away', 'form5_diff', 'OddHome', 'OddDraw', 'OddAway', 'qh', 'qd', 'qa', 'qsum', 'p_home', 'p_draw', 'p_away', 'overround', 'MaxHome', 'MaxDraw', 'MaxAway', 'log_odds_away_vs_home', 'HomeShots', 'AwayShots', 'shots_diff', 'HomeTarget', 'AwayTarget', 'shots_on_target_diff', 'shot_acc_home', 'shot_acc_away', 'shot_acc_diff', 'HomeFouls', 'AwayFouls', 'fouls_diff', 'HomeYellow', 'AwayYellow', 'yellow_diff', 'HomeRed', 'AwayRed', 'red_diff', 'HomeCorners', 'AwayCorners', 'corners_diff', 'dominance_index', 'form_momentum_home', 'form_momentum_away', 'home_gf_last3', 'home_ga_last3', 'h

In [6]:
# Count how many columns share each dtype.
print("\n== Dtypes ==", df.dtypes.value_counts(), sep="\n")


== Dtypes ==
float64    73
object     13
int64       5
Name: count, dtype: int64


In [7]:
# Missing-value count per column, largest first, limited to the top 20.
print("\n== Missing Values (Top 20) ==")
print(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)


== Missing Values (Top 20) ==
MatchTime                33002
MaxAway                   7820
log_odds_away_vs_home     7820
MaxHome                   7820
MaxDraw                   7820
shot_acc_diff             5745
shot_acc_away             5742
shot_acc_home             5722
HomeTarget                5719
AwayTarget                5719
shots_on_target_diff      5719
AwayFouls                 5670
HomeFouls                 5670
fouls_diff                5670
dominance_index           5292
corners_diff              5288
HomeCorners               5288
AwayCorners               5288
AwayShots                 4913
HomeShots                 4913
dtype: int64


In [8]:
# Summary statistics, transposed so that each feature is one row;
# only the first 20 features are shown.
print(
    "\n== Deskriptive Statistik (numerische Spalten) ==",
    df.describe().transpose().head(20),
    sep="\n",
)


== Deskriptive Statistik (numerische Spalten) ==
              count         mean         std          min          25%  \
FTHome      43708.0     1.532237    1.285618     0.000000     1.000000   
FTAway      43708.0     1.162144    1.132431     0.000000     0.000000   
HomeElo     43688.0  1692.520277  111.585112  1408.410000  1613.977500   
AwayElo     43688.0  1692.470585  111.611077  1406.620000  1613.947500   
FTHome_1    43708.0     1.532237    1.285618     0.000000     1.000000   
FTAway_1    43708.0     1.162144    1.132431     0.000000     0.000000   
y_home_win  43708.0     0.456461    0.498106     0.000000     0.000000   
HTHome      43705.0     0.681570    0.829941     0.000000     0.000000   
HTAway      43705.0     0.508226    0.720145     0.000000     0.000000   
HomeElo_1   43688.0  1692.520277  111.585112  1408.410000  1613.977500   
AwayElo_1   43688.0  1692.470585  111.611077  1406.620000  1613.947500   
elo_diff    43668.0     0.050119  145.145524  -562.720000   -8

In [9]:
# --- Correlations ---
# Select every numeric column regardless of bit width: "number" also matches
# int32/float32 etc., which an explicit ["int64", "float64"] list would drop.
num_cols = df.select_dtypes(include="number").columns
corr = df[num_cols].corr()

# Build the target path once so the saved file and the log message cannot drift apart.
corr_path = DATA_PROCESSED / "corr_matrix.png"

# Explicit figure/axes objects instead of the implicit pyplot "current figure",
# so this cell cannot draw into a figure left open by another cell.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, cmap="coolwarm", center=0, cbar=True, ax=ax)
ax.set_title("Korrelationsmatrix (numerische Features)")
fig.tight_layout()
fig.savefig(corr_path)
# The figure is only written to disk; close it to free its memory.
plt.close(fig)

print(f"\nKorrelationen geplottet -> {corr_path}")


✅ Korrelationen geplottet -> data\processed\corr_matrix.png


In [10]:
# --- Plausibilitäts-Checks ---
print("\n== Plausibilitäts-Checks ==")
print("Anzahl Tore im Match vs. Summe Heim+Auswärts:")
print(df[["FTHome", "FTAway"]].sum().to_dict())


== Plausibilitäts-Checks ==
⚽ Anzahl Tore im Match vs. Summe Heim+Auswärts:
{'FTHome': 66971, 'FTAway': 50795}


In [11]:
# Frequency table of the lagged home-points feature, shown only when the
# column exists. value_counts() leaves out NaN entries by default.
if "home_pts_last3" in df:
    pts_counts = df["home_pts_last3"].value_counts()
    print("\nVerteilung home_pts_last3:")
    print(pts_counts.sort_index())


📊 Verteilung home_pts_last3:
home_pts_last3
0.0    3010
1.0    5532
2.0    3377
3.0    7359
4.0    8652
5.0    3071
6.0    5521
7.0    4407
9.0    2681
Name: count, dtype: int64
