In [None]:
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, RobustScaler


# Load the data
TEMP_PATH = os.path.join("..", "temp")
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# NOTE(review): unpickling executes arbitrary code embedded in the file;
# only load pickles produced by a trusted step of this project.
train_pickle_path = os.path.join(TEMP_PATH, "train.pickle")
test_pickle_path = os.path.join(TEMP_PATH, "test.pickle")
train_df = pd.read_pickle(train_pickle_path)
test_df = pd.read_pickle(test_pickle_path)

# Check for overlap / leakage: an inner join on all shared columns keeps only
# rows that occur in both the training and the test frame.
common_rows = train_df.merge(test_df)
if common_rows.empty:
    print("Keine identischen Zeilen gefunden.")
else:
    print(f"Es gibt {len(common_rows)} identische Zeilen in den Trainings- und Testdaten.")

# Report missing values
def print_nan_counts(df, message=""):
    """Print ``message`` followed by the NaN count of every column that has NaNs.

    Args:
        df: DataFrame to inspect.
        message: Header line printed before the report (an empty line by default).

    Prints the per-column counts (only columns with at least one NaN), or
    "No NaNs present." when no column contains a NaN. Returns nothing.
    """
    print(message)
    per_column = df.isna().sum()
    with_missing = per_column[per_column > 0]
    if with_missing.empty:
        print("No NaNs present.")
        return
    print(with_missing)

# Strip the % sign and convert to float
def convert_percent_columns(df, columns):
    """Convert percentage strings such as ``"12.5%"`` to numbers, in place.

    Args:
        df: DataFrame to modify. The listed columns are overwritten in place.
        columns: Column names to convert. Names missing from ``df`` are skipped.

    Returns:
        The same ``df`` object, to allow ``df = convert_percent_columns(df, cols)``.

    Columns that are already numeric are left untouched, so the function can be
    run repeatedly (for example when the notebook cell is re-executed) without
    failing on the ``.str`` accessor. Values that cannot be parsed become NaN.
    """
    for col in columns:
        if col not in df.columns:
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already converted (or numeric from the start): nothing to strip.
            continue
        # Cast to str first so stray non-string entries in an object column do
        # not become NaN just because the .str accessor skips them.
        cleaned = df[col].astype(str).str.strip().str.rstrip('%').str.strip()
        df[col] = pd.to_numeric(cleaned, errors='coerce')
    return df

percentage_columns = ['% Engaged sessions (GA4)', 'Interaction rate']
train_df, test_df = [
    convert_percent_columns(frame, percentage_columns)
    for frame in (train_df, test_df)
]

# Report NaNs right after the percentage conversion (unparseable values become NaN)
for nan_report_frame, nan_report_header in (
    (train_df, "NaN counts in train_df after converting percentage columns:"),
    (test_df, "NaN counts in test_df after converting percentage columns:"),
):
    print_nan_counts(nan_report_frame, nan_report_header)


# Selection of the numerical and categorical model inputs
numerical_features = [
    'Hour_Clicks', 'Hour_Cost', 'Hour_Impr.', 'Keyword_Impr', 'Keyword_Clicks',
    'Keyword_Cost', 'Asset_Impr', 'Asset_word_count', 'Search keyword_word_count'
]

categorical_features = ['Ad_group_x', 'Asset type', 'Day_of_week', 'Hour_of_day', 'Search keyword', 'Asset']

# Force the expected dtypes on both frames.
# NOTE(review): errors='coerce' turns unparseable numbers into NaN, and this
# happens after the NaN report above, so such NaNs are not reported anywhere.
for frame in (train_df, test_df):
    for feature in numerical_features:
        frame[feature] = pd.to_numeric(frame[feature], errors='coerce')

# Categorical inputs are compared as strings by the one-hot encoders.
for frame in (train_df, test_df):
    for feature in categorical_features:
        frame[feature] = frame[feature].astype(str)

# Preprocessing for numerical and categorical features.
# Every numerical column gets its own named scaler:
# (transformer name, scaler class, column).
_numeric_scaling_plan = [
    ('hour_clicks_scaler', StandardScaler, 'Hour_Clicks'),
    ('hour_cost_scaler', StandardScaler, 'Hour_Cost'),
    ('hour_impr_scaler', MinMaxScaler, 'Hour_Impr.'),
    ('keyword_impr_scaler', MinMaxScaler, 'Keyword_Impr'),
    ('keyword_clicks_scaler', StandardScaler, 'Keyword_Clicks'),
    ('keyword_cost_scaler', StandardScaler, 'Keyword_Cost'),
    ('asset_impr_scaler', StandardScaler, 'Asset_Impr'),
    ('asset_word_count_scaler', RobustScaler, 'Asset_word_count'),
    ('search_keyword_word_count_scaler', RobustScaler, 'Search keyword_word_count'),
]
num_transformers = [
    (step_name, scaler_cls(), [column])
    for step_name, scaler_cls, column in _numeric_scaling_plan
]

# One dense one-hot encoder per categorical column; categories not seen
# during fit are encoded as all zeros (handle_unknown='ignore').
cat_transformers = []
for feature in categorical_features:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    cat_transformers.append((f"{feature}_ohe", encoder, [feature]))

# Columns not listed above are dropped; sparse_threshold=0 forces dense output.
preprocessor = ColumnTransformer(
    transformers=[*num_transformers, *cat_transformers],
    remainder='drop',
    sparse_threshold=0,
)

# Random-forest pipeline (preprocessing + regressor).
# The same pipeline object is fitted twice below, first on CTR and then on
# Avg. CPC, so after this cell it holds the Avg. CPC model.
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Both targets are removed from the inputs; the split uses a fixed seed, so the
# CTR and Avg. CPC splits contain the same rows.
model_inputs = train_df.drop(columns=['CTR', 'Avg. CPC'])

# Random forest for CTR
X_train_ctr, X_test_ctr, y_train_ctr, y_test_ctr = train_test_split(
    model_inputs, train_df['CTR'], test_size=0.2, random_state=42
)
rf_pipeline.fit(X_train_ctr, y_train_ctr)
rf_train_score_ctr = rf_pipeline.score(X_train_ctr, y_train_ctr)
rf_test_score_ctr = rf_pipeline.score(X_test_ctr, y_test_ctr)
print(f"Random Forest - CTR Train Score: {rf_train_score_ctr}, Test Score: {rf_test_score_ctr}")

# Random forest for Avg. CPC
X_train_cpc, X_test_cpc, y_train_cpc, y_test_cpc = train_test_split(
    model_inputs, train_df['Avg. CPC'], test_size=0.2, random_state=42
)
rf_pipeline.fit(X_train_cpc, y_train_cpc)
rf_train_score_cpc = rf_pipeline.score(X_train_cpc, y_train_cpc)
rf_test_score_cpc = rf_pipeline.score(X_test_cpc, y_test_cpc)
print(f"Random Forest - Avg. CPC Train Score: {rf_train_score_cpc}, Test Score: {rf_test_score_cpc}")



## Hyperparameter Tuning: Random Forest (Avg. CPC)

In [None]:
# Parameter grid for the random forest; the "regressor__" prefix addresses the
# regressor step of rf_pipeline.
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    # 1.0 means "consider all features at each split", which is what 'auto'
    # meant for RandomForestRegressor. The string 'auto' was removed in
    # scikit-learn 1.3, so every candidate using it would fail to fit.
    'regressor__max_features': [1.0, 'sqrt']
}

# 5-fold grid search on the Avg. CPC training split. GridSearchCV clones
# rf_pipeline for each candidate, so rf_pipeline itself is left unchanged.
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train_cpc, y_train_cpc)

# Report the best parameters and the corresponding CV score
print("Beste Hyperparameter:", grid_search.best_params_)
print("Bester Modell Score (neg_mean_squared_error):", grid_search.best_score_)

# Use the best model. With the default refit=True, best_estimator_ has already
# been refitted on the full training split, so no additional fit is needed.
best_rf_model = grid_search.best_estimator_
print("Verbesserte Test Score für Avg. CPC:", best_rf_model.score(X_test_cpc, y_test_cpc))

In [None]:
# 5-fold cross-validation of the random-forest pipeline on the CTR target
# (cross_val_score fits clones, so rf_pipeline itself is not refitted)
cross_val_scores_ctr = cross_val_score(
    rf_pipeline,
    X_train_ctr,
    y_train_ctr,
    cv=5,
)
print("Cross-Validation Scores für CTR:", cross_val_scores_ctr)
print("Average Cross-Validation Score für CTR:", cross_val_scores_ctr.mean())

In [None]:
# 5-fold cross-validation of the random-forest pipeline on the Avg. CPC target
# (cross_val_score fits clones, so rf_pipeline itself is not refitted)
cross_val_scores = cross_val_score(
    rf_pipeline,
    X_train_cpc,
    y_train_cpc,
    cv=5,
)
print("Cross-Validation Scores für Avg. CPC:", cross_val_scores)
print("Average Cross-Validation Score für Avg. CPC:", cross_val_scores.mean())

In [None]:
# Determine the most important features for hypothesis testing.

# rf_pipeline was last fitted on the Avg. CPC target, so using it for the CTR
# predictions would evaluate the wrong model. Fit an independent copy on CTR;
# clone() leaves rf_pipeline and its fitted state untouched.
rf_model_ctr = clone(rf_pipeline).fit(X_train_ctr, y_train_ctr)

# Metrics for CTR
ctr_pred_train = rf_model_ctr.predict(X_train_ctr)
ctr_pred_test = rf_model_ctr.predict(X_test_ctr)
print("CTR - Mean Absolute Error (Train):", mean_absolute_error(y_train_ctr, ctr_pred_train))
print("CTR - Mean Absolute Error (Test):", mean_absolute_error(y_test_ctr, ctr_pred_test))
print("CTR - Mean Squared Error (Train):", mean_squared_error(y_train_ctr, ctr_pred_train))
print("CTR - Mean Squared Error (Test):", mean_squared_error(y_test_ctr, ctr_pred_test))

# Metrics for Avg. CPC (rf_pipeline's most recent fit used the Avg. CPC target)
cpc_pred_train = rf_pipeline.predict(X_train_cpc)
cpc_pred_test = rf_pipeline.predict(X_test_cpc)
print("Avg. CPC - Mean Absolute Error (Train):", mean_absolute_error(y_train_cpc, cpc_pred_train))
print("Avg. CPC - Mean Absolute Error (Test):", mean_absolute_error(y_test_cpc, cpc_pred_test))
print("Avg. CPC - Mean Squared Error (Train):", mean_squared_error(y_train_cpc, cpc_pred_train))
print("Avg. CPC - Mean Squared Error (Test):", mean_squared_error(y_test_cpc, cpc_pred_test))

# Feature names in the order the preprocessor emits them. Entries without
# get_feature_names_out (the 'drop' remainder placeholder) are skipped
# explicitly; slicing off the last entry would lose a real transformer
# whenever no remainder entry is present.
rf_preprocessor = rf_pipeline.named_steps['preprocessor']
feature_names = [
    name
    for _, transformer, _ in rf_preprocessor.transformers_
    if hasattr(transformer, 'get_feature_names_out')
    for name in transformer.get_feature_names_out()
]

# Feature importances of the random forest held by rf_pipeline (the Avg. CPC model)
feature_importances = rf_pipeline.named_steps['regressor'].feature_importances_

# zip() would silently truncate on a mismatch and mislabel the importances.
if len(feature_names) != len(feature_importances):
    raise ValueError(
        f"{len(feature_names)} feature names but {len(feature_importances)} importances"
    )

# Map names to importances and keep the 25 most important features
importance_dict = dict(zip(feature_names, feature_importances))
sorted_importance = sorted(importance_dict.items(), key=lambda item: item[1], reverse=True)[:25]

# Show the sorted feature importances
print("Top-25 Feature Importance:")
for feature, importance in sorted_importance:
    print(f"{feature}: {importance}")

# Visualise the top-25 feature importances
def plot_feature_importance(importance, names):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance values.
        names: Sequence of feature names, aligned with ``importance``.

    Creates a new 10x8 figure and draws into it; the caller is responsible for
    calling ``plt.show()``. Note that a later cell of this notebook defines
    another function with the same name but a ``(model, title)`` signature.
    """
    # Pair names and values in a frame and order it by descending importance
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    # New figure with one bar per feature
    plt.figure(figsize=(10, 8))
    sns.barplot(x='feature_importance', y='feature_names', data=fi_df)

    # Title and axis labels
    plt.title('Random Forest - Feature Importance')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

# Plot the top-25 feature importances
top_importances = [importance_value for _, importance_value in sorted_importance]
top_feature_names = [feature_name for feature_name, _ in sorted_importance]
plot_feature_importance(top_importances, top_feature_names)
plt.show()


In [None]:
# Gradient-boosting pipelines, one per target
def _build_gb_pipeline():
    """Return an unfitted gradient-boosting pipeline with its own preprocessor.

    Pipeline.fit fits its steps in place, so handing the same ``preprocessor``
    instance to several pipelines would make each fit overwrite the fitted
    state the other pipelines rely on. Cloning gives every pipeline an
    independent, identically configured copy.
    """
    return Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("regressor", GradientBoostingRegressor(n_estimators=100, random_state=42))
    ])


gb_pipeline_ctr = _build_gb_pipeline()
gb_pipeline_cpc = _build_gb_pipeline()

# Gradient boosting for CTR
gb_pipeline_ctr.fit(X_train_ctr, y_train_ctr)
gb_train_score_ctr = gb_pipeline_ctr.score(X_train_ctr, y_train_ctr)
gb_test_score_ctr = gb_pipeline_ctr.score(X_test_ctr, y_test_ctr)
print(f"Gradient Boosting - CTR Train Score: {gb_train_score_ctr}, Test Score: {gb_test_score_ctr}")

# Gradient boosting for Avg. CPC
gb_pipeline_cpc.fit(X_train_cpc, y_train_cpc)
gb_train_score_cpc = gb_pipeline_cpc.score(X_train_cpc, y_train_cpc)
gb_test_score_cpc = gb_pipeline_cpc.score(X_test_cpc, y_test_cpc)
print(f"Gradient Boosting - Avg. CPC Train Score: {gb_train_score_cpc}, Test Score: {gb_test_score_cpc}")

def plot_feature_importance(model, title):
    """Plot the 25 largest feature importances of a fitted pipeline.

    Args:
        model: Fitted pipeline with a ``'regressor'`` step exposing
            ``feature_importances_``. Feature names are read from the
            pipeline's own ``'preprocessor'`` step; the notebook-level
            ``preprocessor`` is used only if the pipeline has no such step.
        title: Title of the figure.

    Creates a new 10x8 figure and shows it. Note that this definition replaces
    the earlier ``plot_feature_importance(importance, names)`` of this notebook.
    """
    steps = model.named_steps
    fitted_preprocessor = steps['preprocessor'] if 'preprocessor' in steps else preprocessor

    # Names in the order the preprocessor emits them. Entries without
    # get_feature_names_out (the 'drop' remainder placeholder) are skipped
    # explicitly; slicing off the last entry would lose a real transformer
    # whenever no remainder entry is present.
    features = [
        name
        for _, transformer, _ in fitted_preprocessor.transformers_
        if hasattr(transformer, 'get_feature_names_out')
        for name in transformer.get_feature_names_out()
    ]
    importances = steps['regressor'].feature_importances_

    # Collect in a frame and keep the 25 most important features
    feature_importances = pd.DataFrame({'Feature': features, 'Importance': importances})
    feature_importances = feature_importances.sort_values(by='Importance', ascending=False).head(25)

    # Horizontal bar chart, most important feature on top
    plt.figure(figsize=(10, 8))
    sns.barplot(data=feature_importances, x='Importance', y='Feature', palette='viridis')
    plt.title(title)
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.tight_layout()
    plt.show()

# Visualise the feature importances for CTR and Avg. CPC
for gb_model, gb_plot_title in (
    (gb_pipeline_ctr, "Feature Importance for CTR"),
    (gb_pipeline_cpc, "Feature Importance for Avg. CPC"),
):
    plot_feature_importance(gb_model, gb_plot_title)

In [None]:
def print_top_features(model, title):
    """Print the five most important features of a fitted pipeline.

    Args:
        model: Fitted pipeline with a ``'regressor'`` step exposing
            ``feature_importances_``. Feature names are read from the
            pipeline's own ``'preprocessor'`` step; the notebook-level
            ``preprocessor`` is used only if the pipeline has no such step.
        title: Label of the target, used in the header line.

    Raises:
        ValueError: If the number of feature names and importances differ.
    """
    steps = model.named_steps
    importances = steps['regressor'].feature_importances_
    fitted_preprocessor = steps['preprocessor'] if 'preprocessor' in steps else preprocessor

    # Names in the order the preprocessor emits them. Entries without
    # get_feature_names_out (the 'drop' remainder placeholder) are skipped
    # explicitly; slicing off the last entry would lose a real transformer
    # whenever no remainder entry is present.
    features = [
        name
        for _, transformer, _ in fitted_preprocessor.transformers_
        if hasattr(transformer, 'get_feature_names_out')
        for name in transformer.get_feature_names_out()
    ]

    # zip() would silently truncate on a mismatch and mislabel the importances.
    if len(features) != len(importances):
        raise ValueError(
            f"{len(features)} feature names but {len(importances)} importances"
        )

    # Pair names with importances, sort descending and print the top 5
    top_features = sorted(zip(features, importances), key=lambda x: x[1], reverse=True)[:5]
    print(f"Top 5 Features for {title}:")
    for rank, (feature, importance) in enumerate(top_features, 1):
        print(f"{rank}. {feature}: {importance:.4f}")

# Assumes gb_pipeline_ctr and gb_pipeline_cpc have already been fitted
for gb_model, target_label in ((gb_pipeline_ctr, "CTR"), (gb_pipeline_cpc, "Avg. CPC")):
    print_top_features(gb_model, target_label)


In [None]:
# Helper for visualising a learning curve
def plot_learning_curve(estimator, title, X, y, axes, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Compute a learning curve with ``learning_curve`` and draw it on ``axes``.

    Args:
        estimator: Estimator passed on to ``learning_curve``.
        title: Title of the subplot.
        X, y: Inputs and target passed on to ``learning_curve``.
        axes: Matplotlib axes to draw on.
        ylim: Optional ``(low, high)`` pair for the y-axis limits.
        cv, n_jobs, train_sizes: Passed on to ``learning_curve`` unchanged.

    Draws the mean training and cross-validation score per training-set size,
    each with a band of one standard deviation. Returns nothing.
    """
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set(title=title, xlabel="Training examples", ylabel="Score")

    # The timing results are requested but not used by the plot
    sizes, fit_scores, cv_scores, _fit_times, _score_times = learning_curve(
        estimator, X, y,
        cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, return_times=True,
    )

    # (mean, std, band colour, line colour, legend label) per curve
    curves = (
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "gray", "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1), "g", "g", "Cross-validation score"),
    )

    axes.grid()
    # Bands first, then lines, so the artists are added in the same order as before
    for mean, std, band_color, _, _ in curves:
        axes.fill_between(sizes, mean - std, mean + std, color=band_color, alpha=0.1)
    for mean, _, _, line_color, label in curves:
        axes.plot(sizes, mean, 'o-', color=line_color, label=label)
    axes.legend(loc="best")

# Learning curves for both models and both targets
fig, axes = plt.subplots(2, 2, figsize=(15, 10))  # 2x2 grid: rows = model, columns = target

# (estimator, subplot title, inputs, target, subplot) per panel:
# random forest in the top row, gradient boosting in the bottom row
learning_curve_panels = (
    (rf_pipeline, "Learning Curve (Random Forest, CTR)", X_train_ctr, y_train_ctr, axes[0, 0]),
    (rf_pipeline, "Learning Curve (Random Forest, Avg. CPC)", X_train_cpc, y_train_cpc, axes[0, 1]),
    (gb_pipeline_ctr, "Learning Curve (GBM, CTR)", X_train_ctr, y_train_ctr, axes[1, 0]),
    (gb_pipeline_cpc, "Learning Curve (GBM, Avg. CPC)", X_train_cpc, y_train_cpc, axes[1, 1]),
)
for panel_estimator, panel_title, panel_X, panel_y, panel_axes in learning_curve_panels:
    # NOTE(review): the fixed y-range hides any score below 0.7
    plot_learning_curve(panel_estimator, panel_title,
                        panel_X, panel_y, axes=panel_axes, ylim=(0.7, 1.01), cv=5, n_jobs=4)

plt.tight_layout()
plt.show()
