# Setup

In [1]:
# General
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# Clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

meter_file = 'src_data/_MeteringLineValue__202402102328.csv'
tarif_file = 'src_data/07-02-2024_2500_NP_99.csv'
output_dir_illustrations = './sink_illustrations/'
output_dir_data = './sink_illustrations/'
os.makedirs(output_dir_illustrations, exist_ok=True)
os.makedirs(output_dir_data, exist_ok=True)
meter_data = pd.read_csv(meter_file)
tarif_data = pd.read_csv(tarif_file)

# Exploration
Using data wrangler in VS Code for general exploration reg. distribution, nulls, types etc.

In [2]:
# Visualization setup
sns.set_theme(style="whitegrid")

# Histograms for numerical columns
numerical_cols_1 = meter_data.select_dtypes(include=[np.number]).columns
numerical_cols_2 = tarif_data.select_dtypes(include=[np.number]).columns

for col in numerical_cols_1:
    plt.figure(figsize=(8, 4))
    sns.histplot(meter_data[col].dropna(), kde=True)
    plt.title(f'Distribution of {col} in meter_data')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.savefig(os.path.join(output_dir_illustrations, f'meter_data_distribution_{col}.png'))
    plt.close()

for col in numerical_cols_2:
    plt.figure(figsize=(8, 4))
    sns.histplot(tarif_data[col].dropna(), kde=True)
    plt.title(f'Distribution of {col} in tarif_data')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.savefig(os.path.join(output_dir_illustrations, f'tarif_data_distribution_{col}.png'))
    plt.close()

# Count plots for categorical columns
categorical_cols_1 = meter_data.select_dtypes(include=[object]).columns
categorical_cols_2 = tarif_data.select_dtypes(include=[object]).columns

for col in categorical_cols_1:
    plt.figure(figsize=(8, 4))
    sns.countplot(y=meter_data[col])
    plt.title(f'Distribution of {col} in meter_data')
    plt.xlabel('Count')
    plt.ylabel(col)
    plt.savefig(os.path.join(output_dir_illustrations, f'meter_data_distribution_{col}.png'))
    plt.close()

for col in categorical_cols_2:
    plt.figure(figsize=(8, 4))
    sns.countplot(y=tarif_data[col])
    plt.title(f'Distribution of {col} in tarif_data')
    plt.xlabel('Count')
    plt.ylabel(col)
    plt.savefig(os.path.join(output_dir_illustrations, f'tarif_data_distribution_{col}.png'))
    plt.close()

# Main (explorative) results
## Meter Data
- Consumption
    - 'Id' -> each row with unique Id => no panel data
    - 'Value' -> looks like a standard distribution (bisschen linkschief)
- Time
    - 'TimestampUtc' 2023-10-12 -> to 2024-02-09 ; October to February
    - 'CreatedAt' -> similar to TimestampUtc ; 2023-10-14 to 2024-02-10
    - 'LastUpdatedAt' -> sometimes identical to 'CreatedAt'
- Other
    - 'MeteringLineId' -> Only one unique entry (FK?)
    - 'Origin' -> only value 'Getec'
    
## Tarif Data
- Pricing
    - 'TotalPrice_eur' -> coarsly 2 cluster (maybe 4 finer ones)
    - 'Grundpreis_eur_per_month' -> two clusters
    - 'Arbeitspreis_cent_per_kWh' -> looks like 3 standard deviation -> *clustering potentially interesting*
    - 'Consumption_kWh' -> perfectly flat distribution
    - 'PriceGuaranteeMonths' -> perfectly flat distribution
- Competitors
    - 'ProviderName' -> most common 'idealenergie'
    - 'TariffName' -> most common 'heim premium'
    - 'Ranking' -> effectively just a window function
- Geographical
    - 'City' -> Most frequent: Berlin
    - 'ZIP_code' -> extends above
- Time
    - LastUpdatedAt_UNIX -> (needs converting) just one value
- Other
    - 'ResultType' -> one value repeats "NP"

# Next Steps
- Clarifiy Data
- Enrich features (e.g. weather patterns, holidays, vacation times, geographic features, sociodemographic features...)
- Modelling
    - Meter Data: *not recommended*
    - Tarif Data: Cluster Tarifs, determine in which cluster(s) to compete
- General approach
    - Business: Clear strategy is required and be underlined by kpi's, data and technical infrastructure
    - Technical: Expand & Extend data


# Cluster Tarif Data (ex-post)

In [58]:


# Step 1: Preprocess the Data
def preprocess_data(df):
    # Convert LastUpdatedAt_UNIX to datetime
    df['LastUpdatedAt'] = pd.to_datetime(df['LastUpdatedAt_UNIX'], unit='s')
    df.drop(columns=['LastUpdatedAt_UNIX'], inplace=True)

    # Handle any missing values if necessary (optional)
    df.dropna(inplace=True)

    return df

# Step 2: Feature Selection
def select_features(df):
    features = df[['TotalPrice_eur', 'Grundpreis_eur_per_month', 'Arbeitspreis_cent_per_kWh', 'Ranking']]
    return features

# Step 3: Normalize the Features
def normalize_features(features):
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)
    return scaled_features

# Step 4: Clustering
def apply_clustering(scaled_features, n_clusters=4):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(scaled_features)
    return clusters

# Step 5: Analysis
def analyze_clusters(df, clusters):
    df['Cluster'] = clusters
    cluster_analysis = df.groupby('Cluster').mean()
    return cluster_analysis

# Load the data (assuming 'tarif_data' is already loaded as a pandas dataframe)
tarif_data = pd.read_csv('path_to_tarif_data.csv')  # Replace with your actual file path

# Preprocess the data
tarif_data = preprocess_data(tarif_data)

# Select relevant features
features = select_features(tarif_data)

# Normalize the features
scaled_features = normalize_features(features)

# Apply clustering
clusters = apply_clustering(scaled_features)

# Analyze clusters
cluster_analysis = analyze_clusters(tarif_data, clusters)

# Print the cluster analysis
print(cluster_analysis)

# Visualize the clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(data=tarif_data, x='Grundpreis_eur_per_month', y='Arbeitspreis_cent_per_kWh', hue='Cluster', palette='viridis')
plt.title('Clustering of Tariff Data')
plt.xlabel('Grundpreis (EUR per month)')
plt.ylabel('Arbeitspreis (cent per kWh)')
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_tarif_data.csv'

In [57]:
pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag