# Create training data with training variables function
#### To be merged with current year good_df and then passed into relevant machine learning functions for training. The bad_df will be used as the test_df. 

In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
import pyarrow as pa
import pyarrow.parquet as pq
import gcsfs
import getpass
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import geopandas as gpd
import sgis as sg
import dapla as dp
import datetime
from dapla.auth import AuthClient
from dapla import FileClient

import warnings

import numpy as np

# Shared GCS file-system handle; the data-loading cell below passes it to
# glob and read_parquet.
fs = FileClient.get_gcs_file_system()

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Fetch the data
from imports import *

year = 2022

# hente_training_data is called once for the chosen year; its result is
# unpacked into the three frames used by the rest of the notebook.
# (ml_modeller is presumably provided by the star import above - confirm.)
_hentet = ml_modeller.hente_training_data(year)
training_data, imputatable_df, foretak_pub = _hentet

In [None]:
# Locate the "statistikkfil_foretak_pub" parquet file for the statistics year.
# The year partition follows `year` (set in the fetch-data cell) instead of a
# hard-coded 2022, so this file and the training data always refer to the same
# year.
statistikkfil_pattern = (
    "gs://ssb-strukt-naering-data-produkt-prod/naringer/klargjorte-data/"
    f"statistikkfiler/aar={year}/statistikkfil_foretak_pub.parquet"
)
fil_path = [f for f in fs.glob(statistikkfil_pattern) if f.endswith(".parquet")]

# Fail with a clear message rather than an obscure reader error when the
# file for the requested year does not exist.
if not fil_path:
    raise FileNotFoundError(f"No parquet file matched {statistikkfil_pattern}")

# NOTE: this replaces the foretak_pub frame returned by hente_training_data.
foretak_pub = pd.read_parquet(fil_path, filesystem=fs)

print(foretak_pub.shape)

In [None]:
# Keep only the columns of interest.
# (To list every available column first: print(*foretak_pub.columns, sep="\n"))
_keep_columns = ['omsetning', 'enhets_id', 'sysselsetting_syss', 'naring_f']
foretak_pub = foretak_pub[_keep_columns]

In [None]:
# Preview the first rows of the reduced frame.
foretak_pub.head()

# Three-character prefix of the 'naring_f' code.
# NOTE(review): in the cells visible here, `df` is first assigned in a later
# cell - run that cell before this line.
df['n3'] = df['naring_f'].str[:3]

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Keep rows whose 'naring_f' code starts with '45', '46' or '47'.
# .copy() makes df an independent frame, so adding the 'n3' column below does
# not write into a slice of foretak_pub.
df = foretak_pub[foretak_pub['naring_f'].str[:2].isin(['45', '46', '47'])].copy()

# The first three characters of the code define the group each model is fitted on.
df['n3'] = df['naring_f'].str[:3]

# Per-group results, keyed by 'n3' value.
results = {}

# Fit one simple linear regression (omsetning ~ sysselsetting_syss) per 'n3' group.
for category in df['n3'].unique():
    category_data = df[df['n3'] == category]

    # Feature matrix and target for this group
    X = category_data[['sysselsetting_syss']]
    y = category_data['omsetning']

    model = LinearRegression()
    model.fit(X, y)

    # In-sample predictions (evaluated on the same rows the model was fitted on)
    y_pred = model.predict(X)

    mae = mean_absolute_error(y, y_pred)

    # Store the model, its metric and the vectors needed for the overall MAE
    results[category] = {
        'model': model,
        'MAE': mae,
        'y_true': y,
        'y_pred': y_pred
    }

    # Residuals (predicted minus actual) against the actual values.
    # regplot draws the scatter and a lowess trend of the values as given;
    # residplot would first regress them on x again and plot the residuals
    # of that second fit, not the model's residuals.
    residuals = y_pred - y
    plt.figure(figsize=(10, 6))
    sns.regplot(x=y, y=residuals, lowess=True)
    plt.axhline(0, ls=':', c='.2')
    plt.title(f'Residual Plot for {category}')
    plt.xlabel('Actual Values')
    plt.ylabel('Residuals')
    plt.show()

    # Prediction vs actual
    plt.figure(figsize=(10, 6))
    plt.scatter(y, y_pred)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--')  # Line of perfect prediction
    plt.title(f'Prediction vs Actual for {category}')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.show()

# Overall MAE across all groups.
# mean_absolute_error compares values by position and ignores the pandas
# index, so the true and predicted vectors are both concatenated in the same
# (category) order to keep every pair aligned.
overall_y_true = pd.concat([results[cat]['y_true'] for cat in results])
overall_y_pred = pd.concat([pd.Series(results[cat]['y_pred'], index=results[cat]['y_true'].index) for cat in results])
overall_mae = mean_absolute_error(overall_y_true, overall_y_pred)

print(f'Overall MAE: {overall_mae}')

# MAE per 'n3' group
mae_df = pd.DataFrame({'n3': list(results), 'MAE': [res['MAE'] for res in results.values()]})
print(mae_df)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Step 1: Filter the data
# Keep rows whose 'naring_f' code starts with '45', '46' or '47'.
# .copy() makes df an independent frame, so the columns added below do not
# write into a slice of foretak_pub.
df = foretak_pub[foretak_pub['naring_f'].str[:2].isin(['45', '46', '47'])].copy()

df['n3'] = df['naring_f'].str[:3]

# Step 2: Prepare the data
X = df[['sysselsetting_syss']]  # Feature
y = df['omsetning']                # Target

# Step 3: Train one linear regression model on all filtered rows
model = LinearRegression()
model.fit(X, y)

# In-sample predictions (same rows the model was fitted on)
df['predicted_new_oms'] = model.predict(X)

# Residuals: actual minus predicted
df['residuals'] = df['omsetning'] - df['predicted_new_oms']

# Step 4: Calculate MAE
# Overall MAE
overall_mae = mean_absolute_error(y, df['predicted_new_oms'])
print(f"Overall MAE: {overall_mae}")

# MAE per 'naring_f' code: mean of the absolute residuals within each group.
# The result is a frame with the columns 'naring_f' and 'MAE'.
mae_per_nacef_5 = (
    df['residuals']
    .abs()
    .groupby(df['naring_f'])
    .mean()
    .rename('MAE')
    .reset_index()
)

# Display the MAE per 'naring_f' code
print("MAE per 'nacef_5':")
print(mae_per_nacef_5)

# Step 5: Visualization
# Residual plot: residuals against predicted values, with a zero reference line
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='predicted_new_oms', y='residuals')
plt.axhline(0, color='red', linestyle='--')
plt.title('Residual Plot')
plt.xlabel('Predicted new_oms')
plt.ylabel('Residuals')
plt.show()

# Prediction vs actual, coloured by 'naring_f'
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='omsetning', y='predicted_new_oms', hue='naring_f', palette='viridis')
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--')  # 45-degree line for reference
plt.title('Prediction vs Actual Plot')
plt.xlabel('Actual new_oms')
plt.ylabel('Predicted new_oms')
plt.legend(title='naring_f', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


# Raw relationship between the target and the feature.
# The title used to read 'Residual Plot', but no residuals are drawn here.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='omsetning', y='sysselsetting_syss')
plt.axhline(0, color='red', linestyle='--')
plt.title('omsetning vs sysselsetting_syss')
plt.xlabel('oms')
plt.ylabel('sysselsetting_syss')
plt.show()

In [None]:
# Keep only the training rows whose 'gjeldende_bdr_syss' value exceeds 600.
_over_600 = training_data['gjeldende_bdr_syss'].gt(600)
training_data = training_data[_over_600]

In [None]:
# Lift the column limit so displayed frames show every column.
pd.options.display.max_columns = None

training_data.head()