# Track Machine Learning experiments and models


In [None]:
# Peek at the first five rows of the Superstore sales table to check that the
# lakehouse table is reachable and to see which columns it exposes.
preview_query = "SELECT * FROM lakehouseTraining.`superstore sales dataset` LIMIT 5"
df = spark.sql(preview_query)
display(df)

In [None]:
import pandas as pd
import re
from pyspark.sql import SparkSession
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Get the active Spark session, or create one if none is running yet.
spark = SparkSession.builder.appName("LakehouseTraining").getOrCreate()

# Enable Arrow-based columnar transfer so the toPandas() call below is faster.
# The legacy key "spark.sql.execution.arrow.enabled" has been deprecated since
# Spark 3.0; the PySpark-specific key below is its documented replacement.
# NOTE(review): this assumes a Spark 3.x runtime - on Spark 2.x only the legacy
# key has an effect.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Load the entire table into a Spark DataFrame
df = spark.sql("SELECT * FROM lakehouseTraining.`superstore sales dataset`")

# Collect the full table onto the driver as a pandas DataFrame; everything
# after this point runs locally in pandas / scikit-learn, not on the cluster.
pandas_df = df.toPandas()

def _clean_amount(value):
    """Reduce a raw amount to a string that pd.to_numeric can parse.

    Keeps only the digits and decimal points of ``str(value)`` (dropping
    currency symbols, thousands separators and any other text) and restores
    the sign: if a '-' appears before the first digit or decimal point, the
    result is prefixed with '-'. Stripping every non-numeric character
    without this step would silently turn "-$12.50" into a positive 12.50.

    Returns an empty string when the value contains no digits, which
    pd.to_numeric(..., errors='coerce') then maps to NaN.
    """
    text = str(value)
    digits = re.sub(r'[^0-9.]', '', text)
    # Everything in front of the first numeric character, e.g. "-$" or "$-".
    leading = re.split(r'[0-9.]', text, maxsplit=1)[0]
    if digits and '-' in leading:
        return '-' + digits
    return digits


# Clean the 'Sales' column by removing non-numeric characters (sign preserved)
pandas_df['Sales'] = pandas_df['Sales'].apply(_clean_amount)

# Convert 'Sales' and 'Profit' columns to numeric; anything unparseable
# becomes NaN instead of raising
pandas_df['Sales'] = pd.to_numeric(pandas_df['Sales'], errors='coerce')
pandas_df['Profit'] = pd.to_numeric(pandas_df['Profit'], errors='coerce')

# Drop rows where either feature could not be parsed
pandas_df.dropna(subset=['Sales', 'Profit'], inplace=True)

# Standardise both features (zero mean, unit variance) and write the scaled
# values back over the original columns.
feature_cols = ['Sales', 'Profit']
scaler = StandardScaler()
pandas_df[feature_cols] = scaler.fit_transform(pandas_df[feature_cols])

# Isolation Forest configured to flag roughly 1.25% of the rows as outliers;
# the fixed seed keeps the result reproducible between runs.
iso_forest = IsolationForest(contamination=0.0125, random_state=42)

# Train and score on the same plain NumPy matrix (no column names attached).
features = pandas_df[feature_cols].values
iso_forest.fit(features)

# predict() labels inliers as 1 and outliers as -1.
pandas_df['anomaly'] = iso_forest.predict(features)

# Split the rows by the model's verdict: 1 = inlier, -1 = outlier.
is_outlier = pandas_df['anomaly'] == -1
normals = pandas_df[pandas_df['anomaly'] == 1]
anomalies = pandas_df[is_outlier]

print("Number of anomalies detected:", len(anomalies))
# To inspect the flagged rows, print `anomalies` here.

# Scatter both groups on the same axes, normal points first so the red
# anomaly markers are drawn on top of them.
plt.figure(figsize=(10, 6))
for subset, colour, legend_label in (
    (normals, 'blue', 'Normal'),
    (anomalies, 'red', 'Anomaly'),
):
    plt.scatter(subset['Sales'], subset['Profit'], c=colour, label=legend_label, alpha=0.6)

# Axes show the standardised values, not the raw amounts.
plt.xlabel('Sales (Normalized)')
plt.ylabel('Profit (Normalized)')
plt.title('Anomaly Detection in Sales and Profit')
plt.legend()
plt.grid(True)
plt.show()