### Install Fabric Semantic Link

In [None]:
# Install the semantic-link package into this notebook session, then load
# the sempy IPython extension.
%pip install semantic-link
%load_ext sempy

### Import Libraries

In [None]:
# Basic Imports
import pandas as pd
import numpy as np

In [None]:
# Fabric Imports
import sempy.fabric as fabric
import pyspark.sql.functions as F

In [None]:
# Machine Learning Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_curve, roc_auc_score, \
classification_report, accuracy_score, confusion_matrix 
import mlflow

### Get Reference to Power BI Data

In [None]:
# List the datasets returned by fabric.list_datasets() and display the result
# so a dataset name can be picked for the following cells.
df_datasets = fabric.list_datasets()
df_datasets

In [None]:
# Name of the Power BI dataset that every fabric.* call below queries.
dataset = "Retail Analysis"
# MLflow experiment name passed to mlflow.set_experiment() before training.
EXPERIMENT_NAME = "retail-total-sales-prediction"

### Examine What's Available in Power BI Data Model

In [None]:
# Fetch the relationships defined in the dataset and plot them.
from sempy.relationships import plot_relationship_metadata
relationships = fabric.list_relationships(dataset)
plot_relationship_metadata(relationships)

In [None]:
# Show the measures defined in the dataset; two of them are queried below.
fabric.list_measures(dataset)

### Query for Data to Use as the Source for the Machine Learning Model

In [None]:
# Evaluate two measures from the dataset, grouped by store attributes.
# The tabular result is the raw input for the regression model below.
df = fabric.evaluate_measure(
    dataset,
    measure=["Average Unit Price", "TotalSales"],
    groupby_columns=[
        "Store[Name]",
        "Store[Open Month]",
        "Store[Store Type]",
        "Store[Territory]",
        "Store[DistrictID]",
        "Store[Chain]",
        "Store[Open Year]",
    ],
)

# Preview the first rows of the result.
df.head()

In [None]:
# (rows, columns) of the queried data.
df.shape

In [None]:
# Summary statistics of the queried data.
df.describe()

In [None]:
# Inspect the distinct values of two categorical columns that are one-hot
# encoded later. Columns are referenced by bare name here, without the
# `Store[...]` qualifier used in the query.
display(df["Store Type"].unique())
display(df["Chain"].unique())

### Build a Regression Model to Predict the TotalSales Column

In [None]:
# Regression target: the TotalSales column as a NumPy array.
Y = df['TotalSales'].to_numpy()

In [None]:
# One-hot encode the three categorical store attributes; all other columns
# are carried over unchanged.
df_with_dummies = pd.get_dummies(df, columns=['Store Type', 'Territory', 'Chain'])

In [None]:
# Preview the encoded frame.
df_with_dummies.head()

In [None]:
# List the column names produced by the encoding step.
df_with_dummies.columns

In [None]:
# Feature matrix: everything except the target (TotalSales) and two columns
# left out of the model.
# NOTE(review): 'Name' is presumably a per-store identifier and 'Open Month'
# presumably non-numeric -- confirm the reason for excluding them.
# NOTE(review): 'DistrictID' stays in X and is not one-hot encoded above, so
# the model sees it as-is; verify that is intended.
X = df_with_dummies.drop(columns=['Name', 'Open Month', 'TotalSales'])

In [None]:
# Display the feature matrix.
X

In [None]:
# Select the MLflow experiment named by EXPERIMENT_NAME.
mlflow.set_experiment(EXPERIMENT_NAME)
# Enable MLflow autologging, then configure the scikit-learn autologger with
# registered_model_name='retail_regression'.
mlflow.autolog()
mlflow.sklearn.autolog(registered_model_name='retail_regression')

# Fit a linear regression on the full feature matrix. No train/test split is
# applied here: the model is fitted on X, Y and predicts on the same X.
model = LinearRegression()
model.fit(X, Y)
# In-sample predictions, used by the plot at the end of the notebook.
Y_hat = model.predict(X)

In [None]:
# Score the model on the same X, Y it was fitted on, so this is an in-sample
# figure rather than a hold-out estimate.
r2_score = model.score(X, Y)
print('The R-square is: ', r2_score)

In [None]:
# Overlay the kernel density estimate of the actual TotalSales values (red)
# with that of the model's fitted values (blue) on the same axes.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False); both
# draw only the density curve.
ax1 = sns.kdeplot(df['TotalSales'], color="r", label="Actual Value")
sns.kdeplot(Y_hat, color="b", label="Fitted Values", ax=ax1)
# The label= arguments only become visible once a legend is drawn.
ax1.legend()

plt.title('Actual vs Fitted Values for TotalSales')
plt.xlabel('Total Sales')
plt.ylabel('Proportion of Stores')