# Connect to Azure ML workspace

In [1]:
import azureml.core
from azureml.core import Workspace, Datastore, Dataset

workspace_name = 'sd-ml'
subscription_id = 'c374c749-c070-4b3b-9fb4-40a657b1d4a5' # subscription id of ADLS account
resource_group = 'rs-sd-learn-sth-new' # resource group of ADLS account

workspace = Workspace.get(
    name = workspace_name,
    subscription_id = subscription_id,
    resource_group = resource_group
)

print ("workspace to be used: " + workspace.name)


workspace to be used: sd-ml


# Get or set up datastore

In [2]:

datastore_name = 'learnsthnew_datastore'
filesystem = 'learnsthnew'

subscription_id = 'c374c749-c070-4b3b-9fb4-40a657b1d4a5' # subscription id of ADLS account
resource_group = 'rs-sd-learn-sth-new' # resource group of ADLS account

account_name = 'sdsalearnsthnew' # ADLS Gen2 account name
tenant_id = '680b5d20-b41e-46f8-a077-f482d0c64dbb' # tenant id of service principal
client_id = '6df9c689-4854-45ec-a9c2-55194c54c511' # client id of service principal
client_secret = 'Is5A5Jctw1~Ge-hi4EOS_RmRahG5_s43F4' # the secret of service principal

try:
    datastore = Datastore.get(
        workspace = workspace,
        datastore_name = datastore_name
    )
except Exception as exc:
    datastore = Datastore.register_azure_data_lake_gen2(
        workspace = workspace,
        subscription_id = subscription_id,
        resource_group = resource_group,
        datastore_name = datastore_name,
        account_name = account_name, # ADLS Gen2 account name
        filesystem = filesystem, # ADLS Gen2 filesystem
        tenant_id = tenant_id, # tenant id of service principal
        client_id = client_id, # client id of service principal
        client_secret = client_secret # the secret of service principal
    )

print ("datastore to be used: " + datastore.name)


datastore to be used: learnsthnew_datastore


# Get or set up dataset

In [3]:
from azureml.core import Workspace, Datastore, Dataset

datastore_name = 'learnsthnew_datastore'
dataset_name = 'playlist_statistics'
dataset_description = 'radio song playlist statistics'

try:
    dataset = Dataset.get_by_name(
        workspace = workspace, 
        name = dataset_name
    )
except Exception as exc:
    datastore_paths = [(datastore, '/analytics/playlist_statistics.parquet/*.parquet')]
    dataset = Dataset.Tabular.from_parquet_files(path=datastore_paths, validate=False)
    dataset = dataset.register(
        workspace=workspace,
        name = dataset_name,
        description = dataset_description
    )

print ("dataset to be used: " + dataset.name)

dataset to be used: playlist_statistics


# Read data and convert dataset to pandas dataframe

In [4]:
df = dataset.to_pandas_dataframe()

# Check sample 5 rows of current dataset

In [5]:
df.head()

Unnamed: 0,radio_name,month_name,artist_and_title,played
0,RMFFM,January,#razemrobimydobro - Razem,0
1,RMFFM,January,2+1 - Chodź Pomaluj Mój Świat,0
2,RMFFM,January,Abba - Mamma Mia,1
3,RMFFM,January,Abc - The Look Of Love,0
4,RMFFM,January,Ac/dc - Thunderstruck,0


# Split the dataset into features and labels

Features - attributes that will be used to predict the target value, please note that here we are using the encoded attributes and not the original ones

Label - the label of each row, also the value that will be predicted

In [6]:
# Separate features and labels
features = ["radio_name", "month_name", "artist_and_title"]
label = 'played'
X, y = df[features].values, df[label].values

# Split dataset into train and test subsets

In [7]:
from sklearn.model_selection import train_test_split

# Split data 70%-30% into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

print ('Training cases: %d\nTest cases: %d' % (X_train.shape[0], X_test.shape[0]))

Training cases: 19237748
Test cases: 8244750


# Create prediction model using pipeline, LogisticRegression and validate the model

* **Pipeline** - pipeline in that case will allow us to encode the categorical features during the preprocessing phase, than we will be able to provide raw data to the model and the encoding will be handled in the model itself
* **Accuracy classification score** - In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.
* **Other metrics** - Precision, Recall, F1-Score, Support, etc, please read more in that area
* **Confusion matrix** - Compute confusion matrix to evaluate the accuracy of a classification.
![alt text](img/confusion_matrix.png)


In [8]:
# ordinal that is used here is still no the best choice, but should be good enough in that case
# for more see: https://www.kaggle.com/discdiver/category-encoders-examples (as an example)

# Train the model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Define preprocessing for categorical features (encode the Age column)
categorical_features = [0,1]
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

categorical_features_song = [2]
categorical_transformer_song = Pipeline(steps=[
    ('labelenc', OrdinalEncoder())])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('catsong', categorical_transformer_song, categorical_features_song)
    ])

# Create preprocessing and training pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('logregressor', DecisionTreeClassifier())])


# fit the pipeline to train a logistic regression model on the training set
model = pipeline.fit(X_train, (y_train))

predictions = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))

Accuracy:  0.9571007004457383
[[7884085    5635]
 [ 348059    6971]]


# Use the model to check (predict) if given song will be played

In [9]:
predict_a = model.predict([["RMFFM","June","Rotary - Na Jednej Z Dzikich Plaż"]])
predict_b = model.predict([["RMFFM","December","Rotary - Na Jednej Z Dzikich Plaż"]])
predict_c = model.predict([["RMFFM","June","Wham! - Last Christmas"]])
predict_d = model.predict([["RMFFM","December","Wham! - Last Christmas"]])

print (f"Predict results using model 'a' for radio: RMFFM, month: June, artist and title: Rotary - Na Jednej Z Dzikich Plaż, results {str(predict_a[0])}" )
print (f"Predict results using model 'a' for radio: RMFFM, month: December, artist and title: Rotary - Na Jednej Z Dzikich Plaż, results {str(predict_b[0])}" )
print (f"Predict results using model 'a' for radio: RMFFM, month: June, artist and title: Rotary - Wham! - Last Christmas, results {str(predict_c[0])}" )
print (f"Predict results using model 'a' for radio: RMFFM, month: December, artist and title: Rotary - Wham! - Last Christmas, results {str(predict_d[0])}" )

Predict results using model 'a' for radio: RMFFM, month: June, artist and title: Rotary - Na Jednej Z Dzikich Plaż, results 1
Predict results using model 'a' for radio: RMFFM, month: December, artist and title: Rotary - Na Jednej Z Dzikich Plaż, results 0
Predict results using model 'a' for radio: RMFFM, month: June, artist and title: Rotary - Wham! - Last Christmas, results 0
Predict results using model 'a' for radio: RMFFM, month: December, artist and title: Rotary - Wham! - Last Christmas, results 1
