In [1]:
# create environment for the deploy
import mlflow
from azureml.core import Workspace
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.webservice import AciWebservice
import uuid
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.model import Model

from random import shuffle

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt 
import seaborn as sns 


# Preprocessing
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score



# Models
from sklearn.ensemble import RandomForestClassifier

# Metrics
import sklearn
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [2]:
# connect to your workspace
ws = Workspace.from_config()

# create experiment and start logging to a new run in the experiment
experiment_name = "detect-lang-rf"

# set up MLflow to track the metrics
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)
mlflow.autolog()

2022/04/12 04:07:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2022/04/12 04:07:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2022/04/12 04:07:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2022/04/12 04:07:02 INFO mlflow.pyspark.ml: No SparkSession detected. Autologging will log pyspark.ml models contained in the default allowlist. To specify a custom allowlist, initialize a SparkSession prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist file via the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf.
2022/04/12 04:07:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


## Function Definitions

In [3]:
# remove some special characters
def remove_special_chars(sen, filter_chars):
    sen = sen.strip()
    sen = sen.lower()
    for each in sen:
        num_ascii = ord(each)
        # delete number, ".", "\", all chars in filter_chars
        if (num_ascii > 47 and num_ascii < 58) or num_ascii == 92 or num_ascii == 46 or (each in filter_chars):
            sen = sen.replace(each, "")
    return sen

# read file csv and convert it to pandasframe
def open_file(name):
    """
    
    """
    with open('{file_name}.csv'.format(file_name = "formatted_data"), encoding='Latin1') as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content] # mỗi

    data = []
    for num, each in enumerate(content):
        each = each.split(";")

        if "." in each[1]:
            sentences = each[1].split(".") 
            filter_chars = ['\t', '!', '"', '%', '&', '*', '+', ',', '-', '/', ':', '=', '?', '@', '[', ']', '§', 
                            '«', "”", "\\", ".", '»']
                    
            for number, sen in enumerate(sentences):
                """
                insert remove special characters

                """

                # filter no meaning words
                sen = remove_special_chars(sen, filter_chars)

                # make sure a sentence have len(sentence) > 0
                if len(sen)>0:
                    data.append([each[0], sen, each[2]])

        else:
            data.append(each)

    main_data = data[1:]
    main_data = shuffle(main_data)
    df = pd.DataFrame(main_data, columns = data[0])
    return df

# get data in a row
def get_data(df, row = 60000):
    return  df.iloc[row][0], df.iloc[row][1]

# vectorize sentences and split it in to train and test file
def vectorization(df, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(list(df["text"]), list(df["language"]), test_size=test_size, random_state=42)

    # vectorize sentence X
    count_vectorizer = CountVectorizer(analyzer='char')
    X_train_features = count_vectorizer.fit_transform(X_train)
    X_test_features = count_vectorizer.transform(X_test)

    # vectorize label Y
    label_encoder = preprocessing.LabelEncoder()
    y_train_features = label_encoder.fit_transform(y_train)
    y_test_features = label_encoder.transform(y_test)
    
    # getted features
    features = count_vectorizer.get_feature_names()
    
    # getted labels
    labels = list(label_encoder.classes_)
    
    return X_train_features, y_train_features, X_test_features, y_test_features, features, labels, count_vectorizer

## Load and Split Data

In [4]:
df = open_file('formatted_data')
X_train_features, y_train_features, X_test_features, y_test_features, features, labels, count_vectorizer = vectorization(df)

In [11]:
df

Unnamed: 0,language,text,length_text
0,es,mi pregunta se refiere en primer lugar a las p...,733658
1,it,soltanto in questo modo le parti lese possono ...,729712
2,da,under nã¦ste runde af eu s strukturfonde vil ...,678400
3,de,schlieãlich fordern sie auch eine verbesserun...,747690
4,sv,men samtidigt ã¤r det viktigt att fã¶rhindra a...,674945
...,...,...,...
78155,da,jeg vil gerne lykã¸nske ham desvã¦rre er han i...,678400
78156,hu,szabã¡lyra hivatkozik,330524
78157,es,en primer lugar la inmunidad de los funcionari...,733658
78158,fi,olen myã¶s jã¤ttã¤nyt eddryhmã¤n puolesta asia...,694523


## Train the Model: Random Forest Classifier

In [5]:
# Create random forest model with the optimal parametter
optimal_modelRF=RandomForestClassifier(n_estimators=300, max_features= 'log2')

#Train the model using the training sets 


In [6]:
# train the model
with mlflow.start_run() as run:
    optimal_modelRF.fit(X_train_features,y_train_features)


In [7]:
# register the model
model_uri = "runs:/{}/model".format(run.info.run_id)
model = mlflow.register_model(model_uri, "lang-det-rf-model")

Registered model 'lang-det-rf-model' already exists. Creating a new version of this model...
2022/04/12 04:12:06 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: lang-det-rf-model, version 2
Created version '2' of model 'lang-det-rf-model'.


In [8]:
# get a curated environment
env = Environment.get(
    workspace=ws, 
    name="myenv",
    version=3
)
env.inferencing_stack_version='latest'

# create deployment config i.e. compute resources
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=2,
    memory_gb=2,
    tags={"data": "languages", "method": "sklearn"},
    description="Predict Language with sklearn",
)

In [None]:
%%time
# get the registered model
model = Model(ws, "lang-det-rf-model")

# create an inference config i.e. the scoring script and environment
inference_config = InferenceConfig(entry_script="score.py", environment=env)

# deploy the service
service_name = "sklearn-langdet-svc-" + str(uuid.uuid4())[:4]
service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)

service.wait_for_deployment(show_output=True)