A **baseline** is a method that uses heuristics, simple summary statistics, randomness, or machine learning to create predictions for a dataset. You can score these predictions to measure the baseline's performance (here, using the mean Levenshtein distance between the predicted and the true plate strings). This score then becomes the reference against which you compare any other machine learning algorithm.

In [1]:
from collections import Counter

import pandas as pd
import numpy as np
import Levenshtein

dataset_path = "../input/moroccoaidatachallengeedition001/"
train_df = pd.read_csv(dataset_path + "train.csv")

# Baseline: predict one constant plate for every image. The plate uses the
# most common plate length and, at each position, the most frequent character
# among the training plates of that length.

# Length of every plate string.
num_digits = train_df["plate_string"].map(len)
# Most common plate length (ties go to the length encountered first).
commun_length = Counter(num_digits).most_common(1)[0][0]

# Characters observed at each position, restricted to plates of the most
# common length. zip(*plates) transposes the plates into one column of
# characters per position.
same_length_plates = train_df.loc[num_digits == commun_length, "plate_string"]
commun_characters = [list(column) for column in zip(*same_length_plates)]

# Counter gives a single pass per position and breaks ties by first
# occurrence, so the resulting plate is the same on every run (iterating a
# set of strings is not order-stable across interpreter runs).
baseline_plate = "".join(
    Counter(characters).most_common(1)[0][0] for characters in commun_characters
)
print(baseline_plate)

# Mean Levenshtein distance between every real plate and baseline_plate.
# The real plates are selected by column name instead of by position in the
# row, so the result does not depend on the column order of train.csv.
train_df["most_freq_plate"] = baseline_plate
print(np.mean([
    Levenshtein.distance(plate, baseline_plate)
    for plate in train_df["plate_string"]
]))

In [1]:
# Prediction: every test row receives the same constant baseline plate,
# and the result is saved as the submission file.
test_csv_path = f"{dataset_path}test.csv"
test_df = pd.read_csv(test_csv_path).assign(plate_string=baseline_plate)
test_df.to_csv("baseline_predictions.csv", index=False)