### This notebook contain additional experiments, that where not covered in the report

In [1]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report
import experimental_predictors as ep

In [2]:
train_df = pd.read_feather("data/train_embedding_bert_swiss_lm.feather")
dev_df = pd.read_feather("data/dev_embedding_bert_swiss_lm.feather")
train_swissbert = pd.read_feather("data/train_embedding_swissbert.feather")
dev_swissbert = pd.read_feather("data/dev_embedding_swissbert.feather")

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
svm_gaussiannb_predictor = ep.SwissDialectPredictorSvmGaussianNB(enable_audio=True,
                                                                 enable_sentance_embedding=True,
                                                                 enable_byte_pair_tfidf=True,
                                                                 normalize_each_vector=True,
                                                                 last_classifier=GaussianNB())
                                                                   #last_classifier=make_pipeline(StandardScaler(), TruncatedSVD(n_components=220), LinearSVC()))
svm_gaussiannb_predictor.fit(train_df)
dev_predicted = svm_gaussiannb_predictor.predict(dev_df)
print(classification_report(dev_df['label'].tolist(), dev_predicted))

              precision    recall  f1-score   support

          BE       0.67      0.62      0.64      1053
          BS       0.81      0.79      0.80      1528
          LU       0.82      0.58      0.68      1017
          ZH       0.62      0.89      0.73       932

    accuracy                           0.72      4530
   macro avg       0.73      0.72      0.71      4530
weighted avg       0.74      0.72      0.72      4530



In [4]:
from byte_pair_tfidf_vectorizer import BytePairTfidfVectorizer

byte_pair_tfidf_vectorizer = BytePairTfidfVectorizer(vocab_size=1000, min_frequency=2)
byte_pair_vectorized = byte_pair_tfidf_vectorizer.fit_transform(train_df["text"].tolist())

svm_linear = make_pipeline(StandardScaler(),TruncatedSVD(n_components=50), KNeighborsClassifier(n_neighbors=20))
svm_linear.fit(byte_pair_vectorized, train_df["label"].tolist())
prediction_dev = svm_linear.predict(byte_pair_tfidf_vectorizer.transform(dev_df["text"].tolist()))
print(classification_report(dev_df["label"].tolist(), prediction_dev))

              precision    recall  f1-score   support

          BE       0.36      0.64      0.46      1053
          BS       0.68      0.37      0.48      1528
          LU       0.48      0.30      0.37      1017
          ZH       0.46      0.59      0.52       932

    accuracy                           0.46      4530
   macro avg       0.50      0.47      0.46      4530
weighted avg       0.52      0.46      0.46      4530



In [5]:
from sklearn.pipeline import make_pipeline
for enable_byte_pair_tfidf in [True, False]:
    for enable_sentence_embedding in [True, False]:
        for enable_audio in [True, False]:
            if not(enable_audio or enable_sentence_embedding or enable_sentence_embedding):
                continue
            print("enable_byte_pair_tfidf: ", enable_byte_pair_tfidf, "enable_sentence_embedding: ", enable_sentence_embedding, "enable_audio: ", enable_audio)
            seperate_gaussian = ep.SwissDialectPredictorSeperateGaussians(  audio_classifier=make_pipeline(GaussianNB()),
                                                                            sentence_embedding_classifier=make_pipeline(
#                                                                               StandardScaler(),
#                                                                               TruncatedSVD(400),
                                                                                GaussianNB()),
                                                                            tfidf_classifier= LinearSVC(),
                                                                normalize_each_vector=False,
                                                                enable_byte_pair_tfidf=enable_byte_pair_tfidf,
                                                                enable_sentance_embedding=enable_sentence_embedding,
                                                                enable_audio=enable_audio,
                                                               last_classifier=GaussianNB(), audio_weight=1)
            seperate_gaussian.fit(train_df)
            #prediction_train = seperate_gaussian.predict(train_df)
            #print(classification_report(train_df["label"].tolist(), prediction_train))
            prediction_dev = seperate_gaussian.predict(dev_df)
            print(f1_score(dev_df["label"].tolist(), prediction_dev, average='macro'))

enable_byte_pair_tfidf:  True enable_sentence_embedding:  True enable_audio:  True
0.6418187485041348
enable_byte_pair_tfidf:  True enable_sentence_embedding:  True enable_audio:  False
0.6798078536178974
enable_byte_pair_tfidf:  True enable_sentence_embedding:  False enable_audio:  True
0.4540299055423549
enable_byte_pair_tfidf:  False enable_sentence_embedding:  True enable_audio:  True
0.46991066031793327
enable_byte_pair_tfidf:  False enable_sentence_embedding:  True enable_audio:  False
0.6666013474895518
enable_byte_pair_tfidf:  False enable_sentence_embedding:  False enable_audio:  True
0.4495228098066112
