Import all the necessary libraries

In [3]:
import pandas as pd
import numpy as np
import string
import re
import pickle

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

stopwords_fr = set(stopwords.words('french'))
lemmatizer = WordNetLemmatizer()

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from gensim.models import KeyedVectors

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

Download Word2Vec model.
Here the link to download it http://vectors.nlpl.eu/repository/

In [4]:
model = KeyedVectors.load_word2vec_format('Path to the model on your computer', binary=True)

Upload train and test sets.The generation process we explained in dataset_generation file https://github.com/preste-ai/rnd-nlp-cot-chatgptdatagen/blob/main/dataset_generation.ipynb

In [5]:
train_set = pd.read_csv('Your train set', index_col=0)

In [6]:
test_set = pd.read_csv('Your test set', index_col=0)

Text preprocessing

In [7]:
# Dictionary of contractions in French language
contractions ={
    "l'" : "le ",
    "d'" : "de ",
    "j'" : "je ",
    "m'" : "me ",
    "qu'" : "que ",
    "n'" : "ne ",
    "s'" : "se "
}

In [8]:
def preprocess_text(column):
    # Lowercase
    column = column.str.lower()
    # remove contractions
    def remove_contractions(column):
        pattern = re.compile(r"\b(" + '|'.join(contractions.keys()) + r")\b")
        return pattern.sub(lambda match: contractions[match.group(0)], column)
    column = column.apply(remove_contractions)
    # Tokenization
    column = column.apply(word_tokenize)
    # Punctuation removal
    column = column.apply(lambda tokens: [token for token in tokens if token not in string.punctuation])
    # stopword removal
    column = column.apply(lambda tokens: [token for token in tokens if token not in stopwords_fr])
    # Lemmatization
    column = column.apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

    return column

Create embeddings

In [9]:
def creating_embeddigs(features):
    documents = []
    skipped_sent = []
    for feature in features:
        word_vectors = []
        for i, word in enumerate(feature):
            
            try: 
                word_vectors.append(np.resize(model[word], (1,100)))
            except KeyError:
                #print(word)
                pass
        try:
            if len(word_vectors) != 0:
                documents.append(np.mean(word_vectors, axis=0))
            else:
                skipped_sent.append(i)
        except ValueError:
            skipped_sent.append(i)
        
    documents = np.concatenate(documents)
    features_embeddings = np.squeeze(documents)
    return features_embeddings, skipped_sent

Split data

In [10]:
def train_test_split(train_set, test_set):
    train_set['clean_features'] = preprocess_text(train_set['Phrases'])
    test_set['clean_features'] = preprocess_text(test_set['text-fr'])
    labels = train_set.loc[:, "labels_code"].values
    labels_for_test = test_set.loc[:, "labels"].values
    features = train_set['clean_features'].tolist()
    features_for_test = test_set['clean_features'].tolist()
    processed_features, skipped_sent_idx = creating_embeddigs(features)
    new_labels = np.delete(labels, skipped_sent_idx)
    processed_features_for_test, skipped_sent_idx_for_test = creating_embeddigs(features_for_test)
    new_labels_for_test = np.delete(labels_for_test, skipped_sent_idx_for_test)
    X_train = processed_features
    y_train = new_labels
    X_test = processed_features_for_test
    y_test = new_labels_for_test
    return X_train, y_train, X_test, y_test

In [11]:
X_train, y_train, X_test, y_test = train_test_split(train_set, test_set)

# SVM

Function to get metrics

In [12]:
def get_metrics(model, y_pred, y_test): 
    test_report = classification_report(y_test, y_pred, output_dict=True)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    return test_report, accuracy, precision, recall, f1

Train the model and get metrics

In [13]:
def run_svm(X_train, y_train, X_test, y_test):
    # Linear SVC:
    text_clf_lsvc = Pipeline([('clf', LinearSVC())])
    text_clf_lsvc.fit(X_train, y_train)
    # Form a prediction set
    prediction_linear_SVC = text_clf_lsvc.predict(X_test)
    test_report, accuracy, precision, recall, f1 = get_metrics(model, prediction_linear_SVC, y_test)
    # Saving model to pickle file
    with open("SVM_with_rationals.pkl", "wb") as file: # file is a variable for storing the newly created file, it can be anything.
        pickle.dump(text_clf_lsvc, file) # Dump function is used to write the object into the created file in byte format.
    return test_report, accuracy, precision, recall, f1

In [14]:
test_report, accuracy, precision, recall, f1 = run_svm(X_train, y_train, X_test, y_test)

In [15]:
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)

Accuracy:  0.9142538975501113
Precision:  0.9193331674304281
Recall:  0.9142538975501113
F1:  0.9139942522535754
