In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [2]:
# Load the training table; the path is relative to the current working directory.
data = pd.read_csv('train_data.csv')

In [3]:
# Preview the first rows to see what each column holds.
data.head()

Unnamed: 0,path,transcription,action,object,location
0,wavs/speakers/xRQE5VD7rRHVdyvM/7372ca00-45c4-1...,Turn on the kitchen lights,activate,lights,kitchen
1,wavs/speakers/R3mexpM2YAtdPbL7/dae28110-44fe-1...,Turn up the temperature,increase,heat,none
2,wavs/speakers/ZebMRl5Z7dhrPKRD/b55dcfd0-455d-1...,OK now switch the main language to Chinese,change language,Chinese,none
3,wavs/speakers/ppzZqYxGkESMdA5Az/61c54a20-4476-...,Turn down the bathroom temperature,decrease,heat,washroom
4,wavs/speakers/zaEBPeMY4NUbDnZy/8ef57ec0-44df-1...,Change the language,change language,none,none


In [4]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11566 entries, 0 to 11565
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   path           11566 non-null  object
 1   transcription  11566 non-null  object
 2   action         11566 non-null  object
 3   object         11566 non-null  object
 4   location       11566 non-null  object
dtypes: object(5)
memory usage: 451.9+ KB


In [5]:
# Count missing values per column before any text processing.
data.isna().sum()

path             0
transcription    0
action           0
object           0
location         0
dtype: int64

In [6]:
def preprocess_text(text):
    """Normalise a piece of text for vectorisation.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, the result is split with NLTK's ``word_tokenize`` and the tokens
    are joined back together with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated string.
    """
    # Translation table whose only job is to delete punctuation characters.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)
    return ' '.join(word_tokenize(cleaned))

In [7]:
# Clean every transcription with preprocess_text (overwrites the column in place).
data['transcription'] = data['transcription'].map(preprocess_text)
# Fuse the three intent slots into one "action_object_location" class label.
data['label'] = data['action'].str.cat([data['object'], data['location']], sep='_')

In [8]:
# Hold out a validation set (train_test_split's default size, since none is given);
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): rows are split individually, so if the same transcription text occurs
# on several rows it can land in both train and validation and inflate the scores —
# confirm whether the data contains repeated transcriptions.
X_train, X_valid, y_train, y_valid = train_test_split(
    data['transcription'],
    data['label'],
    random_state=42,
)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Fit the TF-IDF vectorizer on the training text only, then apply the fitted
# vectorizer to the validation text.
# NOTE(review): X_train / X_valid are rebound from raw text to feature matrices
# here, so this cell presumably cannot be re-run without re-running the split
# cell first — confirm before relying on re-execution.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_valid = vectorizer.transform(X_valid)

# Encode the string labels as integers, fitting on the training labels only;
# the same fitted encoder is then applied to the validation labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.transform(y_valid)

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train a linear-kernel support vector classifier on the TF-IDF features.
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Score the held-out validation set.
y_pred = clf.predict(X_valid)

acc = accuracy_score(y_valid, y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
precision, recall, f1 = (
    metric(y_valid, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
print(f'Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}')


Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1-score: 1.00


In [11]:
# Push one sample command through the same steps used for training:
# clean -> vectorise -> classify -> decode the integer class back to its label.
new_text = preprocess_text('turn on the lights')
new_text_vec = vectorizer.transform([new_text])
predicted_label = encoder.inverse_transform(clf.predict(new_text_vec))
print(predicted_label[0])


activate_lights_none


In [12]:
def Predicted_Result(new_text):
    """Classify a command and print its action, object and location.

    The text is cleaned with ``preprocess_text`` (the same cleaning applied to
    the training transcriptions), vectorised with the fitted ``vectorizer``,
    classified by ``clf`` and decoded with ``encoder``. The decoded label is
    split on ``"_"`` and its first three parts are printed.

    Args:
        new_text: The raw command text to classify.

    Returns:
        None. The three parts are printed to standard output.

    Raises:
        IndexError: If the decoded label has fewer than three "_"-separated parts.
    """
    # Vectorise the *cleaned* text so inference matches how the model was trained.
    cleaned_text = preprocess_text(new_text)
    new_text_vec = vectorizer.transform([cleaned_text])
    predicted_label = encoder.inverse_transform(clf.predict(new_text_vec))
    # Labels were built as "action_object_location", so split them back apart.
    parts = predicted_label[0].split("_")
    print("Action: ", parts[0])
    print("Object: ", parts[1])
    print("Location: ", parts[2])

In [13]:
# Try the helper on a sample command.
Predicted_Result("turn on the lights")

Action:  activate
Object:  lights
Location:  none


In [14]:
import joblib

# Save the fitted vectorizer, label encoder and classifier in one file: all
# three are needed to turn new text into a decoded prediction. The previous
# cell only referenced joblib.dump without calling it, so nothing was saved.
joblib.dump(
    {'vectorizer': vectorizer, 'encoder': encoder, 'model': clf},
    'intent_classifier.joblib',
)

['intent_classifier.joblib']