# In [8]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
import joblib

def load_data(file):
    """Read a CSV and trim stray whitespace from its headers and key columns.

    Column names are stripped first, so headers such as ``" jobtitle "`` can
    be addressed as ``"jobtitle"``. The values of the ``subcategory`` and
    ``jobtitle`` columns are then stripped as well.

    Args:
        file: Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: If, after header stripping, ``subcategory`` or ``jobtitle``
            is not among the columns.
    """
    frame = pd.read_csv(file)
    frame.columns = frame.columns.str.strip()

    # Both text columns get the same treatment: strip surrounding whitespace.
    for column in ('subcategory', 'jobtitle'):
        frame[column] = frame[column].str.strip()

    return frame

def preprocess_text(text):
    """Return ``text`` with every whitespace-separated token stemmed.

    The input is tokenised with ``str.split()`` (any run of whitespace is one
    separator), each token goes through NLTK's ``PorterStemmer.stem``, and the
    stems are re-joined with single spaces.

    Args:
        text: The string to process; it must provide ``.split()``.

    Returns:
        A single space-separated string of stemmed tokens (``''`` when the
        input contains no tokens).
    """
    # A stemmer instance is created on each call; only its bound ``stem``
    # method is needed.
    stem = PorterStemmer().stem
    return ' '.join(map(stem, text.split()))

def train_and_save_model(data_file, model_file, vectorizer_file, test_data_file):
    """Train a TF-IDF + multinomial naive Bayes classifier and persist it.

    Job titles are stemmed with ``preprocess_text`` and split 75/25 into
    training and held-out sets (``random_state=42``). A ``TfidfVectorizer``
    is fitted on the training titles only, and a ``MultinomialNB`` model is
    fitted on the resulting matrix to predict ``subcategory``.

    Args:
        data_file: CSV source handed to ``load_data``; it must provide
            ``jobtitle`` and ``subcategory`` columns.
        model_file: Destination passed to ``joblib.dump`` for the fitted
            classifier.
        vectorizer_file: Destination passed to ``joblib.dump`` for the fitted
            vectorizer.
        test_data_file: Destination passed to ``joblib.dump`` for a dict
            ``{'X_test': ..., 'y_test': ...}`` holding the held-out stemmed
            titles (not vectorised) and their labels.

    Returns:
        None. The three artifacts are written as a side effect.
    """
    # Rows without a title or a label cannot be used: a missing title is a
    # float NaN, on which ``preprocess_text`` fails at ``text.split()``, and a
    # missing label would put NaN into the target vector. Drop them up front.
    # When no values are missing this keeps every row and the original index,
    # so the seeded split below is unaffected.
    df = load_data(data_file).dropna(subset=['jobtitle', 'subcategory'])

    # Name the Series 'jobtitle_processed' so the pickled X_test carries that
    # name.
    X = df['jobtitle'].apply(preprocess_text).rename('jobtitle_processed')
    y = df['subcategory']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Fit the vocabulary/IDF weights on the training portion only; the test
    # portion is saved untransformed for later evaluation.
    vectorizer = TfidfVectorizer()
    X_train_vect = vectorizer.fit_transform(X_train)

    clf = MultinomialNB()
    clf.fit(X_train_vect, y_train)

    joblib.dump(clf, model_file)
    joblib.dump(vectorizer, vectorizer_file)

    test_data = {'X_test': X_test, 'y_test': y_test}
    joblib.dump(test_data, test_data_file)

# Script entry point: train on data.csv and write the model, vectorizer and
# held-out test data next to it.
if __name__ == "__main__":
    train_and_save_model(
        data_file='data.csv',
        model_file='model.pkl',
        vectorizer_file='vectorizer.pkl',
        test_data_file='test_data.pkl',
    )
