# Full data fitting for band classification model

This notebook fits the Multinomial Naive Bayes model to the full dataset (the train and test splits combined; still restricted to line projects, with outliers removed) so that the fitted model can be used in production.

## Import Packages

In [16]:
import pandas as pd
import numpy as np
from joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

## Create Model

In [17]:
# Load the previously cleaned train and test splits.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../../../train_measurements.csv', '../../../test_measurements.csv')
)
# Stack both splits so that a single vectorizer (one shared vocabulary) is fitted on every document.
total_data = pd.concat((train_data, test_data))

# TF-IDF: learn the vocabulary and build the document-term matrix in one pass.
tfidf_vectorizer = TfidfVectorizer()
tfidf_projects = tfidf_vectorizer.fit_transform(total_data['lemmatized_sw_text'])

# Display the learned vocabulary as the cell output.
tfidf_vectorizer.get_feature_names_out()

array(['aa', 'ab', 'abandance', ..., 'µm', 'λcdm', 'μm'], dtype=object)

In [18]:
# Define weights since this is an imbalanced dataset - the number of projects in each band is not uniformly distributed.
# Each band receives the inverse-frequency weight  n_samples / (n_bands * n_samples_in_band).
# The number of bands is derived from the data instead of being hard-coded, so `weights` always has
# exactly one entry per band present in `total_data` (no trailing zeros, no IndexError).
# Bands are ordered by label; scikit-learn also sorts class labels, so weights[i] lines up with clf.classes_[i].
band_count = (
    total_data['band']
    .value_counts()
    .sort_index()
    .rename_axis('band')
    .reset_index(name='count')  # explicit column names: independent of the pandas version
)
n_bands = len(band_count)
weights = (len(total_data) / (n_bands * band_count['count'])).tolist()

In [19]:
# Multinomial Naive Bayes is a good match for text features; the per-band weights
# computed earlier are supplied as the class priors to counter the class imbalance.
clf = MultinomialNB(class_prior=weights)
# Fit on the TF-IDF matrix with the band labels as targets (the fitted estimator is the cell output).
clf.fit(X=tfidf_projects, y=total_data['band'])

## Save models to joblib

In [20]:
# Persist the fitted vectorizer first: the classifier can only score text that has been
# transformed with this exact vocabulary.
dump(value=tfidf_vectorizer, filename='tfidf_vectorizer_naive_bayes.joblib')
# Persist the fitted classifier (the list of written file names is the cell output).
dump(value=clf, filename='naive_bayes_model.joblib')

['naive_bayes_model.joblib']