## Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn
from sklearn.model_selection import train_test_split

## Create Model

In [2]:
# Bring in already cleaned data
train_data = pd.read_csv('train_measurements.csv')
test_data = pd.read_csv('test_measurements.csv')
# Train rows first, then test rows: later cells slice tfidf_projects by len(train_data),
# so this ordering must be preserved.
total_data = pd.concat([train_data, test_data])

# Tfidf
# Learn the vocabulary and IDF weights from the training text only, so that no
# information about the test set leaks into the features. Transforming train and
# test with the same fitted vectorizer already yields the same number of columns
# for both, so fitting on the concatenated data is not required.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_data.lemmatized_sw_text)
tfidf_projects = tfidf_vectorizer.transform(total_data.lemmatized_sw_text) # vectorize text (rows: train, then test)
tfidf_vectorizer.get_feature_names_out()

array(['aa', 'ab', 'abandance', ..., 'µm', 'λcdm', 'μm'], dtype=object)

In [3]:
# Define weights since this is an imbalanced dataset - the number of projects in each band is not uniformly distributed
# Number of training projects per band, ordered by ascending band label so that
# position k in `weights` refers to the k-th smallest band.
band_count = train_data['band'].value_counts().sort_index()

# Derive the number of bands from the data instead of hard-coding it, so the
# weights list always has exactly one entry per band that occurs in training.
n_bands = len(band_count)

# Inverse-frequency weighting: n_samples / (n_bands * n_samples_in_band),
# so rarer bands receive larger weights.
weights = (len(train_data) / (n_bands * band_count)).tolist()

In [4]:
# Model: Multinomial Naive Bayes (works well with text features); the per-band
# weights are supplied as the class priors.
n_train_rows = len(train_data)  # the first n_train_rows rows of tfidf_projects are the training projects
clf = MultinomialNB(class_prior=weights)
clf.fit(tfidf_projects[:n_train_rows], train_data['band'])

In [5]:
# Obtain probabilities of each class for each project
# Rows after the training block of tfidf_projects are the test projects.
test_features = tfidf_projects[len(train_data):len(total_data)]
class_probabilities = clf.predict_proba(test_features)

# For every project, column indices ordered from lowest probability to highest probability
ranked_columns = np.argsort(class_probabilities, axis=1)

# Translate column indices into the band labels the model was trained on.
# The probability columns follow clf.classes_, so looking the indices up there
# gives the real band for each position without hard-coding which bands exist
# (previously: index 0 -> band 1, index i -> band i + 2).
# The name is kept because the next cell reads `sorted_indices`; after this line
# it holds band labels ranked from least likely to most likely.
sorted_indices = clf.classes_[ranked_columns]

In [6]:
# Add project code to ranked band predictions
# Each entry is [project_code, ranked band list] for the test project at the same row position.
project_codes = test_data['project_code']
list_proj = [
    [project_codes.iloc[row], ranking.tolist()]
    for row, ranking in enumerate(sorted_indices)
]

band_predictions = pd.DataFrame(list_proj)
band_predictions.columns = ['project_code', 'band_predictions']
band_predictions = (
    band_predictions
    .drop_duplicates(subset='project_code', keep='first')  # only keep one prediction per project code
    .set_index('project_code')
)

# Save data frame of project and vector of band predictions (least likely to most likely band) as csv
band_predictions.to_csv('band_prediction.csv', index=True)