In [1]:
import joblib
import re
import nltk
import numpy as np
import pandas as pd
from sklearn import feature_selection, model_selection

In [2]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in
                    lst_stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    text = " ".join(lst_text)
    return text

In [3]:
lst_stopwords = nltk.corpus.stopwords.words("english")

In [4]:
import sklearn
sklearn.__version__

'1.2.2'

In [5]:
# !pip install scikit-learn==1.2.2

In [6]:
loaded_vectorizer = joblib.load('tfidf_vectorizer.joblib')
loaded_classifier = joblib.load('random_forest_model.joblib') #had to change to sklearn version 1.2.2

In [7]:
from fastapi import FastAPI
from pydantic import BaseModel
import json
import uvicorn
from pyngrok import ngrok
from fastapi.middleware.cors import CORSMiddleware
import nest_asyncio

In [8]:
app = FastAPI()

In [9]:
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [10]:
class model_input(BaseModel):
    
    text : str

In [11]:
df = pd.read_csv(r'soc_202311261432.csv')
df = df.fillna('N/A')

In [12]:
df['text_clean'] = df['JOB_DUTIES'].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))

In [13]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(df['text_clean'], df['SOC_CODE'], test_size=0.2, random_state=42)

In [14]:
X_train_vectorized = loaded_vectorizer.transform(X_train)

In [15]:
#sh pip list

In [16]:
X_names = loaded_vectorizer.get_feature_names_out()
p_value_limit = 0.99
df_features = pd.DataFrame()
for cat in np.unique(y_train):
    chi2, p = feature_selection.chi2(X_train_vectorized, y_train==cat)
    df_features = pd.concat([df_features, pd.DataFrame({'feature':X_names, 'score':1-p, 'y':cat})])
    df_features = df_features.sort_values(['y','score'], ascending=[True,False])
    df_features = df_features[(df_features['score']>p_value_limit) & (df_features['feature'].str.len() > 10)]


In [17]:
@app.post('/soc_prediction')
def soc_pred(input_parameters: model_input):
    
    input_data = input_parameters.json()
    input_dictionary = json.loads(input_data)
    
    text = input_dictionary['text']

    text = utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords)

    # Vectorize the new text using the same vectorizer
    text_vectorized = loaded_vectorizer.transform([text])

    # Make probability predictions on the test set
    predicted_soc_code = loaded_classifier.predict_proba(text_vectorized)

    # Get the top 5 predicted classes and their confidence scores for each instance
    top5_predictions = np.argsort(-predicted_soc_code[0])[:10]

    top5_labels = [loaded_classifier.classes_[idx] for idx in top5_predictions]
    top5_scores = sorted(predicted_soc_code[0], reverse=True)[:10]
    top5_titles = [df[df['SOC_CODE'] == soc].iloc[0]['SOC_TITLE'] for soc in top5_labels]
    top5_features = [list(df_features[df_features['y']==soc][0:10]['feature'].values) for soc in top5_labels]
    
    return [{'label': label, 'title': title, 'score': score, 'features': feature} for label, title, score, feature in zip(top5_labels, top5_titles, top5_scores, top5_features)]

In [18]:
ngrok_tunnel = ngrok.connect(8000)
print('Public URL:', ngrok_tunnel.public_url)
nest_asyncio.apply()
uvicorn.run(app, port=8000)

INFO:     Started server process [59274]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://52e5-128-229-4-2.ngrok.io
INFO:     54.157.94.67:0 - "POST /soc_prediction HTTP/1.1" 200 OK


/var/folders/cq/41xdcs5s1nz3zlwg6k9p28pr0000gp/T/ipykernel_59274/113787859.py:4: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  input_data = input_parameters.json()
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [59274]


t=2023-12-18T13:57:14-0500 lvl=eror msg="session closed, starting reconnect loop" obj=tunnels.session obj=csess id=c92e822c1f2c err="read tcp 10.253.144.20:49726->3.12.62.205:443: read: can't assign requested address"
t=2023-12-18T13:57:14-0500 lvl=eror msg="failed to reconnect session" obj=tunnels.session obj=csess id=c92e822c1f2c err="failed to dial ngrok server with address \"connect.us.ngrok-agent.com:443\": dial tcp: lookup connect.us.ngrok-agent.com: no such host"
t=2023-12-18T13:57:14-0500 lvl=eror msg="failed to reconnect session" obj=tunnels.session obj=csess id=c92e822c1f2c err="failed to dial ngrok server with address \"connect.us.ngrok-agent.com:443\": dial tcp: lookup connect.us.ngrok-agent.com: no such host"
t=2023-12-18T13:57:15-0500 lvl=eror msg="failed to reconnect session" obj=tunnels.session obj=csess id=c92e822c1f2c err="failed to dial ngrok server with address \"connect.us.ngrok-agent.com:443\": dial tcp: lookup connect.us.ngrok-agent.com: no such host"
t=2023-12-1