In [1]:
import joblib
import re
import string

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

In [4]:
data = pd.read_csv('100Rows4ColsBusinessAnalyst.csv', encoding = 'unicode_escape', index_col=False)
list(data.columns)

['index', 'Job Title', 'Job Description', 'Unnamed: 3', 'Industry', 'Sector']

In [5]:
df = data.drop(["index", "Unnamed: 3"], axis='columns')

In [6]:
df.head(5)

Unnamed: 0,Job Title,Job Description,Industry,Sector
0,Business Analyst - Clinical & Logistics Platform,Company Overview\n\n\nAt Memorial Sloan Ketter...,Health Care Services & Hospitals,Health Care
1,Business Analyst,We are seeking for an energetic and collaborat...,Venture Capital & Private Equity,Finance
2,Data Analyst,"For more than a decade, Asembia has been worki...",Biotech & Pharmaceuticals,Biotech & Pharmaceuticals
3,"Information Security Analyst, Incident Response",Job Description Summary\nThe Information Secur...,Health Care Products Manufacturing,Manufacturing
4,Analyst - FP&A Global Revenue,Magnite is the world's largest independent sel...,Internet,Information Technology


In [7]:
len(df)

101

In [9]:
df['Industry'].isna().sum()

10

In [14]:
s = df['Industry'].unique()
s

array(['Health Care Services & Hospitals',
       'Venture Capital & Private Equity', 'Biotech & Pharmaceuticals',
       'Health Care Products Manufacturing', 'Internet',
       'Insurance Agencies & Brokerages',
       'Architectural & Engineering Services', 'Consulting',
       'Investment Banking & Asset Management',
       'Computer Hardware & Software',
       'Enterprise Software & Network Solutions', 'K2 Education',
       'Food & Beverage Manufacturing',
       'Financial Transaction Processing', 'Lending',
       'TV Broadcast & Cable Networks', 'Staffing & Outsourcing', nan,
       'Banks & Credit Unions', 'Radio', 'IT Services', 'Stock Exchanges',
       'Advertising & Marketing', 'Health, Beauty, & Fitness',
       'Brokerage Services', 'News Outlet',
       'Fast-Food & Quick-Service Restaurants',
       'Beauty & Personal Accessories Stores', 'Publishing',
       'Insurance Carriers', 'Accounting'], dtype=object)

In [15]:
len(s)

31

In [16]:
t = df['Sector'].unique()
t

array(['Health Care', 'Finance', 'Biotech & Pharmaceuticals',
       'Manufacturing', 'Information Technology', 'Insurance',
       'Business Services', 'Education', 'Media', nan,
       'Consumer Services', 'Restaurants, Bars & Food Services', 'Retail',
       'Accounting & Legal'], dtype=object)

In [17]:
len(t)

14

In [18]:
categories = t
categories

array(['Health Care', 'Finance', 'Biotech & Pharmaceuticals',
       'Manufacturing', 'Information Technology', 'Insurance',
       'Business Services', 'Education', 'Media', nan,
       'Consumer Services', 'Restaurants, Bars & Food Services', 'Retail',
       'Accounting & Legal'], dtype=object)

In [None]:
# news_group_data = fetch_20newsgroups(
#    subset="all", remove=("headers", "footers", "quotes"), categories=categories
# )

In [None]:
# df = pd.DataFrame(
#    dict(
#        text=news_group_data["data"],
#        target=news_group_data["target"]
#    )
#)

In [20]:
df["target"] = df['Sector']
df["target"]

0                    Health Care
1                        Finance
2      Biotech & Pharmaceuticals
3                  Manufacturing
4         Information Technology
                 ...            
96             Business Services
97                     Insurance
98                     Insurance
99        Information Technology
100            Business Services
Name: target, Length: 101, dtype: object

In [21]:
df.rename(columns = {'Job Title':'Job_Title', 'Job Description':'Job_Description'
                              }, inplace = True)

In [22]:
list(df.columns)

['Job_Title', 'Job_Description', 'Industry', 'Sector', 'target']

In [23]:
def process_text(text):
    text = str(text).lower()
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", " ", text
    )
    text = " ".join(text.split())
    return text

df["clean_text"] = df.Job_Description.map(process_text)

In [24]:
df["clean_text"] 

0      company overview at memorial sloan kettering m...
1      we are seeking for an energetic and collaborat...
2      for more than a decade asembia has been workin...
3      job description summary the information securi...
4      magnite is the world s largest independent sel...
                             ...                        
96     job title business analyst location albany ny ...
97     guy carpenter is seeking candidates for the fo...
98     job descriptionjob descriptionwe re adding to ...
99     new york state immunization information system...
100    senior business analyst who can perform the ro...
Name: clean_text, Length: 101, dtype: object

In [28]:
df_na = df[df['Sector'].isna()]
df_na

Unnamed: 0,Job_Title,Job_Description,Industry,Sector,target,clean_text
27,Business Analyst,Job Description\nFree Online Training & Placem...,,,,job description free online training placement...
38,Business Analyst,Business Analyst ( for Technical Writing Role)...,,,,business analyst for technical writing role co...
55,Business Analyst,Job Description\nExperience\n\nThe successful ...,,,,job description experience the successful cand...
58,Entry Level Analyst,Are you about to graduate and get your career ...,,,,are you about to graduate and get your career ...
64,Business Analyst,SquarePeg is working with a premier financial ...,,,,squarepeg is working with a premier financial ...
76,Business Analyst,The Business Analyst (BA) works with stakehold...,,,,the business analyst ba works with stakeholder...
78,Senior Business Analyst,"Flip the Script on Rx Prices\n\nAt Flipt, we a...",,,,flip the script on rx prices at flipt we are d...
85,Business Analyst - IT,SoHo Dragon represents an Investment Bank with...,,,,soho dragon represents an investment bank with...
87,Business Analyst,The Business Analyst (BA) works with stakehold...,,,,the business analyst ba works with stakeholder...
89,Business Analyst,Our client is conducting a search for an exper...,,,,our client is conducting a search for an exper...


In [33]:
cond = df['Sector'].isin(df_na['Sector'])
df_not_na = df.drop(df[cond].index)

df_not_na

Unnamed: 0,Job_Title,Job_Description,Industry,Sector,target,clean_text
0,Business Analyst - Clinical & Logistics Platform,Company Overview\n\n\nAt Memorial Sloan Ketter...,Health Care Services & Hospitals,Health Care,Health Care,company overview at memorial sloan kettering m...
1,Business Analyst,We are seeking for an energetic and collaborat...,Venture Capital & Private Equity,Finance,Finance,we are seeking for an energetic and collaborat...
2,Data Analyst,"For more than a decade, Asembia has been worki...",Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,for more than a decade asembia has been workin...
3,"Information Security Analyst, Incident Response",Job Description Summary\nThe Information Secur...,Health Care Products Manufacturing,Manufacturing,Manufacturing,job description summary the information securi...
4,Analyst - FP&A Global Revenue,Magnite is the world's largest independent sel...,Internet,Information Technology,Information Technology,magnite is the world s largest independent sel...
...,...,...,...,...,...,...
96,Business Analyst,"Job Title: Business Analyst\nLocation: Albany,...",Advertising & Marketing,Business Services,Business Services,job title business analyst location albany ny ...
97,Business Analyst,Guy Carpenter is seeking candidates for the fo...,Insurance Agencies & Brokerages,Insurance,Insurance,guy carpenter is seeking candidates for the fo...
98,Business Analyst,Job DescriptionJob DescriptionWe're adding to ...,Insurance Carriers,Insurance,Insurance,job descriptionjob descriptionwe re adding to ...
99,Business Analyst,New York State Immunization Information System...,Computer Hardware & Software,Information Technology,Information Technology,new york state immunization information system...


In [34]:
df_not_na['Sector'].isna().sum()

0

In [35]:
df_not_na.to_csv('not_na.csv')

In [39]:
df_not_na.isnull().sum()

Job_Title          0
Job_Description    0
Industry           0
Sector             0
target             0
clean_text         0
dtype: int64

In [43]:
u = df_not_na['target'].unique()
u

array(['Health Care', 'Finance', 'Biotech & Pharmaceuticals',
       'Manufacturing', 'Information Technology', 'Insurance',
       'Business Services', 'Education', 'Media', 'Consumer Services',
       'Restaurants, Bars & Food Services', 'Retail',
       'Accounting & Legal'], dtype=object)

In [40]:
%debug

df_train, df_test = train_test_split(df_not_na, test_size=0.20, stratify=df_not_na.target)

> [1;32md:\programdata\anaconda3\lib\site-packages\sklearn\utils\validation.py[0m(122)[0;36m_assert_all_finite[1;34m()[0m
[1;32m    120 [1;33m    [1;32melif[0m [0mX[0m[1;33m.[0m[0mdtype[0m [1;33m==[0m [0mnp[0m[1;33m.[0m[0mdtype[0m[1;33m([0m[1;34m"object"[0m[1;33m)[0m [1;32mand[0m [1;32mnot[0m [0mallow_nan[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    121 [1;33m        [1;32mif[0m [0m_object_dtype_isnan[0m[1;33m([0m[0mX[0m[1;33m)[0m[1;33m.[0m[0many[0m[1;33m([0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m--> 122 [1;33m            [1;32mraise[0m [0mValueError[0m[1;33m([0m[1;34m"Input contains NaN"[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    123 [1;33m[1;33m[0m[0m
[0m[1;32m    124 [1;33m[1;33m[0m[0m
[0m
ipdb> up
> [1;32md:\programdata\anaconda3\lib\site-packages\sklearn\utils\validation.py[0m(800)[0;36mcheck_array[1;34m()[0m
[1;32m    798 [1;33m[1;33m[0m[0m
[0m[1;32m 

ipdb> clear
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user


ValueError: Input contains NaN

In [44]:
df_train, df_test = train_test_split(df_not_na, test_size=0.20)  #, stratify=df_not_na.target

In [45]:
vec = CountVectorizer(
    ngram_range=(1, 3), 
    stop_words="english",
)

X_train = vec.fit_transform(df_train.clean_text)
X_test = vec.transform(df_test.clean_text)

y_train = df_train.target
y_test = df_test.target

In [46]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

preds = nb.predict(X_test)
print(classification_report(y_test, preds))

                           precision    recall  f1-score   support

Biotech & Pharmaceuticals       0.00      0.00      0.00         1
        Business Services       0.67      0.29      0.40         7
                  Finance       0.13      1.00      0.24         2
   Information Technology       0.00      0.00      0.00         5
                Insurance       1.00      1.00      1.00         1
            Manufacturing       0.00      0.00      0.00         1
                    Media       0.00      0.00      0.00         2

                 accuracy                           0.26        19
                macro avg       0.26      0.33      0.23        19
             weighted avg       0.31      0.26      0.22        19



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
joblib.dump(nb, "nb.joblib")
joblib.dump(vec, "vec.joblib")

['vec.joblib']

In [48]:
nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")

sample_text = ["Space, Stars, Planets and Astronomy!"]
# Process the text in the same way you did when you trained it!
clean_sample_text = process_text(sample_text)
sample_vec = vec_saved.transform(sample_text)
nb_saved.predict(sample_vec)

array(['Finance'], dtype='<U33')