In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import StandardScaler
from werkzeug.wrappers import  Response
from flask import Flask,jsonify,request
import matplotlib.pyplot as plt
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
import pickle as pk


from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_log_error, classification_report
import category_encoders as ce
from sklearn.model_selection import GridSearchCV,cross_val_score,KFold

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier


# Overview 

## Industry Classification Problem

Job industry is the category or general field in which you work. On a job application, "industry" refers to a broad category under which a number of "job titles" can fall. For example,IT is an industry for some job titles like 'System administrator', 'Python developer' and 'Data engineer', Education is an industry for some job titles like 'English teacher', 'Machine learning instructor' and so on.

This is original NLP Problem, the problem is supervised text classification, and our goal is to investigate which supervised machine learning methods are best suited to solve it. Given a new job title that comes in, we want to assign it to one of 4 industry categories.
The classifier makes the assumption that each new complaint is assigned to one and only one category. This is multi-class text classification problem.

## Dataset Used

Our dataset has two variables (job title & industry) in a csv format of more than 8,500 samples.
The dataset is imbalanced (Imbalance means that the number of data points available for different classes is different).
It is available under this link https://drive.google.com/file/d/1W_MO19MlDDUn0qCfxEaVxGKKlKHsFFly/view

## Cleaning the data.

After incepecting the dataframe i found there is 4 category in industry, and 3890 unique job in job title.
I started to display unique words in job title and i found there is a lot of punctuations and stop words.
I started to clean the data by counting the punctuations and stop words in each row and store it in extra two columns to incepect if there is a relation between industry and punctuations count or stop words count.
After that i removed punctuations and stop words from job title to clean it for vectorizer, i used TfidfVectorizer to vectorize job title, and then i removed the duplicates that produced the final dataframe.  

##  Data Imbalance

Removing duplicates partialy solved this problem by decreasing the major class 'IT' a little, So i set class_weights  parameter to 'balanced' before training the models.


## Trying couble of classifiers

I tried 6 classifiers:
   * SGDClassifier
   * KNeighbors 
   * DecisionTree
   * RandomForest
   * ExtraTree
   * MultinomialNB

I dont have a lot of experience with these models as i didn't deal with them in many problems so it was like a trials to discover what is best for the data.


## Final model

I chose SGDClassifier as it produced the highst accuarcy and started to tuning hyper parameters in trials, then i test it with classification report that show th precision, recall,  f1-score and accuarcy for the model.


## Saving the model

I used Pickle to save my model after training it on all dataset, so i can reload it fast in final API.


## RESTful API

I used Flask API to create a RESTful API for the model. The Model not recompiled or trained each request, it just predicts the given data in request.
I used Postman to test my API.


## Limitations of this methodology

The data is impalanced so my model will not work correctly with lower categories, it need more data in these classes.
If i tried to generate data under these classes with methods like: google translate i think it will improve the performance, but i think google translate method is suitable more for descriptive data not our data. 



# Counting and cleaning 

In [2]:
def count_punctuations(text):
    punctuations='''!"#$%&()'*+,-./:;<=>?@[\]^_`{|}~'''
    d=dict()
    for i in punctuations:
        d[str(i)+' count']=text.count(i)
    return d 


def count_stopwords(text):
    stop_words = set(stopwords.words('english'))  
    word_tokens = word_tokenize(text)
    stopwords_x = [w for w in word_tokens if w in stop_words]
    return len(stopwords_x)

def remove_punc(txt):
    punctuations='''!"#$%&()'*+,-./:;<=>?@[\]^_`{|}~'''
    for i in punctuations:
        txt=txt.replace(i,' ')
    return txt

def remove_stopwords(txt):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(txt)
    stopwords_x = [w for w in word_tokens if w not in stop_words]        
    return ' '.join(stopwords_x)

# Read data

In [3]:
df = pd.read_csv(os.path.join('Job titles and industries.csv'))

print("The shape of the dataset is {}.\n\n".format(df.shape))

df

The shape of the dataset is (8586, 2).




Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor - co...,IT
1,senior technical support engineer,IT
2,head of it services,IT
3,js front end engineer,IT
4,network and telephony controller,IT
...,...,...
8581,data entry clerk,Marketing
8582,content creator,Marketing
8583,sales & marketing manager,Marketing
8584,marketing & digital marketing consultant,Marketing


# Data incepection

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8586 entries, 0 to 8585
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job title  8586 non-null   object
 1   industry   8586 non-null   object
dtypes: object(2)
memory usage: 134.3+ KB


In [5]:
# Duplicates detected
df.groupby('job title')['job title'].count().sort_values(ascending=False)

job title
marketing executive                                            91
php developer                                                  54
trainee network technician                                     53
software developer                                             53
marketing manager                                              49
                                                               ..
graduate project manager                                        1
graduate python developer                                       1
graduate required for sen teaching assistant post               1
graduate scheme maths/physics/ computer science/engineering     1
youth mentor                                                    1
Name: job title, Length: 3890, dtype: int64

In [6]:
df.groupby('industry')['industry'].count().sort_values(ascending=False)

industry
IT             4746
Marketing      2031
Education      1435
Accountancy     374
Name: industry, dtype: int64

In [7]:
pd.Series(' '.join(df['job title']).split()).value_counts()

-            2100
developer    1411
engineer     1039
manager       901
marketing     881
             ... 
sales,          1
(admin)         1
france          1
numeracy        1
paddy           1
Length: 2710, dtype: int64

# Cleaning and adding useful features 

In [8]:

def adding_features(df,txt):
    df['punct_count'] = df[txt].apply(lambda x:count_punctuations(x))
    df_punct = pd.DataFrame(list(df.punct_count))
    df = pd.merge(df, df_punct, left_index=True, right_index=True)
    df.drop(columns=['punct_count'],inplace=True)
    
    df['job title'] = df['job title'].apply(lambda x:remove_punc(x))
    
    df['stopword_count'] = df[txt].apply(lambda x:count_stopwords(x))
    df['job title']=df['job title'].apply(lambda x: remove_stopwords(x))
    
    return df


In [9]:
df=adding_features(df,'job title')
df

Unnamed: 0,job title,industry,! count,""" count",# count,$ count,% count,& count,( count,) count,...,\ count,] count,^ count,_ count,` count,{ count,| count,} count,~ count,stopword_count
0,technical support helpdesk supervisor county b...,IT,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,senior technical support engineer,IT,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,head services,IT,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,js front end engineer,IT,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,network telephony controller,IT,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8581,data entry clerk,Marketing,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8582,content creator,Marketing,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8583,sales marketing manager,Marketing,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8584,marketing digital marketing consultant,Marketing,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Job title vectorizer 

In [10]:
vectorizer=  TfidfVectorizer()
vectorizer.fit(df['job title'])
pk.dump(vectorizer, open('vectorizer', 'wb'))
idf_features =  vectorizer.transform(df['job title']).toarray()
idf= pd.DataFrame(idf_features)
final_df = pd.merge(idf,df,left_index=True, right_index=True)
final_df=final_df.drop(['job title'],axis=1)

In [11]:
final_df=final_df.drop_duplicates()
final_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,\ count,] count,^ count,_ count,` count,{ count,| count,} count,~ count,stopword_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


# Try couple of classification models with KFold

In [12]:
models = [ 
      ('SGDClassifier ',SGDClassifier()),
          ('KNeighbors', KNeighborsClassifier()),
          ('DecisionTree', DecisionTreeClassifier()),
          ('RandomForest', RandomForestClassifier()),
          ('ExtraTree ',ExtraTreesClassifier()),
          ('MultinomialNB ',MultinomialNB()),
        
         ]

In [13]:
cv = KFold(n_splits=5, random_state=1, shuffle=True) 


#build multiple clasifiers model
X = final_df.drop(['industry'],axis=1)
y = final_df['industry']


for i in models:
    scores = cross_val_score(i[1], X, y,cv=cv, n_jobs=-1)
    print(f'{i[0]}:  scores= {scores}, scores mean= {np.mean(scores)}')
    print('\n')


SGDClassifier :  scores= [0.86658196 0.88167939 0.88549618 0.87659033 0.88167939], scores mean= 0.8784054498837666


KNeighbors:  scores= [0.78144854 0.77862595 0.77862595 0.76590331 0.78498728], scores mean= 0.7779182064786883


DecisionTree:  scores= [0.78907243 0.80279898 0.8129771  0.83333333 0.80534351], scores mean= 0.8087050706292779


RandomForest:  scores= [0.84243964 0.84860051 0.85877863 0.86513995 0.85496183], scores mean= 0.8539841120498173


ExtraTree :  scores= [0.85260483 0.85877863 0.86259542 0.86641221 0.85877863], scores mean= 0.8598339427917399


MultinomialNB :  scores= [0.8360864  0.8346056  0.84096692 0.8346056  0.84605598], scores mean= 0.8384641001516371




# Choosing the best model and test it

In [14]:
model =SGDClassifier(class_weight='balanced',max_iter=1000,learning_rate='adaptive',eta0=.1)
train,test,y,y_test = train_test_split(final_df,final_df['industry'])
train=train.drop(['industry'],axis=1)
test=test.drop(['industry'],axis=1)
model.fit(train,y)
print('Accuracy on training set: {:.5f}'
     .format(model.score(train, y)))
print('Accuracy on test set: {:.5f}'
     .format(model.score(test, y_test)))

Accuracy on training set: 0.95319
Accuracy on test set: 0.87589


In [15]:
print("classification report for Training")
print('\n')
print(classification_report(model.predict(train),y))

classification report for Training


              precision    recall  f1-score   support

 Accountancy       0.98      0.93      0.96       214
   Education       0.94      0.98      0.96       673
          IT       0.96      0.96      0.96      1145
   Marketing       0.95      0.93      0.94       916

    accuracy                           0.95      2948
   macro avg       0.96      0.95      0.95      2948
weighted avg       0.95      0.95      0.95      2948



In [16]:
print("classification report for Test")
print('\n')
print(classification_report(model.predict(test),y_test))

classification report for Test


              precision    recall  f1-score   support

 Accountancy       0.81      0.84      0.82        55
   Education       0.87      0.89      0.88       253
          IT       0.90      0.91      0.90       374
   Marketing       0.86      0.84      0.85       301

    accuracy                           0.88       983
   macro avg       0.86      0.87      0.86       983
weighted avg       0.88      0.88      0.88       983



# Train the best model on all data set  

In [17]:
model =SGDClassifier(class_weight='balanced',max_iter=1000,learning_rate='adaptive',eta0=.1)
model.fit(final_df.drop(['industry'],axis=1),final_df['industry'])

SGDClassifier(class_weight='balanced', eta0=0.1, learning_rate='adaptive')

# Save the model with pickle

In [18]:
pk.dump(model, open('SGDClassifier', 'wb'))

# Function to re-process input data

In [19]:
def pre_process_input(txt):
    txt=txt.lower()
    arr=[txt]
    entry=pd.DataFrame(arr, columns=['job title'])
    final_txt=adding_features(entry,'job title')
    
    idf_features=vectorizer.transform(final_txt['job title']).toarray()
    idf= pd.DataFrame(idf_features)
    
    final_txt = pd.merge(idf,final_txt,left_index=True, right_index=True)
    final_txt=final_txt.drop(['job title'],axis=1)
    return final_txt

# Test the model on user input

In [20]:
inp='Senior Asp Net Core / Blazor Full Stack Developer'
final_txt=pre_process_input(inp)
saved_model = pk.load(open('SGDClassifier', 'rb'))
saved_model.predict(final_txt)[0]


'IT'

# RESTful API service with Flask

In [21]:
# Using postman
app = Flask(__name__) 
app.debug = True

desc=[]
saved_model = pk.load(open('SGDClassifier', 'rb'))

app = Flask(__name__) 


@app.route('/', methods=['POST'])
def addOne():
    txt=request.data
    final_txt=pre_process_input(str(txt))
    predict=saved_model.predict(final_txt)[0]
    return predict

if __name__ == '__main__':
    from werkzeug.serving import run_simple
    run_simple('localhost', 8080, app)

 * Running on http://localhost:8080/ (Press CTRL+C to quit)
127.0.0.1 - - [20/Sep/2021 17:52:12] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [20/Sep/2021 17:52:18] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [20/Sep/2021 17:52:23] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [20/Sep/2021 17:52:32] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [20/Sep/2021 17:52:39] "POST / HTTP/1.1" 200 -
