In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [2]:
df = pd.read_csv(r"C:\Users\sthri\Downloads\job_title_des.csv")
df

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description
0,0,Flutter Developer,We are looking for hire experts flutter develo...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."
3,3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...
4,4,Full Stack Developer,job responsibility full stack engineer – react...
...,...,...,...
2272,2399,Backend Developer,Job Summary\nPublished on : 26 days ago\nVacan...
2273,2400,Full Stack Developer,business entity cisco umbrella focus cloud-bas...
2274,2401,Network Administrator,Urgently reqd in a college in Mohali\nNetwork ...
2275,2402,Machine Learning,Key Responsibilities: Team leads for small or ...


In [3]:
df.isnull().sum()

Unnamed: 0         0
Job Title          0
Job Description    0
dtype: int64

In [4]:
df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [5]:
df.rename(columns={'Job Title': 'Title', 'Job Description': 'Description'}, inplace=True)


In [6]:
df.head()

Unnamed: 0,Title,Description
0,Flutter Developer,We are looking for hire experts flutter develo...
1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."
3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...
4,Full Stack Developer,job responsibility full stack engineer – react...


In [7]:
df['Title'].value_counts()

Title
JavaScript Developer      166
Java Developer            161
Software Engineer         160
Node js developer         160
iOS Developer             159
PHP Developer             156
Flutter Developer         155
DevOps Engineer           155
Django Developer          152
Machine Learning          152
Backend Developer         147
Network Administrator     145
Database Administrator    139
Full Stack Developer      138
Wordpress Developer       132
Name: count, dtype: int64

### NLTK Preporcessing

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sthri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sthri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sthri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
df.columns

Index(['Title', 'Description'], dtype='object')

In [10]:
data_preprocess = []
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [11]:
def preprocess_data(input, data_preprocess):
    for Description in input:
        lowered_text = Description.lower()
        translator = str.maketrans('', '', string.punctuation)
        cleaned_text = lowered_text.translate(translator)
        tokenized = word_tokenize(cleaned_text)
        stop_words_removed = [word for word in tokenized if word not in stop_words]
        lemmatized = [lemmatizer.lemmatize(word) for word in stop_words_removed]
        data_preprocess.append(' '.join(lemmatized))

In [12]:
input = df['Description']

In [13]:
job = df['Title']

In [14]:
preprocess_data(input, data_preprocess)

In [15]:
data_preprocess[0]

'looking hire expert flutter developer eligible post apply resume job type fulltime parttime salary ₹2000000 ₹4000000 per month benefit flexible schedule food allowance schedule day shift supplemental pay joining bonus overtime pay experience total work 1 year preferred housing rent subsidy yes industry software development work remotely temporarily due covid19'

### Splitting data for train and test

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
X_train,X_test, y_train,y_test = train_test_split(data_preprocess, job, test_size=0.2, random_state=42)

In [18]:
vectorizer = CountVectorizer()

In [19]:
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [20]:
model = LogisticRegression()

In [21]:
model.fit(X_train_vec,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
y_pred = model.predict(X_test_vec)

In [23]:
accuracy_score(y_test,y_pred)

0.7850877192982456

In [24]:
report = classification_report(y_test,y_pred)
print(report)

                        precision    recall  f1-score   support

     Backend Developer       0.48      0.50      0.49        32
Database Administrator       0.96      0.96      0.96        26
       DevOps Engineer       0.92      0.87      0.89        38
      Django Developer       0.80      0.83      0.81        29
     Flutter Developer       0.92      0.97      0.94        35
  Full Stack Developer       0.68      0.65      0.67        23
        Java Developer       0.63      0.58      0.60        38
  JavaScript Developer       0.83      0.77      0.80        39
      Machine Learning       0.87      0.71      0.78        28
 Network Administrator       0.69      1.00      0.82        18
     Node js developer       0.82      0.82      0.82        34
         PHP Developer       0.69      0.92      0.79        26
     Software Engineer       0.65      0.62      0.63        32
   Wordpress Developer       0.90      0.83      0.86        23
         iOS Developer       0.97      

### New Predictions

In [25]:
import numpy as np

In [26]:
input = np.array(['''Mandate Skills - Java 8, Spring boot, Hibernate, Microservices, AWS Lambda

We are expecting a java lead who can be able to lead the team as well as the one who can contribute their individual performance in the project. 

Good Communication Skills.'''])

In [27]:
new_preprocessed = []
preprocess_data(input, new_preprocessed)

In [28]:
new_preprocessed

['mandate skill java 8 spring boot hibernate microservices aws lambda expecting java lead able lead team well one contribute individual performance project good communication skill']

In [29]:
user_input_vec = vectorizer.transform(new_preprocessed)

In [30]:
user_input_vec

<1x17408 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [31]:
model.predict(user_input_vec)

array(['Java Developer'], dtype=object)

### Saving the Model

In [52]:
import pickle

In [53]:
with open('model.pkl', 'wb') as file:
  pickle.dump(model, file)

In [54]:
with open('vectorizer.pkl', 'wb') as file:
  pickle.dump(vectorizer, file)

### Loading the model

In [55]:
with open('model.pkl', 'rb') as file:
  loaded_model = pickle.load(file)

In [56]:
loaded_model.predict(user_input_vec)

array(['Software Engineer'], dtype=object)

In [None]:
#SVM = 72
#logistic = 0.7850877192982456
#random forest =0.756578947368421
#KNN = 0.47368421052631576
#adaboost = 0.20175438596491227