In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the job postings table from the working directory. Later cells read its
# 'Description' column as text and use its last column as the class label.
dataset = pd.read_csv(filepath_or_buffer='IT_jobs.csv')

In [3]:
import re
import nltk

# Make sure the NLTK stop-word corpus is available locally before importing it.
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant helpers: the stemmer and stop-word list are built once instead of
# once per row, and the lookup set once instead of once per word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is deliberately removed from the stop-word list so it stays in the text.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# Build one cleaned, stemmed, space-joined string per row of the 'Description' column.
# Every row is processed (instead of a hard-coded first 10000) so the corpus always has
# the same number of rows as the labels that are later taken from `dataset`, and the
# cell no longer fails with a KeyError when the file has fewer rows.
corpus = []
for description in dataset['Description']:
    # Replace every non-letter character with a space, then lowercase and tokenise.
    review = re.sub('[^a-zA-Z]', ' ', description)
    review = review.lower().split()
    # Drop stop words and reduce the remaining tokens to their Porter stems.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Target labels: the last column of the dataset.
y = dataset.iloc[:, -1].values

# Bag-of-words features with the vocabulary capped at 7000 terms. The sparse count
# matrix is converted to a dense array because later cells index it as x[0].
cv = CountVectorizer(max_features=7000)
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()

In [5]:
# Number of feature columns in the dense bag-of-words matrix.
x.shape[1]

7000

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
splits = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = splits

In [10]:
from xgboost import XGBClassifier

# Train a gradient-boosted tree classifier with library defaults on the training split.
# NOTE(review): y_train holds string class names; this ran under xgboost 1.5.0, but
# newer releases may require integer-encoded labels — confirm before upgrading.
classifier = XGBClassifier().fit(x_train, y_train)
# Bare last expression so the notebook displays the fitted estimator's parameters.
classifier





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [11]:
# Predict a label for every held-out row.
y_pred = classifier.predict(x_test)

# Show predictions (left column) next to the true labels (right column).
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

[['Information Security Analyst' 'Information Security Analyst']
 ['Data Architect' 'Data Architect']
 ['Business Analyst' 'Business Analyst']
 ...
 ['Network Architect' 'Network Architect']
 ['Business Analyst' 'Business Analyst']
 ['IT Consultant' 'IT Consultant']]


In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Overall accuracy on the held-out split, expressed as a percentage.
accuracy_pct = accuracy_score(y_test, y_pred) * 100

# Confusion matrix of true labels (first argument) against predictions (second).
cm = confusion_matrix(y_test, y_pred)

print(cm)
print("\n", accuracy_pct, "%")

[[63  0  0  0  0  1  0  1  0  0  3  1  0  0  0  0  1  0  0  0  4  0  0  1
   0]
 [ 1 42  0  1  0  1  0  2 15  0  2  1  2  0  1  1  0  0  0  0  2  1  0  1
   0]
 [ 0  0 56  4  1  0  7  0  0  2  0  1  0  4  2  0  0  2  1  1  0  0  2  2
   1]
 [ 0  0  4 54  0  0  6  2  0  0  1  5  3  4  2  0  0  0  0  0  0  0  0  0
   1]
 [ 1  0  0  1 79  6  0  2  0  0  0  1  0  0  0  0  1  0  1  1  0  8  0  0
   0]
 [ 3  0  0  0  7 56  0  1  0  0  0  1  0  1  2  0  4  1  0  0  1  1  0  1
   0]
 [ 0  0  6  6  0  0 32  0  2  3  4  0  1  2  2  0  0  1  0  3  0  0  2  1
   0]
 [ 0  2  4  1  7  0  1 41  3  1  2  0  1  1  0  0  0  1  0  0  1  6  0  0
   0]
 [ 2  6  0  0  1  3  2  5 27  6  4  0  2  1  0  0  1  0  1  1  3  0  1  4
   2]
 [ 0  0  1  1  0  0  1  1  0 53  0  0  1  2  2  1  0  1  1  1  0  0  2  2
   1]
 [ 1  1  0  2  1  0  2  1  6  1 56  2  0  1  1  1  0  0  0  0  4  0  4  0
   2]
 [ 3  1  0  0  0  0  2  3  0  0  4 56  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  1  0  0  0  0  1  0  2  0  0  1 80

In [13]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged (precision, recall, F-score, support) on the held-out split;
# with average='macro' the support slot is None, as the displayed tuple shows.
macro_scores = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred,
    average='macro',
)
macro_scores

(0.7254266125290891, 0.7259936980414408, 0.7239869542840541, None)

In [9]:
# Prerequisite: run once BEFORE the XGBoost cell, then restart the kernel: %pip install xgboost==1.5.0

Collecting xgboost
  Downloading xgboost-1.5.0-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0
Note: you may need to restart the kernel to use updated packages.
