In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the source CSV into a DataFrame; the path is relative to the
# notebook's working directory.
DATA_PATH = 'IT_jobs.csv'
dataset = pd.read_csv(DATA_PATH)

In [3]:
import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word collection once, outside the loop:
# they do not change from row to row, and rebuilding the set for every
# word made the filtering needlessly slow.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Drop 'not' from the stop-word list so that it is kept in the cleaned text
# instead of being filtered out.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# One cleaned string per row of the 'Description' column, in row order.
# Iterating over the column (rather than a hard-coded range(0, 10000))
# keeps the corpus the same length as the dataset whatever its size.
corpus = []
for description in dataset['Description']:
    # Replace every non-letter with a space, lowercase, and tokenise on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', description).lower().split()
    # Remove stop words, stem what is left, and re-join into a single string.
    stemmed = [ps.stem(word) for word in words if word not in stopword_set]
    corpus.append(' '.join(stemmed))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: keep at most the 7000 most frequent terms in the corpus.
cv = CountVectorizer(max_features = 7000)
# Dense array is kept on purpose: later cells index it with x[0].
x = cv.fit_transform(corpus).toarray()

# Labels come from the last column of the dataset. Take exactly as many
# rows as there are documents in `corpus`, so features and labels stay
# aligned even if the corpus was built from only the first part of the data.
y = dataset.iloc[:len(corpus), -1].values

assert x.shape[0] == len(y), "features and labels must have the same number of rows"

In [5]:
# Number of columns in the feature matrix (i.e. the vocabulary size actually kept).
x.shape[1]

7000

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
TEST_SIZE = 0.2
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_SIZE,
    random_state=SPLIT_SEED,
)

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy split criterion, seeded for repeatable results.
# fit() returns the estimator itself, so construction and training can be chained.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(x_train, y_train)
# Trailing expression so the cell still displays the fitted estimator.
classifier

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [8]:
# Predict labels for the held-out rows, then print them next to the true
# labels: column 0 is the prediction, column 1 is the actual value.
y_pred = classifier.predict(x_test)
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

[['Information Security Analyst' 'Information Security Analyst']
 ['Data Architect' 'Data Architect']
 ['Full Stack Developer' 'Business Analyst']
 ...
 ['IT Systems Administrator' 'Network Architect']
 ['Data Analyst' 'Business Analyst']
 ['IT Consultant' 'IT Consultant']]


In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true labels (first argument) against predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Overall accuracy on the held-out rows, expressed as a percentage.
accuracy_percent = accuracy_score(y_test, y_pred) * 100
print("\n", accuracy_percent, "%")

[[46  1  3  0  1  3  2  2  1  1  3  0  0  2  0  5  1  0  0  0  3  0  0  0
   1]
 [ 2 27  0  1  2  3  1  5 10  2  5  2  1  1  1  1  3  0  1  0  3  0  1  1
   0]
 [ 1  0 36  3  3  0 13  1  0  0  2  3  1  3  1  0  1  2  1  4  1  0  2  4
   4]
 [ 0  0  6 38  1  1  7  0  0  1  3  4  2  7  1  0  0  2  0  2  0  0  2  5
   0]
 [ 0  3  0  0 71 10  0  6  0  0  1  0  0  1  0  1  1  0  0  0  0  6  0  0
   1]
 [ 1  0  0  2 10 42  1  0  1  0  1  1  0  3  0  0  8  1  2  1  2  2  0  0
   1]
 [ 1  1  3  6  0  0 19  1  4  3  2  2  2  5  2  0  0  1  1  6  0  0  6  0
   0]
 [ 3  4  4  2  8  0  1 34  2  0  0  3  2  0  0  1  0  0  0  0  2  4  0  1
   1]
 [ 1  6  0  2  2  2  1  4 14  6  5  3  1  8  1  0  3  0  1  1  3  0  1  5
   2]
 [ 0  1  3  0  0  0  4  0  2 34  4  1  2  2  2  1  0  0  2  1  3  0  5  3
   1]
 [ 1  4  3  1  0  1  4  3  7  1 40  4  0  3  1  1  0  1  0  1  6  0  2  0
   2]
 [ 3  4  1  2  0  1  6  3  4  1  5 31  3  1  0  0  1  0  0  0  1  0  1  0
   1]
 [ 1  2  0  3  0  0  2  0  1  1  1  1 69

In [10]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged (precision, recall, F-score, support) over all classes.
# The result is left as the last expression so the notebook displays the tuple.
macro_scores = precision_recall_fscore_support(y_test, y_pred, average='macro')
macro_scores

(0.5248396948038693, 0.5225241450244157, 0.5224523978256642, None)