In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the job postings table; later cells read its 'Description' column
# as the input text and its last column as the label.
dataset = pd.read_csv(filepath_or_buffer='IT_jobs.csv')

In [3]:
import re
import nltk

# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant helpers: build the stemmer and the stop-word lookup once
# instead of once per document (and once per word for the set).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is deliberately kept as a token (presumably to preserve negation).
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# Build one cleaned, stemmed, space-joined string per row of the
# 'Description' column. Iterating over the column itself (rather than a
# hard-coded range(0, 10000)) keeps the corpus exactly as long as the
# dataset, so it stays aligned with labels taken from the same frame.
corpus = []
for description in dataset['Description']:
    # Replace every non-letter character with a space, then normalise case.
    review = re.sub('[^a-zA-Z]', ' ', description)
    review = review.lower()
    review = review.split()
    # Drop stop words and reduce the remaining words to their Porter stems.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is capped at 7000 terms and the
# sparse count matrix is expanded to a dense array.
cv = CountVectorizer(max_features=7000)
x = cv.fit_transform(raw_documents=corpus).toarray()

# Labels come from the last column of the dataset.
y = dataset.iloc[:, -1].values

In [5]:
# Number of feature columns in the document-term array.
x.shape[1]

7000

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using Gini impurity as the split criterion, seeded for
# reproducibility and fitted on the training split.
classifier = DecisionTreeClassifier(
    criterion='gini',
    random_state=0,
)
classifier.fit(X=x_train, y=y_train)

DecisionTreeClassifier(random_state=0)

In [8]:
# Predict labels for the held-out rows.
y_pred = classifier.predict(x_test)

# Print predictions (left column) next to the true labels (right column).
print(np.column_stack((y_pred, y_test)))

[['Information Security Analyst' 'Information Security Analyst']
 ['Data Architect' 'Data Architect']
 ['Deep Learning' 'Business Analyst']
 ...
 ['IT Systems Administrator' 'Network Architect']
 ['Data Analyst' 'Business Analyst']
 ['IT Consultant' 'IT Consultant']]


In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Overall accuracy on the test split, printed as a percentage.
accuracy_pct = accuracy_score(y_true=y_test, y_pred=y_pred) * 100
print("\n", accuracy_pct, "%")

[[55  1  0  0  1  1  0  3  1  0  3  0  1  0  0  0  1  0  0  0  5  1  0  1
   1]
 [ 2 21  1  3  0  4  2  2 21  1  2  4  2  0  2  0  0  1  0  0  2  1  1  0
   1]
 [ 0  1 33  3  2  0 16  0  2  1  0  1  0  2  3  3  1  1  0  5  1  0  5  2
   4]
 [ 0  0  5 38  0  0 11  2  5  0  1  5  2  3  1  0  1  0  0  1  5  0  1  0
   1]
 [ 0  1  0  0 63 11  0  3  0  0  0  0  1  0  0  2  4  1  0  2  1  8  0  1
   3]
 [ 2  3  2  0  9 48  1  1  0  0  0  0  0  1  2  1  3  1  1  0  1  3  0  0
   0]
 [ 0  0  5  5  0  0 16  0  2  4  2  1  1  6  2  1  0  0  0  6  3  0  8  2
   1]
 [ 0  6  4  0 10  1  1 31  2  0  0  3  2  0  1  1  1  0  0  0  1  5  0  1
   2]
 [ 1  9  2  2  2  0  2  3 22  2  5  0  2  2  1  0  1  3  4  2  1  0  1  4
   1]
 [ 0  1  7  2  1  0  3  0  1 38  0  1  1  3  0  2  1  1  0  0  0  0  1  4
   4]
 [ 0  6  3  1  2  1  2  3  7  1 32  6  1  4  2  3  0  0  0  1  6  1  2  0
   2]
 [ 3  3  4  1  3  2  1  4  3  1  7 28  0  3  0  0  1  1  1  1  0  1  1  0
   0]
 [ 0  0  0  4  0  0  2  0  4  1  2  2 69

In [10]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged precision, recall and F-score on the test split; left as a
# bare expression so the notebook displays the resulting tuple.
precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred,
    average='macro',
)

(0.529495474194638, 0.5262431608751176, 0.5259472276623565, None)