In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset = pd.read_csv('IT_jobs.csv')

In [3]:
import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 10000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Description'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
print(corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 25000)
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

In [6]:
len(x[0])

25000

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [8]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB()

In [9]:
y_pred = classifier.predict(x_test)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[['IT Consultant' 'Information Security Analyst']
 ['Data Architect' 'Data Architect']
 ['Technical Operations' 'Business Analyst']
 ...
 ['Business Intelligence Analyst' 'Network Architect']
 ['Data Warehousing' 'Business Analyst']
 ['IT Consultant' 'IT Consultant']]


In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)
print("\n",accuracy_score(y_test, y_pred) * 100,"%")

[[29  3  0  2  5  2  2  1  1  0  6  5  2  0  0  2  1  0  0  0  9  1  1  2
   1]
 [ 4 25  1  1  2  6  0  6  5  0  1  4  3  2  2  0  3  1  0  0  5  1  0  1
   0]
 [ 1  2 14  7  4  1  6  4  2  2  4  3  4  4  3  1  1  6  0  5  3  2  1  4
   2]
 [ 1  1  9 17  1  1  4  7  2  2  2  5  6  7  1  1  1  1  1  3  1  3  1  2
   2]
 [ 0  0  0  2 37 12  1  8  2  0  2  4  0  4  1  3  4  1  1  3  1 12  0  0
   3]
 [ 1  2  0  1  7 38  1  0  4  1  1  2  0  2  2  0  6  1  0  1  2  4  0  2
   1]
 [ 2  2  6  5  0  1  5  2  5  8  2  4  3  3  3  1  2  0  0  0  2  0  2  3
   4]
 [ 3  7  4  1  5  1  2 18  5  2  2  4  2  1  2  0  2  1  0  6  0  2  0  0
   2]
 [ 1  1  1  1  9  1  4  4 12  3  7  5  3  2  1  1  2  0  1  2  1  5  1  3
   1]
 [ 2  1  1  4  3  1  2  1  2 19  1  1  2  3  3  2  0  3  0  1  0  6  8  4
   1]
 [ 2  2  5  2  3  3  4  3  6  3 23  5  0  5  1  3  1  0  0  0  9  1  1  1
   3]
 [ 3  2  2  4  3  1  3  3  3  2  7 20  2  1  1  0  0  0  0  3  5  1  1  1
   1]
 [ 2  3  0  2  0  0  1  4  4  6  3  1 36

In [11]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_pred, average='macro')

(0.33533998116981706, 0.32915195053364615, 0.32796714665289994, None)