In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the IT jobs CSV into a DataFrame; later cells read its 'Description'
# column (text) and its last column (labels).
dataset = pd.read_csv(filepath_or_buffer='IT_jobs.csv')

In [3]:
import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build one normalised text document per row of dataset['Description'].
#
# Loop-invariant helpers are created once up front instead of once per row;
# previously the stop-word set was even rebuilt for every single word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # 'not' is deliberately kept in the documents
stopword_set = set(all_stopwords)  # set gives constant-time membership tests

corpus = []
# Iterate over the rows that are actually present rather than a hard-coded
# 10000: a shorter file no longer raises KeyError, and a longer one no longer
# leaves the corpus out of step with labels taken from the full dataset.
for description in dataset['Description']:
    review = re.sub('[^a-zA-Z]', ' ', description)  # every non-letter becomes a space
    review = review.lower().split()
    # Drop stop words, then reduce each remaining word to its Porter stem.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    corpus.append(' '.join(review))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is capped at 7000 terms, and the sparse
# count matrix is converted to a dense array (one row per corpus document).
cv = CountVectorizer(max_features = 7000)
x = cv.fit_transform(corpus).toarray()

# Labels come from the last column of the dataset. Slice to len(corpus) so the
# labels always cover exactly the rows that were vectorised, even if the corpus
# was built from only a leading subset of the dataset.
y = dataset.iloc[:len(corpus), -1].values
assert x.shape[0] == len(y), "feature matrix and label vector must have the same number of rows"

In [5]:
# Number of feature columns in the dense document-term array.
x.shape[1]

7000

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier, fitted on the training split.
# The fit call stays as the cell's last expression so the estimator repr is shown.
classifier = SVC(random_state=0, kernel='linear')
classifier.fit(x_train, y_train)

SVC(kernel='linear', random_state=0)

In [8]:
import pickle

# Persist the fitted classifier to disk.
filename = 'finalized_model.sav'
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, so the model is fully on disk before a later cell reloads it.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [9]:
# Predict labels for the held-out rows, then print them side by side with the
# true labels: column 0 is the prediction, column 1 is the actual value.
y_pred = classifier.predict(x_test)
predicted_vs_actual = np.concatenate(
    [y_pred.reshape(-1, 1), y_test.reshape(-1, 1)],
    axis=1,
)
print(predicted_vs_actual)

[['Information Security Analyst' 'Information Security Analyst']
 ['Data Architect' 'Data Architect']
 ['Business Analyst' 'Business Analyst']
 ...
 ['Network Architect' 'Network Architect']
 ['Business Intelligence Analyst' 'Business Analyst']
 ['IT Consultant' 'IT Consultant']]


In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix (rows: true labels, columns: predicted labels) followed by
# the overall accuracy expressed as a percentage.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_percent = accuracy_score(y_test, y_pred) * 100
print("\n", accuracy_percent, "%")

[[44  0  1  0  1  2  1  2  2  0  2  4  0  2  0  1  1  0  0  1  6  0  2  0
   3]
 [ 1 35  0  0  2  1  0  3 19  2  3  1  1  0  0  0  0  0  0  0  1  2  0  0
   2]
 [ 0  0 48  6  3  1  6  0  0  4  0  0  0  2  1  0  0  2  1  1  0  0  2  3
   6]
 [ 2  0  9 49  0  0  4  1  0  2  0  5  1  5  0  1  0  1  0  1  0  0  1  0
   0]
 [ 2  0  0  2 63 13  0  1  0  0  1  0  0  1  0  0  0  1  0  2  0 12  0  0
   3]
 [ 4  1  0  0  6 44  0  0  0  0  0  0  0  2  1  0 11  2  1  1  0  1  0  1
   4]
 [ 0  0  9  8  0  1 20  0  6  4  2  3  3  1  1  0  0  0  1  3  0  0  2  1
   0]
 [ 0  5  4  2  3  3  2 34  6  1  0  1  1  1  2  0  0  0  1  0  1  4  0  1
   0]
 [ 1  5  0  2  1  0  3  3 33  1  1  0  2  1  0  0  2  3  0  0  4  2  0  5
   3]
 [ 0  0  0  0  0  2  7  2  0 36  0  0  1  9  2  0  0  1  1  0  0  2  2  4
   2]
 [ 2  1  1  0  0  0  5  2  7  0 46  4  0  5  0  1  0  0  0  0  7  0  3  0
   2]
 [ 2  2  1  6  0  1  6  3  3  0  6 32  1  2  1  0  0  0  0  1  1  0  1  0
   0]
 [ 1  2  1  1  0  1  5  1  1  2  4  3 57

In [11]:
# Reload the pickled classifier and score it on the held-out split to confirm
# the saved model survives the round trip.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files that this notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:  # closes the handle deterministically
    loaded_model = pickle.load(model_file)
result = loaded_model.score(x_test, y_test)
print(result)

0.601


In [12]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged precision, recall and F-score on the held-out split; left as
# the cell's last expression so the resulting tuple is displayed.
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='macro')

(0.5993354202735821, 0.5934256883683285, 0.5943789272461513, None)