In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
WORKING_DIR = "/home/salem/resume-classification/database/"
DATASET_FILENAME = WORKING_DIR + "resume_dataset.csv"
SAMPLE_FILENAME = WORKING_DIR + "sample.txt"

### Import Dataset

In [3]:
df = pd.read_csv(DATASET_FILENAME)
X = df.Resume
y = df.Category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [4]:
df

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
164,Testing,Computer Skills: â¢ Proficient in MS office (...
165,Testing,â Willingness to accept the challenges. â ...
166,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
167,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [5]:
set(df.Category)

{'Advocate',
 'Arts',
 'Automation Testing',
 'Blockchain',
 'Business Analyst',
 'Civil Engineer',
 'Data Science',
 'Database',
 'DevOps Engineer',
 'DotNet Developer',
 'ETL Developer',
 'Electrical Engineering',
 'HR',
 'Hadoop',
 'Health and fitness',
 'Java Developer',
 'Mechanical Engineer',
 'Network Security Engineer',
 'Operations Manager',
 'PMO',
 'Python Developer',
 'SAP Developer',
 'Sales',
 'Testing',
 'Web Designing'}

### Pre-processing

In [6]:
# Outputs a matrix of occurences with words as columns
count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(X_train)

# Normalization
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [7]:
print(X_train_counts.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
print(X_train_tfidf.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.09051928 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [9]:
print(count_vect.get_feature_names())

['000', '01', '017', '03', '04th', '05', '050education', '06th', '07', '07education', '08', '09876971076', '0education', '10', '100', '104', '10g', '10magicalfingers', '11', '110', '110v', '11171', '118', '1180', '11g', '11geducation', '11gr2', '11kv', '12', '120', '1200', '125', '12c', '12d', '12k', '13', '132kv', '14', '147', '15', '150', '1500', '16', '163', '16th', '17', '175', '17th', '18', '1800s', '18c', '19', '1900s', '1972', '1986', '1992', '1993', '1995', '1996', '1997', '1998', '1999', '19xx', '1st', '1stexpert', '1x', '1year', '20', '2000', '2000a', '2001', '2002', '2003', '2004', '2005', '2006', '2006education', '2007', '2008', '2008r2', '2008skill', '2009', '200s', '201', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2021', '2026', '20656', '2065864', '20th', '21', '218', '22', '2200', '22k', '22kv', '23', '230', '24', '24q', '24x7', '25', '250', '2600s', '27', '28', '28xx', '29', '2900s', '2960', '29th', '29xx', '2b', '2d', '2fa', '2nd'

In [10]:
print(X_train_counts.shape)

(126, 6671)


### Training...

#### SGD

In [11]:
clf_sgd = SGDClassifier(loss='hinge',
                        penalty='l1',
                        alpha=0.001,
                        random_state=42,
                        max_iter=5,
                        tol=None)
clf_sgd.fit(X_train_tfidf, y_train)

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
              random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
              verbose=0, warm_start=False)

#### MLP

In [12]:
clf_mlp = MLPClassifier(activation='relu',
                        alpha=0.004,
                        hidden_layer_sizes=(400,),
                        max_iter=300,
                        random_state=42,
                        tol=0.0001)
clf_mlp.fit(X_train_tfidf, y_train)

MLPClassifier(activation='relu', alpha=0.004, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(400,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=42, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

### Evaluation

#### SGD

In [13]:
preds_sgd = clf_sgd.predict(count_vect.transform(X_test))
accuracy_sgd = np.mean(preds_sgd==np.array(y_test))

In [14]:
print("Accuracy: {}".format(accuracy_sgd))

Accuracy: 0.813953488372093


#### MLP

In [15]:
preds_mlp = clf_mlp.predict(count_vect.transform(X_test))
accuracy_mlp = np.mean(preds_mlp==np.array(y_test))

In [16]:
print("Accuracy MLP: {}".format(accuracy_mlp))

Accuracy MLP: 0.813953488372093


### Prediction

In [17]:
with open(SAMPLE_FILENAME, 'r') as f:
    resume_sample = f.read()

In [18]:
resume_sample

"HomeAdvisor is looking for a senior-level Developer to join our Software Development team. In general, we are looking for someone who has experience working on public-facing web properties, and being part of a fast-paced, fun-loving team. Our environment can be fun, loose, iterative, and even chaotic at times - you've been warned.\n\nWhat you will be doing:\nCode every day in HTML, CSS, and to make our site highly interactive\nIntegrate client-side code with web services, social platforms, and our Java-based infrastructure\nProduce pixel-perfect layouts and sites from mock-ups provided by key business partners in Creative Design, UX/UI, and Product Development\nWork on progressive redesign and maintenance projects that have an immediate impact on how our customers experience our brand every day\n\nWhy we think you'll like working here:\nHomeAdvisor.com is the industry-leading online marketplace connecting homeowners to pre-qualified home improvement resources in their area - a homeown

In [19]:
X_pred = count_vect.transform([resume_sample])
X_pred = tfidf_transformer.transform(X_pred)

In [20]:
y_pred = clf_sgd.predict(X_pred)

In [21]:
y_pred

array(['Web Designing'], dtype='<U25')