## Utils

In [1]:
import warnings
warnings.filterwarnings("ignore")                     #Ignoring unnecessory warnings

import numpy as np                                  #for large and multi-dimensional arrays
import pandas as pd                                 #for data manipulation and analysis
import nltk                                         #Natural language processing tool-kit

from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer

from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from gensim.models import Word2Vec                                   #For Word2Vec

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense



## Loading Data Using Pandas:

In [35]:
import pandas as pd
pd.pandas.set_option('display.max_columns',None)
pd.pandas.set_option('display.max_rows',None)
df = pd.read_csv('resume_data2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,cv_txt,score
0,0,"Your Name November 13, 2019 Short description ...",7
1,1,Last Updated on 21st June 2019 Debarghya Das d...,6
2,2,DougHeffernan Passionate Driver EDUCATION Marc...,9
3,3,Jon Snow HOUSE Stark REALM Kingdom of The Nort...,9
4,4,"Experience Sep/2015 Present Job 4 Employer, Co...",0


In [36]:
df.drop(['Unnamed: 0'],inplace=True,axis=1)

### Some Basic Exploratory Data Analysis:

In [37]:
df.head()

Unnamed: 0,cv_txt,score
0,"Your Name November 13, 2019 Short description ...",7
1,Last Updated on 21st June 2019 Debarghya Das d...,6
2,DougHeffernan Passionate Driver EDUCATION Marc...,9
3,Jon Snow HOUSE Stark REALM Kingdom of The Nort...,9
4,"Experience Sep/2015 Present Job 4 Employer, Co...",0


In [38]:
df.shape

(319, 2)

In [39]:
df.isna().sum()

cv_txt    0
score     0
dtype: int64

### Data Preprocessing:

In [43]:
for i in range(1,3):
    print(df['cv_txt'].values[i])
    print('-----------------------------------------------------')

Last Updated on 21st June 2019 Debarghya Das debarghyadas.com| fb.co/dd deedy@fb.com | 607.379.5733 | dd367@cornell.edu EDUCATION CORNELL UNIVERSITY MENG IN COMPUTER SCIENCE Dec 2014 | Ithaca, NY CORNELL UNIVERSITY BS IN COMPUTER SCIENCE May 2014 | Ithaca, NY College of Engineering Magna Cum Laude Cum. GPA: 3.83 / 4.0 Major GPA: 3.9 / 4.0 LA MARTINIERE FOR BOYS Grad. May 2011| Kolkata, India LINKS Facebook:// dd Github:// deedydas LinkedIn:// debarghyadas YouTube:// DeedyDash007 Twitter:// @debarghya_das Quora:// Debarghya-Das COURSEWORK GRADUATE Advanced Machine Learning Open Source Software Engineering Advanced Interactive Graphics Compilers + Practicum Cloud Computing Evolutionary Computation Defending Computer Networks Machine Learning UNDERGRADUATE Information Retrieval Operating Systems Artificial Intelligence + Practicum Functional Programming Computer Graphics + Practicum (Research Asst. & Teaching Asst 2x) Unix Tools and Scripting SKILLS PROGRAMMING Over 5000 lines: Java • She

In [46]:
df.isna().sum()

cv_txt    0
score     0
dtype: int64

In [47]:
df_x = df['cv_txt']
df_y = df['score']

In [48]:
stop_words = set(stopwords.words('english'))
len(stop_words) #finding stop words

179

In [49]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
snow = nltk.stem.SnowballStemmer('english')

corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['cv_txt'][i])
    review = review.lower()
    review = review.split()
    
    review = [snow.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [50]:
corpus[1]

'last updat st june debarghya das debarghyada com fb co dd deedi fb com dd cornel edu educ cornel univers meng comput scienc dec ithaca ny cornel univers bs comput scienc may ithaca ny colleg engin magna cum laud cum gpa major gpa la martinier boy grad may kolkata india link facebook dd github deedyda linkedin debarghyada youtub deedydash twitter debarghya das quora debarghya das coursework graduat advanc machin learn open sourc softwar engin advanc interact graphic compil practicum cloud comput evolutionari comput defend comput network machin learn undergradu inform retriev oper system artifici intellig practicum function program comput graphic practicum research asst teach asst x unix tool script skill program line java shell python javascript ocaml matlab rail latex line c c css php assembl familiar io android mysql experi facebook softwar engin jan present new york ny coursera kpcb fellow softwar engin intern june sep mountain view ca applic chosen kpcb fellow led ship yoda admin i

In [51]:
df_x = corpus

In [52]:
type(df_x)

list

In [53]:
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
type(onehot_repr)

list

In [54]:
sent_length=400
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 1730 4180   32]
 [   0    0    0 ... 1055 2683 2402]
 [   0    0    0 ... 4087  530  616]
 ...
 [   0    0    0 ... 1551  244 4276]
 [1645 1314 1686 ...  244  244 4276]
 [   0    0    0 ... 1647  244 4276]]


In [28]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 40)           200000    
_________________________________________________________________
dropout (Dropout)            (None, 400, 40)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [55]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(df_y)

### Train Test Split:

In [56]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

In [35]:
#we are feeding the 
model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=100,batch_size=64)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x282f316fe10>

In [36]:
y_pred_lstm = model.predict_classes(X_test)

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,y_pred_lstm)

0.109375

### TF-IDF:

In [57]:
final_tf = df_x
tf_idf = TfidfVectorizer(ngram_range=(1,2),max_features=5000)
tf_data = tf_idf.fit_transform(final_tf)
print(tf_data[1])

  (0, 3477)	0.03451900530181986
  (0, 4644)	0.04553529853589914
  (0, 2397)	0.08365062460886882
  (0, 3075)	0.04720353200871461
  (0, 4332)	0.04412482723197255
  (0, 1253)	0.04720353200871461
  (0, 1207)	0.030508993769107727
  (0, 3178)	0.04412482723197255
  (0, 2677)	0.04412482723197255
  (0, 1820)	0.0882496544639451
  (0, 4786)	0.09440706401742922
  (0, 2804)	0.09440706401742922
  (0, 1490)	0.0882496544639451
  (0, 4989)	0.04412482723197255
  (0, 2897)	0.03998918509561814
  (0, 2237)	0.04086126865526724
  (0, 3421)	0.03919303518245176
  (0, 3977)	0.03324019025330949
  (0, 258)	0.037782563878525174
  (0, 2997)	0.02305628441298117
  (0, 741)	0.04086126865526724
  (0, 4023)	0.10680458462718473
  (0, 4063)	0.04553529853589914
  (0, 2990)	0.06814515529412091
  (0, 2547)	0.06648038050661897
  :	:
  (0, 1000)	0.07997837019123628
  (0, 2553)	0.035483048950987035
  (0, 1482)	0.08648703679197389
  (0, 775)	0.013359417434252
  (0, 2676)	0.10327832007982579
  (0, 2946)	0.2509518738266065
  (0, 2

In [94]:
pickle.dump(tf_idf, open("tfidf_vectorizer.pkl", "wb"))

In [95]:
v = pickle.load(open('tfidf_vectorizer.pkl','rb'))

In [58]:
tf_data.get_shape()

(319, 5000)

In [59]:
features = tf_idf.get_feature_names()
features[:10]

['aa',
 'ab',
 'abap',
 'abap kolkata',
 'abc',
 'abil',
 'abil quick',
 'abil work',
 'abl',
 'abroad']

In [62]:
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()
df_y1 = encode.fit_transform(df_y)
type(df_y1)

numpy.ndarray

In [60]:
# pd.DataFrame(['df_y1'])

In [63]:
SEED =42
from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val = train_test_split(tf_data,df_y,test_size = 0.2,random_state=SEED)

In [64]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB().fit(x_train, y_train)

y_pred_nb=nb_model.predict(x_val)

In [65]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_pred_nb)

0.09375

### Random Forest:

In [73]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(x_train,y_train)
y_pred_rf = rf_model.predict(x_val)
accuracy_score(y_val,y_pred_rf)

0.125

In [74]:
import pickle
pickle.dump(rf_model,open('rf_score_model.pkl','wb'))

In [90]:
#TESTING WITH SOME DATE
test = 'last updat st june debarghya das debarghyada com fb co dd deedi fb com dd cornel edu educ cornel univers meng comput scienc dec ithaca ny cornel univers bs comput scienc may ithaca ny colleg engin magna cum laud cum gpa major gpa la martinier boy grad may kolkata india link facebook dd github deedyda linkedin debarghyada youtub deedydash twitter debarghya das quora debarghya das coursework graduat advanc machin learn open sourc softwar engin advanc interact graphic compil practicum cloud comput evolutionari comput defend comput network machin learn undergradu inform retriev oper system artifici intellig practicum function program comput graphic practicum research asst teach asst x unix tool script skill program line java shell python javascript ocaml matlab rail latex line c c css php assembl familiar io android mysql experi facebook softwar engin jan present new york ny coursera kpcb fellow softwar engin intern june sep mountain view ca applic chosen kpcb fellow led ship yoda admin interfac new phoenix platform full stack develop wrote review code js use backbon jade stylus requir scala use play googl softwar engin intern may aug mountain view ca work youtub caption team javascript python plan design develop full stack add edit automat speech recognit caption product creat backbon js like framework caption editor phabric open sourc contributor team leader jan may palo alto ca ithaca ny phabric use daili facebook dropbox quora asana creat meme generat php shell led team mit cornel ic london uhelsinki project research cornel robot learn lab research jan jan ithaca ny work ashesh jain prof ashutosh saxena creat planit tool learn larg scale user prefer feedback plan robot trajectori human environ cornel phonet lab head undergradu research mar may ithaca ny led develop quicktongu first ever breakthrough tongu control game prof sam tilsen aid linguist research award top kpcb engin fellow st microsoft code competit cornel nation jump trade challeng finalist th cs cach race bot tournament nd cs biannual intra class bot tournament nation indian nation mathemat olympiad inmo finalist public jain das saxena planit crowdsourc approach learn plan path larg scale prefer feedback tech report icra press tilsen das b mckee real time articulatori biofeedback electromagnet articulographi linguist vanguard press'
snow = nltk.stem.SnowballStemmer('english')

corpus_test = []
# for i in range(0, len(df)):
review = re.sub('[^a-zA-Z]', ' ', test)
review = review.lower()
review = review.split()
    
review = [snow.stem(word) for word in review if not word in stopwords.words('english')]
review = ' '.join(review)
corpus_test.append(review)

In [96]:
final_tf_test = corpus_test
# tf_idf_test = TfidfVectorizer(ngram_range=(1,2),max_features=5000)
tf_data_test = v.transform(final_tf_test)
tf_data_test.get_shape()

(1, 5000)

In [97]:
loaded_model = pickle.load(open('rf_score_model.pkl', 'rb'))
result = loaded_model.predict(tf_data_test)
print(result)

[6]


### rf with rscv:

In [45]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]



random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

cls_rf_rscv = RandomForestClassifier()
from sklearn.model_selection import RandomizedSearchCV

random_search_rf = RandomizedSearchCV(cls_rf_rscv, random_grid,n_iter=5, n_jobs=1, cv=5,verbose=2)
random_search_rf.fit(x_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False, total=   6.0s
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.9s remaining:    0.0s


[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False, total=   6.1s
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False 
[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False, total=   7.5s
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False 
[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False, total=   6.5s
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False 
[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=False, total=   6.4s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=80, bootstrap=True 
[CV]  n

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.1min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=5, n_jobs=1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   verbose=2)

In [46]:
random_search_rf.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [50]:
base_model = RandomForestClassifier(n_estimators= 2000,
                                     min_samples_split= 10,
                                     min_samples_leaf= 1,
                                     max_features= 'sqrt',
                                     max_depth= 90,
                                     bootstrap= True,
                                    random_state = SEED)
base_model.fit(x_train, y_train)

RandomForestClassifier(max_depth=90, max_features='sqrt', min_samples_split=10,
                       n_estimators=2000, random_state=42)

In [51]:
y_pred_rf_rscv = base_model.predict(x_val)

In [52]:
# y_pred_rf_rscv = rf_rscv_model.predict(x_val)
accuracy_score(y_val,y_pred_rf_rscv)

0.078125