In [1]:
import pandas as pd
import numpy  as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import spacy
import re
import time

In [2]:
data_origin = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

In [3]:
input_train,input_test,output_train,output_test = train_test_split(data_origin.description,data_origin.jobflag,test_size=0.2,random_state=0,stratify=data_origin.jobflag)

In [6]:
regexp = re.compile('(?u)\\b\\w\\w+\\b')
en_nlp = spacy.load('en')
old_tokenizer = en_nlp.tokenizer
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))

In [7]:
def custom_tokenizer(document):
    doc_spacy = en_nlp(document)
    return [token.lemma_ for token in doc_spacy]

In [8]:
count_vect = CountVectorizer(tokenizer=custom_tokenizer,min_df=10,stop_words="english")
X_train_counts = count_vect.fit_transform(input_train)

from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  after removing the cwd from sys.path.


In [10]:
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [11]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [12]:
X_train_tfidf

<2344x454 sparse matrix of type '<class 'numpy.float64'>'
	with 17368 stored elements in Compressed Sparse Row format>

In [13]:
params ={
    "C":np.arange(0.1,2,0.05),
    "kernel":["linear"]
}
grid = GridSearchCV(SVC(),params,cv=6,n_jobs=6)

In [14]:
start = time.time()
grid.fit(X_train_tfidf,output_train)
print("time",time.time()-start)

time 29.60721516609192


In [15]:
grid.best_params_

{'C': 0.7500000000000002, 'kernel': 'linear'}

In [18]:
pred = grid.predict(X_train_tfidf)
print(classification_report(output_train,pred))
X_test_counts = count_vect.transform(input_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
pred = grid.predict(X_test_tfidf)
print(classification_report(output_test,pred))

             precision    recall  f1-score   support

          1       0.73      0.70      0.71       499
          2       0.84      0.37      0.52       278
          3       0.71      0.93      0.81      1101
          4       0.77      0.51      0.62       466

avg / total       0.74      0.73      0.71      2344



from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  after removing the cwd from sys.path.


             precision    recall  f1-score   support

          1       0.65      0.58      0.61       125
          2       0.60      0.30      0.40        70
          3       0.66      0.90      0.76       275
          4       0.66      0.37      0.47       117

avg / total       0.65      0.65      0.63       587



In [17]:
pred = grid.predict(input_test)
print(classification_report(output_test,pred))

             precision    recall  f1-score   support

          1       0.64      0.56      0.60       125
          2       0.63      0.37      0.47        70
          3       0.66      0.84      0.74       275
          4       0.63      0.47      0.54       117

avg / total       0.65      0.65      0.64       587



In [18]:
X_test_counts = count_vect.transform(data_test.description)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  after removing the cwd from sys.path.


In [19]:
pred_sub = grid.predict(X_test_tfidf)

In [20]:
data_test['pre'] = pred_sub

In [21]:
data_test.to_csv('submit_svm4.csv',columns=['id','pre'],header=False,index=False)