In [1]:
import spacy
nlp = spacy.load('en')

In [2]:
import gensim
import textacy

In [3]:
import pandas as pd
import numpy as np

In [4]:
df=pd.read_csv("https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/research_paper.csv")

In [5]:
df.head()

Unnamed: 0,Title,Conference
0,Innovation in Database Management: Computer Sc...,VLDB
1,High performance prime field multiplication fo...,ISCAS
2,enchanted scissors: a scissor interface for su...,SIGGRAPH
3,Detection of channel degradation attack by Int...,INFOCOM
4,Pinning a Complex Network through the Betweenn...,ISCAS


In [6]:
df['Conference'].value_counts()

ISCAS       864
INFOCOM     515
VLDB        423
WWW         379
SIGGRAPH    326
Name: Conference, dtype: int64

In [7]:
X_temp=df['Title'].values
y=df['Conference'].values

In [8]:
def sent_tokenize(sent):
    temp=nlp(sent)
    return [str(token) for token in temp if not token.is_stop] 

In [9]:
temp_sentences=[textacy.preprocess_text(str(sent), lowercase=True, no_punct=True) for sent in X_temp] 

In [10]:
X=[sent_tokenize(sent) for sent in temp_sentences]

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

In [15]:
loc="./storage/cc.en.300.bin"

In [16]:
from gensim.models.wrappers import FastText

model = FastText.load_fasttext_format(loc)



In [21]:
def get_embedding(word):
    try:
        embedding=model[word]
    except:
        embedding=np.zeros((300,))
    return embedding

In [22]:
X_train_embeddings=[np.mean(np.array(list(map(get_embedding,tok_sent))),axis=0) for tok_sent in X_train]
X_test_embeddings=[np.mean(np.array(list(map(get_embedding,tok_sent))),axis=0) for tok_sent in X_test]

In [23]:
from sklearn.svm import SVC

In [47]:
clf=SVC(kernel='linear',C=3)

In [48]:
clf.fit(X_train_embeddings,y_train)

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [49]:
predictions=clf.predict(X_test_embeddings)

In [50]:
from sklearn.metrics import accuracy_score

In [51]:
accuracy_score(y_test,predictions)

0.7941567065073041

In [52]:
from sklearn import metrics
print(metrics.classification_report(y_test, predictions, 
                                    target_names=df['Conference'].unique()))

             precision    recall  f1-score   support

       VLDB       0.80      0.77      0.79       155
      ISCAS       0.84      0.90      0.87       259
   SIGGRAPH       0.73      0.72      0.73        98
    INFOCOM       0.76      0.78      0.77       127
        WWW       0.77      0.66      0.71       114

avg / total       0.79      0.79      0.79       753



In [81]:
clf1=SVC(kernel='rbf',C=5,gamma=3)

In [82]:
clf1.fit(X_train_embeddings,y_train)

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=3, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [83]:
predictions=clf1.predict(X_test_embeddings)

In [84]:
accuracy_score(y_test,predictions)

0.8007968127490039

In [85]:
from sklearn.model_selection import GridSearchCV

In [86]:
params={'C':[0.01,0.1,1,10],'gamma':[0.01,0.1,1,5,10]}
clf_test=SVC(kernel='rbf')

In [88]:
cv=GridSearchCV(estimator=clf_test,param_grid=params,scoring='accuracy',cv=4,verbose=True)

In [89]:
cv.fit(X_train_embeddings,y_train)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  2.0min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10], 'gamma': [0.01, 0.1, 1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=True)

In [99]:
pd.DataFrame(cv.cv_results_).sort_values('mean_test_score',ascending=False).head()



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_gamma,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
16,0.493588,0.142022,0.784493,0.850816,10,0.1,"{'C': 10, 'gamma': 0.1}",1,0.775,0.848554,0.783105,0.847264,0.792237,0.851064,0.787671,0.856383,0.003065,0.001013,0.006371,0.003492
12,0.54935,0.151442,0.77423,0.851956,1,1.0,"{'C': 1, 'gamma': 1}",2,0.770455,0.848554,0.771689,0.848784,0.785388,0.851064,0.769406,0.859422,0.003657,0.000614,0.006488,0.004421
17,0.412029,0.129017,0.77423,0.962371,10,1.0,"{'C': 10, 'gamma': 1}",2,0.781818,0.959665,0.760274,0.967325,0.789954,0.963526,0.76484,0.958967,0.008224,0.002957,0.012118,0.003346
18,0.879105,0.167835,0.751425,0.99962,10,5.0,"{'C': 10, 'gamma': 5}",4,0.772727,0.999239,0.712329,1.0,0.771689,0.99924,0.748858,1.0,0.003745,0.00165,0.024489,0.00038
13,0.674312,0.166403,0.748005,0.943748,1,5.0,"{'C': 1, 'gamma': 5}",5,0.761364,0.945967,0.728311,0.942249,0.76484,0.946049,0.737443,0.940729,0.002718,0.000894,0.0155,0.002322


In [94]:
cv.best_params_

{'C': 10, 'gamma': 0.1}

In [95]:
model=cv.best_estimator_

In [97]:
predictions=model.predict(X_test_embeddings)

In [98]:
accuracy_score(y_test,predictions)

0.7901726427622842

In [100]:
from sklearn import metrics
print(metrics.classification_report(y_test, predictions, 
                                    target_names=df['Conference'].unique()))

             precision    recall  f1-score   support

       VLDB       0.80      0.77      0.79       155
      ISCAS       0.83      0.90      0.86       259
   SIGGRAPH       0.75      0.71      0.73        98
    INFOCOM       0.73      0.78      0.75       127
        WWW       0.77      0.66      0.71       114

avg / total       0.79      0.79      0.79       753

