In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import spacy

#Load data

In [None]:
data =  fetch_20newsgroups(return_X_y=False)

In [None]:
text = data.data
text[0:4]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [None]:
label = data.target
label

array([7, 4, 4, ..., 3, 1, 8])

In [None]:
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
df = pd.DataFrame({'text':text,'label':label})
df.head()

Unnamed: 0,text,label
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


##Data Cleansing
https://github.com/laxmimerit/preprocess_kgptalkie

In [None]:
pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-9rw39ecv
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-9rw39ecv
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-cp36-none-any.whl size=11742 sha256=58a13a5332f8b2b050428b3276dde4eabe9c243171b205d6ce236dd5d70321cc
  Stored in directory: /tmp/pip-ephem-wheel-cache-96uqr1gv/wheels/a8/18/22/90afa4bd43247fb9a75b710a4a3fcd94966c022ce9e3c7d0a6
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
Successfully installed preprocess-kgptalkie-0.1.3


In [None]:
import preprocess_kgptalkie as ps
import re

In [None]:
df.iloc[0,0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [None]:
def get_clean(x):
    x = str(x).lower().replace('\n', ' ').replace('\\', '').replace('_', ' ').replace('-', ' ')
    x = ps.cont_exp(x)
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_rt(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    return x

In [None]:
df['text'] = df['text'].apply(lambda x: get_clean(x))

In [None]:
df.iloc[0,0]

'from wheres my thing subject what car is this nntp posting host rac3wamumdedu organization university of maryland college park lines 15 i was wondering if anyone out there could enlighten me on this car i saw the other day it was a 2 door sports car looked to be from the late 60s early 70s it was called a bricklin the doors were really small in addition the front bumper was separate from the rest of the body this is all i know if anyone can tellme a model name engine specs years of production where this car is made history or whatever info you have on this funky looking car please e mail thanks il brought to you by your neighborhood lerxst'

In [None]:
df.head(3)

Unnamed: 0,text,label
0,from wheres my thing subject what car is this ...,7
1,from guy kuo subject si clock poll final call ...,4
2,from thomas e willis subject pb questions orga...,4


# Text Embedding

## Load pretrained model

### Option 1: Using 3rd party fastetxt model (download)  
https://spacy.io/models

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz  #(spacy support only for text version not binary)
!python -m spacy init-model en myword2vec_model --vectors-loc cc.en.300.vec.gz

--2021-02-23 02:53:36--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2021-02-23 02:53:59 (56.5 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]



###Option 2: Using Spacy pretrained model

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
nlp = spacy.load("en_core_web_sm") #xx_ent_wiki_sm / en_core_web_lg / en_core_web_sm  /  myword2vec_model

#OR
# import en_core_web_sm
# nlp = en_core_web_sm.load()

## Check Load

In [None]:
nlp

<spacy.lang.en.English at 0x7f507b12f160>

In [None]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f507c8b4e80>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f5054d15288>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f5054d152e8>)]

In [None]:
texts = "This is a sentence."

In [None]:
doc = nlp(texts)

In [None]:
doc

This is a sentence.

In [None]:
doc.vector

array([-1.4567294 ,  0.43505168, -0.08386751, -0.68505687,  1.6865177 ,
        1.2449875 ,  2.3560495 , -0.5700079 ,  1.2270511 ,  0.3814322 ,
       -1.1986411 , -1.0562726 , -1.0949914 ,  1.3259035 , -0.997925  ,
       -0.83510274, -1.074462  , -1.1277528 ,  0.10539386, -1.863813  ,
        1.0726984 ,  0.44764823,  0.20754662, -0.04941251, -1.9854825 ,
        0.93800086, -0.35837477, -1.5851486 ,  0.81120443, -0.60871327,
        0.37828287,  2.1253457 , -0.6640552 ,  0.01595545, -0.23018654,
       -1.4106985 ,  1.3323714 , -0.5527895 , -0.712036  , -0.22105941,
        2.537019  , -0.3540569 ,  1.2667272 , -1.3999382 ,  0.24419007,
       -0.1933262 , -0.4649256 , -0.17958884, -1.1775234 ,  2.5717537 ,
        3.7531643 , -1.3961198 ,  0.14241442,  1.0708332 , -1.6227611 ,
        1.0725682 ,  3.325587  , -0.81073636, -2.1329389 ,  0.65571415,
       -1.153394  ,  1.721503  ,  2.8911927 ,  0.26956487,  0.77309257,
       -0.1512557 ,  1.1164484 , -1.0694965 ,  0.09547086, -1.02

In [None]:
len(doc.vector)

96

In [None]:
x = 'king man woman'
doc=nlp(x)
for token1 in doc:
   for token2 in doc:
      print(token1.text,token2.text,token1.similarity(token2))

king king 1.0
king man 0.4750763
king woman 0.44465914
man king 0.4750763
man man 1.0
man woman 0.5597576
woman king 0.44465914
woman man 0.5597576
woman woman 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


## Vectorization

In [None]:
df.head(3)

Unnamed: 0,text,label
0,from wheres my thing subject what car is this ...,7
1,from guy kuo subject si clock poll final call ...,4
2,from thomas e willis subject pb questions orga...,4


In [None]:
x = "This is a sentence."
doc=nlp(x)
vec = list(doc.vector)
pd.DataFrame(vec).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,-1.456729,0.435052,-0.083868,-0.685057,1.686518,1.244987,2.35605,-0.570008,1.227051,0.381432,-1.198641,-1.056273,-1.094991,1.325904,-0.997925,-0.835103,-1.074462,-1.127753,0.105394,-1.863813,1.072698,0.447648,0.207547,-0.049413,-1.985482,0.938001,-0.358375,-1.585149,0.811204,-0.608713,0.378283,2.125346,-0.664055,0.015955,-0.230187,-1.410699,1.332371,-0.55279,-0.712036,-0.221059,...,3.325587,-0.810736,-2.132939,0.655714,-1.153394,1.721503,2.891193,0.269565,0.773093,-0.151256,1.116448,-1.069497,0.095471,-1.023521,-1.361816,-0.160646,-1.283196,-0.757073,-0.603647,-0.105606,0.205019,-0.075575,-0.825418,-1.256503,0.367787,-0.999401,1.048092,-0.588075,-0.545357,-0.237046,0.249406,2.542247,0.052722,0.254,-1.250127,-0.545985,-0.268892,-0.169021,0.958067,2.346785


In [None]:
list_vec=[]
for text in (df['text'].to_list()): 
  doc=nlp(text)
  vec = list(doc.vector)
  list_vec.append(vec)

df_vector = pd.DataFrame(list_vec)

In [None]:
df = pd.concat([df.reset_index(drop=True),df_vector.reset_index(drop=True)],axis='columns')

In [None]:
del df_vector

In [None]:
df.head(3)

Unnamed: 0,text,label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,from wheres my thing subject what car is this ...,7,0.077655,0.197849,0.455609,-0.459338,1.184387,0.00631,0.86798,0.137625,1.049256,1.600392,-0.525589,0.068216,-0.603169,-0.39119,-0.375517,-0.460137,-0.877205,-0.126781,-0.78308,-0.484316,0.539608,-0.053729,-0.703101,-0.145389,-1.084569,0.618235,-0.84531,-0.1467,0.762363,-0.52406,1.176953,0.512581,-0.036895,-0.986884,0.298872,-0.509206,0.483339,-0.653376,...,0.839319,0.449537,-0.219074,0.115656,1.006245,-0.351751,0.718396,0.597777,0.576871,-0.12504,0.657601,-1.686107,-0.288458,0.857688,0.000973,-0.113535,0.460092,-0.119784,-0.174186,0.206024,0.937117,-0.615225,-0.62459,-0.415851,-0.082603,-0.812176,-0.226838,0.457079,0.361726,-0.20176,0.494748,0.654604,-0.07017,0.384152,-0.942579,-0.294797,-0.320777,0.717069,0.843342,0.930658
1,from guy kuo subject si clock poll final call ...,4,0.31853,0.182307,0.007614,0.248199,1.139667,0.397077,0.387349,0.403482,1.116422,1.738893,-0.318881,0.368688,-0.324289,-0.817805,-0.71962,-0.473957,-0.520252,0.528681,-0.279207,-0.500989,0.745663,-0.020518,-0.900491,-0.020902,-0.813311,0.289252,-1.285959,-0.505627,1.344291,-0.99717,1.006735,-0.394003,0.186143,-1.093247,-0.114128,-1.018947,0.933355,-0.600451,...,0.721977,1.052994,0.461421,0.255367,1.333428,-0.411155,0.574862,0.389015,1.006845,-0.191505,1.187805,-1.26234,-0.119141,1.002082,-0.075516,-0.116321,0.618113,-0.141525,-0.412978,-0.462293,0.908971,-0.664411,-0.480926,0.065135,0.034922,-0.909471,-0.330521,-0.055253,0.837253,0.225572,0.236928,-0.10977,-0.210719,0.724598,-0.681809,-0.432472,-0.24767,0.883211,0.571737,0.777733
2,from thomas e willis subject pb questions orga...,4,0.013142,0.485057,0.25083,-0.586312,1.008231,0.035873,0.98305,0.245288,1.199011,1.54029,-0.451316,0.35857,-0.331952,-0.151974,-0.468698,-0.028452,-0.816371,-0.271829,-0.738057,-0.446966,0.308214,0.168768,-0.701461,-0.416298,-0.890137,0.964958,-0.837852,0.070418,0.67306,-0.847352,1.226399,0.172277,-0.498637,-0.917257,0.355845,-0.429293,0.409933,-0.775252,...,0.84139,0.698435,0.280942,-0.063632,1.341766,-0.414044,0.96706,0.263698,0.64285,-0.294584,0.937929,-1.452954,-0.109534,0.666966,-0.159549,0.171083,0.660407,-0.358718,-0.331351,-0.080756,0.771206,-0.37928,-0.745646,-0.622032,-0.42804,-0.805689,-0.032668,0.205862,0.581388,-0.200987,0.531862,0.356894,-0.350803,0.327116,-1.036934,-0.119855,-0.296644,0.767847,0.618551,0.726817


# MODELING

In [None]:
X= df.drop(['text','label'],axis='columns')
y = df['label']

In [None]:
# split train test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X, y,test_size=0.2,stratify=y,random_state=123)
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train,y_train,eval_metric='mlogloss', verbose=False, eval_set=[( X_train, y_train), ( X_test, y_test)], early_stopping_rounds=10,
        #sample_weight=w_array #Used when class_weight imbalance handling applied
        )

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
y_pred = model.predict(X_test)
y_proba_pred = model.predict_proba(X_test)#[:,1]

#evaluasi model
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
print(confusion_matrix(y_test,y_pred))
print('AUC Score : %0.15f' %roc_auc_score(y_test,y_proba_pred,multi_class='ovr'))
print(classification_report(y_test,y_pred))

[[25  2  2  0  0  2  0  1  2  2  4  6  0  8  2 21  5  9  3  2]
 [ 3 18 11 13  8  9 11  4  5  2  4  3  6  4  9  4  1  1  0  1]
 [ 0  4 33 10  6  9 13  3  6  6  5  7  4  2  5  1  2  1  1  0]
 [ 2  4  6 36  9 10 10  6  5  7  4  1  5  4  5  3  1  0  0  0]
 [ 0  4 13 14 25 16  8  5  5  6  2  1  5  1  6  1  3  0  0  0]
 [ 0  6  3  7  9 46 12  1  5  2  4  4  3  4  7  2  0  4  0  0]
 [ 0  3  8  6  2  6 74  3  2  2  1  2  2  1  2  2  1  0  0  0]
 [ 3  3  3  7  6  5  5 20 10 19  9  5  5  0  2  5  7  1  2  2]
 [ 1  2  6  7  7  2  7  7 39  9  4  6  6  2  5  4  2  3  1  0]
 [ 1  1  7  2  3  6  3  3  4 44 24  0  0  2  3  5  1  9  0  1]
 [ 4  1  5  5  1  3  4  3  2 23 46  3  0  3  2  4  7  2  2  0]
 [ 3  2  4  3  0  3  4  6  5  7  4 47  4  3  5  6  6  3  3  1]
 [ 1  3  9  4 12 10  9  8  7  3  6 11 14  3 14  1  0  0  3  0]
 [ 4  6  4  3  3  6  5  6  5  4  3 14  2 31  6  3  3  2  7  2]
 [ 1  5  6  2  2  5  5  2  5 10  5 13  3  8 25  5  6  4  6  1]
 [11  2  0  0  5  3  1  4  3  9  7  6  2  2  2 46  2 13