In [119]:
# Lexical Features - features from words, without "context".
# Example -> Parts of Speech (POS)
# Named Entities
# Syntactic dependencies - relation between words


import pandas as pd
import numpy as np
PATH = 'data/'

# Read the CSV, draw a seeded 5000-row sample, and keep only the columns
# used downstream, all in one chained expression.
df = (
    pd.read_csv(PATH + 'data.csv')
    .sample(5000, random_state=42)
    .loc[:, ['id', 'text', 'target']]
)

# Per-column count of missing values.
print(df.isnull().sum())


import spacy
nlp = spacy.load("en_core_web_sm")

# Parse every text exactly once and keep the resulting Doc objects on the frame.
# nlp.pipe streams the texts through the pipeline in batches, which spaCy
# documents as the efficient alternative to calling nlp() on each row.
# Docs are yielded in input order, so the list aligns positionally with df.
df['spacy_doc'] = list(nlp.pipe(df['text']))


def POS_count(x):
    """Return, for one row, the share of tokens carrying each tracked POS tag.

    Args:
        x: A mapping-like row (e.g. the row passed by ``df.apply(..., axis=1)``)
            whose ``'spacy_doc'`` entry is an iterable of tokens exposing ``pos_``.

    Returns:
        A 13-tuple of floats, each equal to (tokens with the tag) / (all tokens),
        in this order: PROPN, NOUN, PRON, NUM, VERB, SYM, ADP, ADV, PUNCT, ADJ,
        AUX, CONJ+CCONJ, DET. A doc with no tokens yields all zeros instead of
        raising ``ZeroDivisionError``.
    """
    # One entry per output slot; 'CONJ' and 'CCONJ' are pooled into one slot.
    tag_groups = (
        ('PROPN',), ('NOUN',), ('PRON',), ('NUM',), ('VERB',), ('SYM',),
        ('ADP',), ('ADV',), ('PUNCT',), ('ADJ',), ('AUX',),
        ('CONJ', 'CCONJ'), ('DET',),
    )

    tags = [token.pos_ for token in x['spacy_doc']]
    N = len(tags)
    if N == 0:
        # Empty doc: there is nothing to normalise by, so report zero shares.
        return (0.0,) * len(tag_groups)

    # Tally every tag in a single pass instead of re-scanning the doc per tag.
    tally = {}
    for tag in tags:
        tally[tag] = tally.get(tag, 0) + 1

    return tuple(sum(tally.get(tag, 0) for tag in group) / N for group in tag_groups)


def NER_count(x):
    """Return, for one row, per-label entity counts normalised by token count.

    Args:
        x: A mapping-like row (e.g. the row passed by ``df.apply(..., axis=1)``)
            whose ``'spacy_doc'`` entry is iterable over tokens and exposes
            ``ents``, an iterable of entities with a ``label_`` attribute.

    Returns:
        A 14-tuple of floats, each equal to (entities with the label) /
        (number of tokens in the doc), in this order: GPE, ORG, MONEY, PERSON,
        FAC, LOC, DATE, EVENT, PRODUCT, TIME, PERCENT, ORDINAL, CARDINAL, NORP.
        A doc with no tokens yields all zeros instead of raising
        ``ZeroDivisionError``.
    """
    # NOTE(review): the previous version also computed a QUANTITY ratio but never
    # returned it. The 14-value return is kept so existing callers that unpack
    # into 14 columns keep working; confirm whether QUANTITY should be exposed.
    labels = (
        'GPE', 'ORG', 'MONEY', 'PERSON', 'FAC', 'LOC', 'DATE', 'EVENT',
        'PRODUCT', 'TIME', 'PERCENT', 'ORDINAL', 'CARDINAL', 'NORP',
    )

    doc = x['spacy_doc']
    # The denominator is the number of tokens, not the number of entities.
    N = sum(1 for _ in doc)
    if N == 0:
        # Empty doc: there is nothing to normalise by, so report zero ratios.
        return (0.0,) * len(labels)

    # Tally entity labels in a single pass over doc.ents.
    tally = {}
    for ent in doc.ents:
        tally[ent.label_] = tally.get(ent.label_, 0) + 1

    return tuple(tally.get(label, 0) / N for label in labels)

# Column names for the 13 POS ratios, in the order POS_count returns them.
# The names carry no surrounding whitespace: the earlier ' num_cnt',
# ' punct_cnt', ' aux_cnt' and ' det_cnt' had a stray leading space, which made
# them awkward to select by name later on.
pos_columns = ['propn_cnt', 'noun_cnt', 'pron_cnt', 'num_cnt', 'verb_cnt', 'symb_cnt',
               'adp_cnt', 'adv_cnt', 'punct_cnt', 'adj_cnt', 'aux_cnt', 'conj_cnt', 'det_cnt']
# Column names for the 14 entity ratios, in the order NER_count returns them.
ner_columns = ['gpe_cnt', 'org_cnt', 'mon_cnt', 'per_cnt', 'fac_cnt', 'loc_cnt', 'date_cnt',
               'evt_cnt', 'prod_cnt', 'time_cnt', 'perc_cnt', 'ord_cnt', 'card_cnt', 'norp_cnt']

# result_type='expand' spreads each returned tuple across the listed columns.
df[pos_columns] = df.apply(POS_count, axis=1, result_type='expand')
df[ner_columns] = df.apply(NER_count, axis=1, result_type='expand')

print(df.head())
print(df.columns)
#df.drop('text', axis=1)
#df.to_csv(PATH + 'spacy_features.csv')
print(df.shape)
print(df.dtypes)


id           0
text         0
target    1478
dtype: int64
         id                                               text  target  \
10219  8687  Watch This Giant Sinkhole Form on New York Cit...     NaN   
1413   2039  @AlcoholAndMetal + do anything to fix that. Of...     1.0   
8887   4192  absolute drown your Wagyu steak in heaping pil...     NaN   
4505   6403  AD Miles 'Hurricane of Fun : The Making of Wet...     0.0   
8765   3799  @HJudeBoudreaux Start your car with it! Or use...     NaN   

                                               spacy_doc  propn_cnt  noun_cnt  \
10219  (Watch, This, Giant, Sinkhole, Form, on, New, ...   0.636364  0.090909   
1413   (@AlcoholAndMetal, +, do, anything, to, fix, t...   0.038462  0.115385   
8887   (absolute, drown, your, Wagyu, steak, in, heap...   0.307692  0.230769   
4505   (AD, Miles, ', Hurricane, of, Fun, :, The, Mak...   0.428571  0.071429   
8765   (@HJudeBoudreaux, Start, your, car, with, it, ...   0.000000  0.117647   

       pro

In [126]:
# Disable column-width truncation so the full text column is visible.
# None is the supported "no limit" value; passing -1 is deprecated and warns.
pd.set_option('display.max_colwidth', None)
# Seeded so the same ten target == 0 rows are shown on every re-run.
df[df.target == 0].sample(10, random_state=42)

  """Entry point for launching an IPython kernel.


Unnamed: 0,id,text,target,spacy_doc,propn_cnt,noun_cnt,pron_cnt,num_cnt,verb_cnt,symb_cnt,...,fac_cnt,loc_cnt,date_cnt,evt_cnt,prod_cnt,time_cnt,perc_cnt,ord_cnt,card_cnt,norp_cnt
1092,1577,E-Hutch is da bomb ?? http://t.co/aqmpxzo3V1,0.0,"(E, -, Hutch, is, da, bomb, ?, ?, http://t.co/aqmpxzo3V1)",0.222222,0.222222,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5138,7328,US Navy Sidelines 3 Newest #Subs http://t.co/9WQixGMHfh,0.0,"(US, Navy, Sidelines, 3, Newest, #, Subs, http://t.co/9WQixGMHfh)",0.5,0.25,0.0,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0
2025,2908,Danger of union bears http://t.co/lhdcpNZx6A,0.0,"(Danger, of, union, bears, http://t.co/lhdcpNZx6A)",0.2,0.4,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1616,2334,Ashes 4th Test: 10 Hilarious Twitter Reactions to Australia's collapse http://t.co/6DznEjuVD3 by @Absolut_Sumya15,0.0,"(Ashes, 4th, Test, :, 10, Hilarious, Twitter, Reactions, to, Australia, 's, collapse, http://t.co/6DznEjuVD3, by, @Absolut_Sumya15)",0.4,0.2,0.0,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.0
2018,2896,Thank you @RicharkKirkArch @AusInstArchitect for words of warning re #QueensWharf #Brisbane http://t.co/jMkYWhv7mP via @FinancialReview,0.0,"(Thank, you, @RicharkKirkArch, @AusInstArchitect, for, words, of, warning, re, #, QueensWharf, #, Brisbane, http://t.co/jMkYWhv7mP, via, @FinancialReview)",0.1875,0.25,0.0625,0.0,0.0625,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4890,6962,Bad day,0.0,"(Bad, day)",0.0,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5338,7619,World Class Tgirl Ass 02 - Scene 4 - Pandemonium http://t.co/HzHoa6VZAS,0.0,"(World, Class, Tgirl, Ass, 02, -, Scene, 4, -, Pandemonium, http://t.co/HzHoa6VZAS)",0.454545,0.090909,0.0,0.181818,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0
4640,6597,I presume my timeline will be inundated with 'soggy bottom' &amp; lashings of 'moist' tweets now! :-D,0.0,"(I, presume, my, timeline, will, be, inundated, with, ', soggy, bottom, ', &, amp, ;, lashings, of, ', moist, ', tweets, now, !, :-D)",0.041667,0.208333,0.041667,0.0,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6222,8881,If you wanna smoke cigs that's your own problem but when your breath smells like an old ash tray.. that's fucking disgusting,0.0,"(If, you, wanna, smoke, cigs, that, 's, your, own, problem, but, when, your, breath, smells, like, an, old, ash, tray, .., that, 's, fucking, disgusting)",0.0,0.2,0.04,0.0,0.16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
754,1088,i hate people who tweet 'receipts' but KNOW its wrong\n\nbut they wont take it down bc it 'blew up'\nliterally gtfo you're that desperate,0.0,"(i, hate, people, who, tweet, ', receipts, ', but, KNOW, its, wrong, \n\n, but, they, wo, nt, take, it, down, bc, it, ', blew, up, ', \n, literally, gtfo, you, 're, that, desperate)",0.030303,0.090909,0.181818,0.0,0.181818,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
# Correlation of every numeric feature with the target, strongest positive first.
# Non-numeric columns (e.g. 'text', 'spacy_doc') are dropped explicitly before
# corr(), because recent pandas versions no longer skip them silently.
df.select_dtypes(include='number').corr()['target'].sort_values(ascending=False)

target        1.000000
gpe_cnt       0.242014
adp_cnt       0.183560
noun_cnt      0.175297
loc_cnt       0.140878
 num_cnt      0.108499
card_cnt      0.100191
propn_cnt     0.096527
norp_cnt      0.095876
fac_cnt       0.078438
symb_cnt      0.070534
ord_cnt       0.067297
id            0.059870
adj_cnt       0.049653
org_cnt       0.042840
date_cnt      0.039911
mon_cnt       0.037816
prod_cnt      0.033350
evt_cnt       0.032387
perc_cnt      0.026350
time_cnt      0.021011
per_cnt      -0.008416
 punct_cnt   -0.071999
conj_cnt     -0.098483
verb_cnt     -0.101767
adv_cnt      -0.110985
 aux_cnt     -0.115530
 det_cnt     -0.144649
pron_cnt     -0.273631
Name: target, dtype: float64

In [122]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Only rows with a known target can be used for supervised training.
temp = df[df.target.notna()]
# Features are everything except the label, the identifier, and the raw /
# parsed text columns.
x = temp.drop(['target', 'id', 'text', 'spacy_doc'], axis=1)
y = temp['target']

# The remaining feature columns are numeric ratios, so any gap is filled with
# 0 rather than a string sentinel (which would turn a numeric column into
# mixed-type object data). Reassigning instead of inplace=True avoids mutating
# a frame derived from a filtered view of df.
x = x.fillna(0)
assert not x.isnull().any().any(), "feature matrix still contains missing values"

# Seeded and stratified so the split (and therefore the reported F1) is
# reproducible and both parts keep the same class balance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

# No text or categorical features are used here; all inputs are numeric.
text_features = []
cat_features = []

model = CatBoostClassifier(depth = 3, eval_metric='F1')
model.fit(x_train, y_train, eval_set=(x_test, y_test), cat_features=cat_features, text_features=text_features)


Learning rate set to 0.039599
0:	learn: 0.3752696	test: 0.3440860	best: 0.3440860 (0)	total: 1.96ms	remaining: 1.96s
1:	learn: 0.3846154	test: 0.3516874	best: 0.3516874 (1)	total: 5.16ms	remaining: 2.58s
2:	learn: 0.3863636	test: 0.3591549	best: 0.3591549 (2)	total: 9.82ms	remaining: 3.26s
3:	learn: 0.3793844	test: 0.3546099	best: 0.3591549 (2)	total: 11.7ms	remaining: 2.91s
4:	learn: 0.3817012	test: 0.3552398	best: 0.3591549 (2)	total: 13.4ms	remaining: 2.67s
5:	learn: 0.3828571	test: 0.3487544	best: 0.3591549 (2)	total: 15.2ms	remaining: 2.52s
6:	learn: 0.3819742	test: 0.3487544	best: 0.3591549 (2)	total: 19.8ms	remaining: 2.81s
7:	learn: 0.4216366	test: 0.3897436	best: 0.3897436 (7)	total: 23.8ms	remaining: 2.96s
8:	learn: 0.4310226	test: 0.3952300	best: 0.3952300 (8)	total: 27.6ms	remaining: 3.04s
9:	learn: 0.4339493	test: 0.4000000	best: 0.4000000 (9)	total: 30.9ms	remaining: 3.05s
10:	learn: 0.4296500	test: 0.4000000	best: 0.4000000 (9)	total: 34.3ms	remaining: 3.08s
11:	learn: 0

118:	learn: 0.6158940	test: 0.5955679	best: 0.5955679 (118)	total: 369ms	remaining: 2.73s
119:	learn: 0.6181818	test: 0.5977961	best: 0.5977961 (119)	total: 372ms	remaining: 2.73s
120:	learn: 0.6200441	test: 0.6005510	best: 0.6005510 (120)	total: 375ms	remaining: 2.72s
121:	learn: 0.6204620	test: 0.6024759	best: 0.6024759 (121)	total: 377ms	remaining: 2.71s
122:	learn: 0.6203857	test: 0.5986207	best: 0.6024759 (121)	total: 380ms	remaining: 2.71s
123:	learn: 0.6230050	test: 0.5997249	best: 0.6024759 (121)	total: 386ms	remaining: 2.73s
124:	learn: 0.6234195	test: 0.6005510	best: 0.6024759 (121)	total: 393ms	remaining: 2.75s
125:	learn: 0.6223930	test: 0.6024759	best: 0.6024759 (121)	total: 399ms	remaining: 2.77s
126:	learn: 0.6231487	test: 0.6024759	best: 0.6024759 (121)	total: 403ms	remaining: 2.77s
127:	learn: 0.6245884	test: 0.6033058	best: 0.6033058 (127)	total: 405ms	remaining: 2.76s
128:	learn: 0.6239035	test: 0.6024759	best: 0.6033058 (127)	total: 409ms	remaining: 2.76s
129:	learn

234:	learn: 0.6617100	test: 0.6208719	best: 0.6275033 (195)	total: 739ms	remaining: 2.4s
235:	learn: 0.6617100	test: 0.6190476	best: 0.6275033 (195)	total: 742ms	remaining: 2.4s
236:	learn: 0.6617100	test: 0.6190476	best: 0.6275033 (195)	total: 744ms	remaining: 2.4s
237:	learn: 0.6634820	test: 0.6245059	best: 0.6275033 (195)	total: 749ms	remaining: 2.4s
238:	learn: 0.6631300	test: 0.6253298	best: 0.6275033 (195)	total: 754ms	remaining: 2.4s
239:	learn: 0.6620763	test: 0.6253298	best: 0.6275033 (195)	total: 779ms	remaining: 2.46s
240:	learn: 0.6617179	test: 0.6245059	best: 0.6275033 (195)	total: 782ms	remaining: 2.46s
241:	learn: 0.6624271	test: 0.6226913	best: 0.6275033 (195)	total: 786ms	remaining: 2.46s
242:	learn: 0.6617179	test: 0.6226913	best: 0.6275033 (195)	total: 790ms	remaining: 2.46s
243:	learn: 0.6610080	test: 0.6226913	best: 0.6275033 (195)	total: 820ms	remaining: 2.54s
244:	learn: 0.6652587	test: 0.6246719	best: 0.6275033 (195)	total: 824ms	remaining: 2.54s
245:	learn: 0.6

347:	learn: 0.6947040	test: 0.6232073	best: 0.6328125 (273)	total: 1.3s	remaining: 2.44s
348:	learn: 0.6939834	test: 0.6232073	best: 0.6328125 (273)	total: 1.31s	remaining: 2.44s
349:	learn: 0.6939834	test: 0.6232073	best: 0.6328125 (273)	total: 1.31s	remaining: 2.44s
350:	learn: 0.6943435	test: 0.6250000	best: 0.6328125 (273)	total: 1.32s	remaining: 2.44s
351:	learn: 0.6974093	test: 0.6214099	best: 0.6328125 (273)	total: 1.33s	remaining: 2.46s
352:	learn: 0.6970482	test: 0.6232073	best: 0.6328125 (273)	total: 1.34s	remaining: 2.45s
353:	learn: 0.6977226	test: 0.6232073	best: 0.6328125 (273)	total: 1.34s	remaining: 2.45s
354:	learn: 0.6977226	test: 0.6214099	best: 0.6328125 (273)	total: 1.34s	remaining: 2.44s
355:	learn: 0.6977226	test: 0.6222222	best: 0.6328125 (273)	total: 1.35s	remaining: 2.44s
356:	learn: 0.6980839	test: 0.6222222	best: 0.6328125 (273)	total: 1.35s	remaining: 2.43s
357:	learn: 0.7015003	test: 0.6240209	best: 0.6328125 (273)	total: 1.35s	remaining: 2.42s
358:	learn:

487:	learn: 0.7210744	test: 0.6363636	best: 0.6398964 (472)	total: 1.86s	remaining: 1.96s
488:	learn: 0.7217346	test: 0.6363636	best: 0.6398964 (472)	total: 1.87s	remaining: 1.96s
489:	learn: 0.7214470	test: 0.6363636	best: 0.6398964 (472)	total: 1.88s	remaining: 1.96s
490:	learn: 0.7214470	test: 0.6363636	best: 0.6398964 (472)	total: 1.88s	remaining: 1.95s
491:	learn: 0.7221074	test: 0.6363636	best: 0.6398964 (472)	total: 1.89s	remaining: 1.95s
492:	learn: 0.7221074	test: 0.6363636	best: 0.6398964 (472)	total: 1.89s	remaining: 1.95s
493:	learn: 0.7227672	test: 0.6355383	best: 0.6398964 (472)	total: 1.9s	remaining: 1.94s
494:	learn: 0.7217346	test: 0.6345904	best: 0.6398964 (472)	total: 1.91s	remaining: 1.95s
495:	learn: 0.7245478	test: 0.6371912	best: 0.6398964 (472)	total: 1.91s	remaining: 1.94s
496:	learn: 0.7252066	test: 0.6345904	best: 0.6398964 (472)	total: 1.92s	remaining: 1.95s
497:	learn: 0.7262397	test: 0.6363636	best: 0.6398964 (472)	total: 1.93s	remaining: 1.94s
498:	learn:

619:	learn: 0.7457627	test: 0.6358974	best: 0.6423927 (509)	total: 2.44s	remaining: 1.49s
620:	learn: 0.7453799	test: 0.6350832	best: 0.6423927 (509)	total: 2.44s	remaining: 1.49s
621:	learn: 0.7453799	test: 0.6350832	best: 0.6423927 (509)	total: 2.44s	remaining: 1.48s
622:	learn: 0.7453799	test: 0.6350832	best: 0.6423927 (509)	total: 2.44s	remaining: 1.48s
623:	learn: 0.7460236	test: 0.6368286	best: 0.6423927 (509)	total: 2.45s	remaining: 1.48s
624:	learn: 0.7460236	test: 0.6368286	best: 0.6423927 (509)	total: 2.46s	remaining: 1.47s
625:	learn: 0.7460236	test: 0.6368286	best: 0.6423927 (509)	total: 2.46s	remaining: 1.47s
626:	learn: 0.7478172	test: 0.6360153	best: 0.6423927 (509)	total: 2.47s	remaining: 1.47s
627:	learn: 0.7474333	test: 0.6360153	best: 0.6423927 (509)	total: 2.48s	remaining: 1.47s
628:	learn: 0.7478172	test: 0.6342711	best: 0.6423927 (509)	total: 2.48s	remaining: 1.46s
629:	learn: 0.7482014	test: 0.6342711	best: 0.6423927 (509)	total: 2.48s	remaining: 1.46s
630:	learn

756:	learn: 0.7595459	test: 0.6298201	best: 0.6423927 (509)	total: 3s	remaining: 962ms
757:	learn: 0.7601857	test: 0.6298201	best: 0.6423927 (509)	total: 3s	remaining: 958ms
758:	learn: 0.7595459	test: 0.6280566	best: 0.6423927 (509)	total: 3s	remaining: 953ms
759:	learn: 0.7595459	test: 0.6264442	best: 0.6423927 (509)	total: 3s	remaining: 949ms
760:	learn: 0.7595459	test: 0.6264442	best: 0.6423927 (509)	total: 3.01s	remaining: 944ms
761:	learn: 0.7591542	test: 0.6282051	best: 0.6423927 (509)	total: 3.02s	remaining: 942ms
762:	learn: 0.7595459	test: 0.6282051	best: 0.6423927 (509)	total: 3.02s	remaining: 938ms
763:	learn: 0.7595459	test: 0.6264442	best: 0.6423927 (509)	total: 3.03s	remaining: 937ms
764:	learn: 0.7618557	test: 0.6282051	best: 0.6423927 (509)	total: 3.04s	remaining: 933ms
765:	learn: 0.7618557	test: 0.6282051	best: 0.6423927 (509)	total: 3.04s	remaining: 928ms
766:	learn: 0.7624936	test: 0.6290116	best: 0.6423927 (509)	total: 3.04s	remaining: 924ms
767:	learn: 0.7612171	

880:	learn: 0.7725410	test: 0.6342711	best: 0.6423927 (509)	total: 3.52s	remaining: 475ms
881:	learn: 0.7721454	test: 0.6334610	best: 0.6423927 (509)	total: 3.52s	remaining: 471ms
882:	learn: 0.7721454	test: 0.6334610	best: 0.6423927 (509)	total: 3.52s	remaining: 467ms
883:	learn: 0.7721454	test: 0.6334610	best: 0.6423927 (509)	total: 3.52s	remaining: 462ms
884:	learn: 0.7721454	test: 0.6334610	best: 0.6423927 (509)	total: 3.53s	remaining: 459ms
885:	learn: 0.7721454	test: 0.6334610	best: 0.6423927 (509)	total: 3.54s	remaining: 455ms
886:	learn: 0.7727738	test: 0.6326531	best: 0.6423927 (509)	total: 3.55s	remaining: 452ms
887:	learn: 0.7731695	test: 0.6326531	best: 0.6423927 (509)	total: 3.55s	remaining: 448ms
888:	learn: 0.7731695	test: 0.6326531	best: 0.6423927 (509)	total: 3.56s	remaining: 444ms
889:	learn: 0.7731695	test: 0.6326531	best: 0.6423927 (509)	total: 3.56s	remaining: 441ms
890:	learn: 0.7737973	test: 0.6318471	best: 0.6423927 (509)	total: 3.57s	remaining: 436ms
891:	learn

978:	learn: 0.7836317	test: 0.6291560	best: 0.6423927 (509)	total: 3.89s	remaining: 83.4ms
979:	learn: 0.7836317	test: 0.6291560	best: 0.6423927 (509)	total: 3.9s	remaining: 79.6ms
980:	learn: 0.7834101	test: 0.6299616	best: 0.6423927 (509)	total: 3.92s	remaining: 75.8ms
981:	learn: 0.7827869	test: 0.6299616	best: 0.6423927 (509)	total: 3.92s	remaining: 71.8ms
982:	learn: 0.7827869	test: 0.6299616	best: 0.6423927 (509)	total: 3.92s	remaining: 67.8ms
983:	learn: 0.7821630	test: 0.6291560	best: 0.6423927 (509)	total: 3.93s	remaining: 63.8ms
984:	learn: 0.7834101	test: 0.6301020	best: 0.6423927 (509)	total: 3.93s	remaining: 59.8ms
985:	learn: 0.7836317	test: 0.6301020	best: 0.6423927 (509)	total: 3.93s	remaining: 55.8ms
986:	learn: 0.7842536	test: 0.6301020	best: 0.6423927 (509)	total: 3.93s	remaining: 51.8ms
987:	learn: 0.7836317	test: 0.6309068	best: 0.6423927 (509)	total: 3.94s	remaining: 47.8ms
988:	learn: 0.7836317	test: 0.6309068	best: 0.6423927 (509)	total: 3.94s	remaining: 43.8ms


<catboost.core.CatBoostClassifier at 0x130bab9b0>