# Feature Engineering in NLP

In [1]:
# import basic library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

## Data Preparation

In [2]:
text_df = pd.read_csv('data/txt/News articles/blaise.txt', header=None, sep=';', names=['text'])
text_df.shape

(197, 1)

In [3]:
text_df['text']

0      Once the network is trained, we can feed in a ...
1      Convolutional neural networks are very general...
2      Modern, sophisticated machine learning techniq...
3      Every machine learning system has parameters —...
4      One technical pitfall to guard against is over...
                             ...                        
192    [21] In this statistic “black” and “white” bot...
193    [22] Racial implicit bias is measured using re...
194              [23] Their subjects are all Chinese men
195    [24] This research, as well as the lack of evi...
196    [25] Malcolm Gladwell’s book Blink popularized...
Name: text, Length: 197, dtype: object

## Create SpaCy English Model

In [4]:
nlp = spacy.load('en_core_web_sm')

# empty list
docs = []
for text in text_df['text']:
    doc = nlp(text)
    docs.append(doc)

## Tokenization

In [5]:
tokens = [token.text for token in docs]
print(tokens[0:5])

['Once the network is trained, we can feed in a photo, and we get out the year in which the system guesses it was taken. For example, for the following two photos ChronoNet guesses 1951 (left) and 1971 (right):', 'Convolutional neural networks are very general and very powerful. As an example, consider Ilya Kostrikov and Tobias Weyand’s ChronoNet, a CNN that guesses the year in which a photo was taken. Since public sources can provide large numbers of digitally archived photos taken over the past century with known dates, it’s relatively straightforward to obtain labeled data (dated photos, in this case) with which to train this network.', 'Modern, sophisticated machine learning techniques like convolutional neural networks (CNNs) have many millions of parameters, hence need a great deal of training data to avoid overfitting. Obtaining enough labelled data to both train and test a system is often the greatest practical challenge facing a machine learning researcher.', 'Every machine le

## Lemmatization

In [6]:
lemmas = [token.lemma_ for doc in docs for token in doc]
len(lemmas)

10242

## Removing non-alpha chars using stopwords

In [7]:
lemmas_alpha = [lemma for lemma in lemmas if lemma.isalpha() or lemma == '-PRON-']

len(lemmas_alpha)

8455

## Stopwords

In [8]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

lemmas_cleaned = [lemma for lemma in lemmas_alpha if lemma not in stopwords]
len(lemmas_cleaned)

4579

## Putting it all together

In [9]:
def preprocess(text):
    doc = nlp(text)
    # lemmas
    lemmas = [token.lemma_ for token in doc]
    lemmas_cleaned = [lemma for lemma in lemmas if lemma.isalpha() \
        and lemma not in stopwords]
    return ' '.join(lemmas_cleaned)

text_df['cleaned'] = text_df['text'].apply(preprocess)
text_df['cleaned']

0      network train feed photo year system guess exa...
1      convolutional neural network general powerful ...
2      modern sophisticated machine learning techniqu...
3      machine learning system parameter learn simple...
4      technical pitfall guard overfitte happen machi...
                             ...                        
192    statistic black white exclude population ident...
193    racial implicit bias measure use reaction time...
194                                  subject chinese man
195    research lack evidence accuracy impression rev...
196    Malcolm Gladwell book Blink popularize idea sn...
Name: cleaned, Length: 197, dtype: object

## Part-of-speech tagging

In [10]:
docs_cleaned = []
for text in text_df['cleaned']:
    doc_cleaned = nlp(text)
    docs_cleaned.append(doc_cleaned)

In [11]:
pos = [(token.text, token.pos_) for doc in docs_cleaned for token in doc]

print(len(pos))
print(pos[:10])

4349
[('network', 'NOUN'), ('train', 'NOUN'), ('feed', 'NOUN'), ('photo', 'NOUN'), ('year', 'NOUN'), ('system', 'NOUN'), ('guess', 'VERB'), ('example', 'NOUN'), ('follow', 'NOUN'), ('photo', 'NOUN')]


## Named entity recognition

In [12]:
ner = [(ent.text, ent.label_) for doc in docs_cleaned for ent in doc.ents]

print(len(ner))
print(ner[:20])

224
[('ChronoNet', 'ORG'), ('convolutional neural network', 'ORG'), ('Ilya Kostrikov Tobias Weyand', 'FAC'), ('CNN', 'ORG'), ('past century', 'DATE'), ('cnn', 'ORG'), ('era pervasive camera', 'ORG'), ('AI', 'GPE'), ('Wu Zhang', 'PERSON'), ('Xiaolin Wu', 'PERSON'), ('Inference Criminality', 'ORG'), ('arXiv', 'NORP'), ('November', 'DATE'), ('Wu Zhang', 'PERSON'), ('rapid development artificial intelligence', 'ORG'), ('new era', 'ORG'), ('algorithm bias', 'PERSON'), ('today', 'DATE'), ('Italians', 'NORP'), ('Italians', 'NORP')]


## Bag of words and n-gram

In [13]:
# import library
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
text_df.head()

Unnamed: 0,text,cleaned
0,"Once the network is trained, we can feed in a ...",network train feed photo year system guess exa...
1,Convolutional neural networks are very general...,convolutional neural network general powerful ...
2,"Modern, sophisticated machine learning techniq...",modern sophisticated machine learning techniqu...
3,Every machine learning system has parameters —...,machine learning system parameter learn simple...
4,One technical pitfall to guard against is over...,technical pitfall guard overfitte happen machi...


### Bag of Words

In [15]:
# bag of words
cvec = CountVectorizer()

text_vec = cvec.fit_transform(text_df['cleaned'])

In [16]:
cvec_df = pd.DataFrame(text_vec.toarray(), columns=cvec.get_feature_names()).add_prefix('cvec_')

cvec_df.head()

Unnamed: 0,cvec_ability,cvec_able,cvec_abnormal,cvec_abolitionist,cvec_absolutely,cvec_abstract,cvec_accelerate,cvec_access,cvec_accomplish,cvec_accord,...,cvec_wrong,cvec_wrongly,cvec_wu,cvec_xi,cvec_xiaolin,cvec_year,cvec_yellow,cvec_yes,cvec_young,cvec_zhang
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### n-grams

In [18]:
# 2-grams
bigrams_model = CountVectorizer(stop_words='english', ngram_range= (2,2))

two_grams = bigrams_model.fit_transform(text_df['cleaned'])

In [19]:
two_grams_df = pd.DataFrame(two_grams.toarray(), columns=bigrams_model.get_feature_names()).add_prefix('two-grams_')

two_grams_df.head()

Unnamed: 0,two-grams_ability read,two-grams_ability recognize,two-grams_ability tend,two-grams_able ask,two-grams_able date,two-grams_able guess,two-grams_able intuitive,two-grams_able learn,two-grams_able memorize,two-grams_able reliably,...,two-grams_zhang claim,two-grams_zhang criminal,two-grams_zhang experiment,two-grams_zhang guess,two-grams_zhang host,two-grams_zhang paper,two-grams_zhang relate,two-grams_zhang result,two-grams_zhang use,two-grams_zhang work
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# most common words
two_grams_df.sum().sort_values(ascending=False).head(10)

two-grams_machine learning      23
two-grams_wu zhang              23
two-grams_non criminal          14
two-grams_machine learn         12
two-grams_deep learning         11
two-grams_untrustworthy face     8
two-grams_facial appearance      8
two-grams_learning technique     7
two-grams_human judge            7
two-grams_face image             6
dtype: int64

In [21]:
# 3-grams
trigrams_model = CountVectorizer(stop_words='english', ngram_range= (3,3))

three_grams = trigrams_model.fit_transform(text_df['cleaned'])

In [22]:
three_grams_df = pd.DataFrame(three_grams.toarray(), columns=trigrams_model.get_feature_names()).add_prefix('three-grams_')

three_grams_df.head()

Unnamed: 0,three-grams_ability read intention,three-grams_ability recognize criminality,three-grams_ability tend creative,three-grams_able ask computer,three-grams_able date photo,three-grams_able guess gender,three-grams_able intuitive sense,three-grams_able learn subtler,three-grams_able memorize right,three-grams_able reliably distinguish,...,three-grams_zhang paper automated,three-grams_zhang paper illustrate,three-grams_zhang paper purport,three-grams_zhang relate work,three-grams_zhang result consistent,three-grams_zhang result exactly,three-grams_zhang use id,three-grams_zhang use label,three-grams_zhang use scare,three-grams_zhang work use
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# most common words
three_grams_df.sum().sort_values(ascending=False).head(10)

three-grams_machine learning technique        5
three-grams_criminal non criminal             5
three-grams_trustworthy untrustworthy face    4
three-grams_convolutional neural network      4
three-grams_image non criminal                3
three-grams_wu zhang criminal                 3
three-grams_wu zhang claim                    3
three-grams_valla et al                       3
three-grams_wu zhang use                      3
three-grams_social perception face            3
dtype: int64

## Tfidf

In [25]:
tfidf = TfidfVectorizer(stop_words='english')

tvec = tfidf.fit_transform(text_df['cleaned'])

tvec_df = pd.DataFrame(tvec.toarray(), columns=tfidf.get_feature_names()).add_prefix('tvec_')

tvec_df.iloc[0].sort_values(ascending=False).head(10)

tvec_guess        0.485483
tvec_photo        0.379790
tvec_feed         0.322695
tvec_follow       0.322695
tvec_leave        0.299310
tvec_chrononet    0.242741
tvec_train        0.242741
tvec_right        0.242741
tvec_network      0.242741
tvec_year         0.219356
Name: 0, dtype: float64