# Text Feature Engineering

In [2]:
import re
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
# nltk.download('wordnet')

### Split the data in a training set to apply transformations

In [4]:
df = pd.read_csv('csv_files/9_1_all_books_df.csv')

In [5]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(df['description'], 
                                                                    df['best_seller'], 
                                                                    test_size = .2,
                                                                    random_state = 42)

In [6]:
# T_train on it's own is a series
type(X_train)

pandas.core.series.Series

# one hot encoding categories
- genres
- format
- publisher
- month (most likely not)

# vectorize the description
- look at html format
- key words
- take out NYT reference

### cleaning html tags
- as well as exploring vectorizing in pandas.

In [7]:
# no_html = df.description[3].replace('<i>', '')
# no_html
df.description[1371]
# this is what we want to act on
# df.description.values

# use regrex to clean the html tags
# no_reg_html = re.sub('<.{1,9}>',' ', df.description[1372])
# no_reg_html
# vec_html = re.sub('<.{1,9}>',' ', df.description.values)

'<b>Nonpareil science writer David Quammen explains how recent discoveries in molecular biology can change our understanding of evolution and life’s history, with powerful implications for human health and even our own human nature. </b>'

#### function to take out the tags

In [8]:
def no_html(text):
    return re.sub('<.{1,9}>',' ', text)

#### function to remove years and numbers in general

In [9]:
def no_nums(text):
    return re.sub('\d+', ' ', text)

#### function to remove punc

In [166]:
def no_punc(text):
    return re.sub('\.|-|\(|\)|\"|,|\?', ' ', text)

In [170]:
X_train[2]

"  an intelligent explanation of the mechanisms that produced the crisis and the response to it   one of the great strengths of tooze's book is to demonstrate the deeply intertwined nature of the european and american financial systems      the new york times book review       from a prizewinning economic historian  an eye-opening reinterpretation of the   economic crisis   "

In [171]:
no_punc(X_train[2])

"  an intelligent explanation of the mechanisms that produced the crisis and the response to it   one of the great strengths of tooze's book is to demonstrate the deeply intertwined nature of the european and american financial systems      the new york times book review       from a prizewinning economic historian  an eye opening reinterpretation of the   economic crisis   "

#### function to remove cases

In [172]:
def no_upper(text):
    return text.lower()

#### method 1 np.vectorize

In [10]:
no_html_v = np.vectorize(no_html)

In [11]:
no_html_v([X_train[1371],df.description[1372] ])

array([' Nonpareil science writer David Quammen explains how recent discoveries in molecular biology can change our understanding of evolution and life’s history, with powerful implications for human health and even our own human nature.  ',
       ' NAMED A BEST BOOK OF 2018 BY  THE NEW YORK TIMES  "Somehow Casey Gerald has pulled off the most urgently political, most deeply personal, and most engagingly spiritual statement of our time by just looking outside his window and inside himself. Extraordinary." - Marlon James "Staccato prose and peripatetic storytelling combine the cadences of the Bible with an urgency reminisc '],
      dtype='<U381')

In [12]:
X_train = no_html_v(X_train)

In [13]:
# some series magic... originally made the function for a dataframe
# X_train["no_html"] = no_html_v(X_train.values)

In [14]:
# X_train = X_train['no_html']

In [153]:
X_train

array(['an exquisite memoir about how to live and love every day with  death in the room   from poet nina riggs  mother of two young sons and the direct descendant of ralph waldo emerson  in the tradition of ',
       "the vision wants to be human  and what's more human than family? so he heads back to the beginning  to the laboratory where ultron created him and molded him into a weapon  the place where he first rebelled against his given destiny and imagined that he could be more -that he could be a man  there  he builds them  a wife  virginia  two teenage twins  viv and vin  they look",
       "  an intelligent explanation of the mechanisms that produced the crisis and the response to it   one of the great strengths of tooze's book is to demonstrate the deeply intertwined nature of the european and american financial systems      the new york times book review       from a prizewinning economic historian  an eye-opening reinterpretation of the   economic crisis   ",
       ...,
    

#### apply numpy function

In [16]:
no_nums_v = np.vectorize(no_nums)

In [17]:
X_train = no_nums_v(X_train)

In [173]:
no_punc_v = np.vectorize(no_punc)

In [174]:
X_train = no_punc_v(X_train)

In [151]:
no_upper_v = np.vectorize(no_upper)

In [152]:
X_train = no_upper_v(X_train)

#### method 2 pandas apply, this doesn't not apply to X_train

In [18]:
# df['no_html_apply'] = df.description.apply(no_html)

# df.head()

## Cleaning text
- punctuation
- lowercase
- lemmas

In [228]:
cv = CountVectorizer(stop_words='english')

In [229]:
X_train

array(['an exquisite memoir about how to live and love every day with death in the room from poet nina riggs mother of two young son and the direct descendant of ralph waldo emerson in the tradition of',
       "the vision want to be human and what's more human than family so he head back to the begin to the laboratory where ultron create him and mold him into a weapon the place where he first rebel against his give destiny and imagine that he could be more that he could be a man there he build them a wife virginia two teenage twin viv and vin they look",
       "an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis",
       ...,
       'a move story of love friendship grief heal and the magical

In [230]:
v_X_train = cv.fit_transform(X_train)

In [84]:
# act on the CountVectorizer object to get feature names
# this is the first past, with numbers and no stop words
len(cv.get_feature_names())
# number of words 7162

7162

In [87]:
# second pass with stop words
len(cv.get_feature_names())
# number of words 6902

6902

In [103]:
# third pass, no numbers and stop words
print(len(cv.get_feature_names()))
# cv.get_feature_names()
# number of words 6799

6799


In [None]:
# fourth pass will use lemmas
print(len(cv.get_feature_names()))
# number of words 5847 

### POS for lemmas

In [21]:
nltk.pos_tag(['5'])[0][1][0]
# ADJ = adjective
# NOUN = noun
# VERB = verb
# ADV = adverb

'C'

In [38]:
lemmatizer = WordNetLemmatizer()
# n = noun
# v = verb
# a = adjective
# r = adverb

In [23]:
def mapping_pos(word):
    if word.isdigit():
        return ' '
    else:
        tag = nltk.pos_tag([word])[0][1][0]
        if tag == 'J':
            return 'a'
        elif tag == 'V':
            return 'v'
        elif tag == 'R':
            return 'r'
        else:
            return 'n'

In [24]:
mapping_pos('5')

' '

In [25]:
lemmatizer.lemmatize('bigger', mapping_pos('bigger'))
lemmatizer.lemmatize('twin', mapping_pos('twin'))

'twin'

In [203]:
kids = ['girls', 'boys',' and']
kids_string = 'girls boys and'
size = ['bigger', 'biggest']
size_string = 'bigger biggest'
def lemmtize_it(sentences):
    # split the string of sentences
    sentence = sentences.split(' ')
    cleaned = [lemmatizer.lemmatize(i, mapping_pos(i)) for i in sentence if i is not '']
    return " ".join(cleaned)



In [204]:
lemmtize_it(size_list)

'big big'

In [197]:
lemmtize_it(X_train[2])
# X_train[0].split(' ')

"an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis"

In [190]:
X_train[0]

'an exquisite memoir about how to live and love every day with  death in the room   from poet nina riggs  mother of two young sons and the direct descendant of ralph waldo emerson  in the tradition of '

In [186]:
[lemmatizer.lemmatize(i, mapping_pos(i)) for i in kids]

['girl', 'boy', ' and']

In [207]:
# doing this processing before transforming with the count vectorizer
# class Nyt_LemmaTokenizer(object):
#     def __call__(self, sentences):
#         sentence = sentences.split(' ')
#         return [lemmatizer.lemmatize(i, mapping_pos(i)) for i in sentence]

In [28]:
# lemmtize_it(X_train[2])

In [29]:
X_train[0]

'An exquisite memoir about how to live--and love--every day with "death in the room," from poet Nina Riggs, mother of two young sons and the direct descendant of Ralph Waldo Emerson, in the tradition of '

In [209]:
v_lemmtize_it =  np.vectorize(lemmtize_it)

In [227]:
X_train = v_lemmtize_it(X_train)

In [237]:
# accompanied
lemmatizer.lemmatize('brilliantly', 'a')
# lemmtize_it('brilliantly')

'brilliantly'

In [214]:
# fourth pass will use lemmas
print(len(cv.get_feature_names()))

6799


In [231]:
# fourth pass will use lemmas
print(len(cv.get_feature_names()))
# 5847 

5847


In [232]:
cv.get_feature_names()

['aaron',
 'ab',
 'abandon',
 'abandonment',
 'abby',
 'abc',
 'abduct',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abou',
 'abraham',
 'abroad',
 'absconds',
 'absence',
 'absolutely',
 'absurd',
 'absurdity',
 'absurdly',
 'abuser',
 'academic',
 'academies',
 'academy',
 'accelerate',
 'accept',
 'acceptance',
 'accepted',
 'access',
 'accident',
 'acclaim',
 'accolade',
 'accompany',
 'accomplish',
 'accord',
 'account',
 'accustom',
 'ace',
 'acevedo',
 'achieve',
 'achievement',
 'acknowledge',
 'acne',
 'acquire',
 'act',
 'action',
 'activism',
 'activist',
 'actor',
 'actress',
 'actually',
 'acutely',
 'ad',
 'ada',
 'adam',
 'adapt',
 'adapts',
 'add',
 'addict',
 'addiction',
 'addictive',
 'address',
 'adhd',
 'adjei',
 'adjust',
 'administration',
 'admiral',
 'admire',
 'admit',
 'adolescence',
 'adolescent',
 'adorable',
 'adrienne',
 'adult',
 'adulthood',
 'adulting',
 'advance',
 'advanced',
 'advancement',
 'adventure',
 'adventurer',
 'adversity',
 'advice',
 '

In [215]:
# feature names before lemmas
cv.get_feature_names()

['aaron',
 'ab',
 'abandon',
 'abandoned',
 'abandonment',
 'abandons',
 'abby',
 'abc',
 'abducted',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abou',
 'abraham',
 'abroad',
 'absconds',
 'absence',
 'absolutely',
 'absurd',
 'absurdities',
 'absurdity',
 'absurdly',
 'abusers',
 'academic',
 'academies',
 'academy',
 'accelerated',
 'accept',
 'acceptance',
 'accepted',
 'access',
 'accident',
 'acclaimed',
 'accolade',
 'accompanied',
 'accomplished',
 'accord',
 'according',
 'account',
 'accustomed',
 'aced',
 'acevedo',
 'achieve',
 'achievement',
 'acknowledged',
 'acne',
 'acquire',
 'act',
 'action',
 'actions',
 'activism',
 'activist',
 'activists',
 'actor',
 'actors',
 'actress',
 'actually',
 'acutely',
 'ad',
 'ada',
 'adam',
 'adams',
 'adapt',
 'adapted',
 'adapts',
 'add',
 'addicted',
 'addiction',
 'addictive',
 'address',
 'addressing',
 'adds',
 'adhd',
 'adjei',
 'adjusting',
 'administration',
 'admiral',
 'admired',
 'admiring',
 'admit',
 'adolescence',
 

## Stop words

In [158]:
# this base of stop words is a frozen set
# text.ENGLISH_STOP_WORDS


# testing out a sentence to vectorize

In [33]:
X_train[0:5]

array(['An exquisite memoir about how to live--and love--every day with "death in the room," from poet Nina Riggs, mother of two young sons and the direct descendant of Ralph Waldo Emerson, in the tradition of ',
       "The Vision wants to be human, and what's more human than family? So he heads back to the beginning, to the laboratory where Ultron created him and molded him into a weapon. The place where he first rebelled against his given destiny and imagined that he could be more -that he could be a man. There, he builds them. A wife, Virginia. Two teenage twins, Viv and Vin. They look",
       ' "An intelligent explanation of the mechanisms that produced the crisis and the response to it...One of the great strengths of Tooze\'s book is to demonstrate the deeply intertwined nature of the European and American financial systems."   --The New York Times Book Review       From a prizewinning economic historian, an eye-opening reinterpretation of the   economic crisis ( ',
       ' In 

In [34]:
type(X_train[2])

numpy.str_

In [35]:
X_train[2]

' "An intelligent explanation of the mechanisms that produced the crisis and the response to it...One of the great strengths of Tooze\'s book is to demonstrate the deeply intertwined nature of the European and American financial systems."   --The New York Times Book Review       From a prizewinning economic historian, an eye-opening reinterpretation of the   economic crisis ( '

In [233]:
X_train[2]

"an intelligent explanation of the mechanism that produce the crisis and the response to it one of the great strength of tooze's book be to demonstrate the deeply intertwine nature of the european and american financial system the new york time book review from a prizewinning economic historian an eye opening reinterpretation of the economic crisis"

In [179]:
X_train[0]

'an exquisite memoir about how to live and love every day with  death in the room   from poet nina riggs  mother of two young sons and the direct descendant of ralph waldo emerson  in the tradition of '

In [205]:
# this doesn't work... need to process charaters before
lemmtize_it(X_train[0])

'an exquisite memoir about how to live and love every day with death in the room from poet nina riggs mother of two young son and the direct descendant of ralph waldo emerson in the tradition of'

In [37]:
re.sub
# X_train[0].replace('.')

<function re.sub(pattern, repl, string, count=0, flags=0)>

In [146]:
def no_punc(text):
    return re.sub('\.|--|\(|\)|\"|,', ' ', text)

def no_upper(text):
    return text.lower()

In [148]:
no_punc(no_upper(X_train[2]))

"  an intelligent explanation of the mechanisms that produced the crisis and the response to it   one of the great strengths of tooze's book is to demonstrate the deeply intertwined nature of the european and american financial systems      the new york times book review       from a prizewinning economic historian  an eye-opening reinterpretation of the   economic crisis   "

## testing lemmas and POS

In [113]:
# accompanied
lemmatizer.lemmatize('accessibility', wordnet.ADJ)

'accessibility'

In [65]:
from nltk.corpus import wordnet as wn

In [112]:

print(wordnet.morphy("accessibility",wordnet.ADJ))

None


In [114]:
for ss in wn.synsets('accessibility'):
    print(ss.lemmas()[0].pertainyms())
    thingy = ss.lemmas()[0].pertainyms()

[]
[]


In [115]:

thingy

[]

In [117]:
for ss in wn.synsets('accessibility'):
    print(ss)#[0].pertainyms())
#     thingy = ss.lemmas()[0].pertainyms()
    print(ss.lemmas())
    thingq = ss.lemmas()[1]

Synset('handiness.n.02')
[Lemma('handiness.n.02.handiness'), Lemma('handiness.n.02.accessibility'), Lemma('handiness.n.02.availability'), Lemma('handiness.n.02.availableness')]
Synset('approachability.n.01')
[Lemma('approachability.n.01.approachability'), Lemma('approachability.n.01.accessibility')]


In [118]:
thingq

Lemma('approachability.n.01.accessibility')

In [110]:
ss.lemmas()

[Lemma('approachability.n.01.approachability'),
 Lemma('approachability.n.01.accessibility')]

In [154]:
wn.lemmas('accessibility')[0].derivationally_related_forms()[0].name()

'accessible'

## to dos

#### review endings of Adverbs
- ly

visualize counts and train dataframe
put processing in .py file