In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
# Rebinds the name `stopwords`: it was the nltk corpus reader imported above and
# becomes a plain set of English stop words (count_stopword tests membership
# against it). After this line the corpus reader is no longer reachable under
# this name, so this statement only works right after the import has run.
stopwords = set(stopwords.words("english"))
# Load the raw train/test CSVs; the paths are relative to the working directory.
train_data = pd.read_csv('../../data/spooky-author/download/train.csv')
test_data = pd.read_csv('../../data/spooky-author/download/test.csv')

### text features

In [2]:
def clean_string(x):
    """Return ``x`` lower-cased with every ``string.punctuation`` character removed.

    Whitespace is left untouched, so callers can still ``split()`` the result.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    lowered = x.lower()
    return lowered.translate(punctuation_remover)

def count_word(x):
    """Return how many whitespace-separated tokens remain after ``clean_string``."""
    tokens = clean_string(x).split()
    return len(tokens)

def count_word_unique(x):
    """Return the number of distinct tokens in the cleaned (lower-cased) text."""
    distinct_tokens = {token for token in clean_string(x).split()}
    return len(distinct_tokens)

def word_lenght(x):
    """Return the mean character length of the words in ``x``.

    Words are the whitespace-separated tokens of ``clean_string(x)``.

    Returns 0.0 when no words remain (empty or punctuation-only text).
    Without this guard ``np.mean`` of an empty list yields NaN and emits a
    RuntimeWarning, which then ends up in the feature column.
    """
    words = clean_string(x).split()
    if not words:
        return 0.0
    # A distinct loop name avoids shadowing the parameter ``x``.
    return np.mean([len(word) for word in words])

def count_punct(x):
    """Return the number of characters of ``x`` that are in ``string.punctuation``."""
    punctuation_flags = [int(char in string.punctuation) for char in x]
    return np.sum(punctuation_flags)

def count_upper(x):
    """Return how many whitespace-separated tokens equal their ``str.title()`` form.

    The raw text is used (no cleaning). Because the check is ``title() == token``:
    all-caps tokens such as "HELLO" are not counted, tokens without any cased
    character (e.g. "3") are counted, and tokens with an apostrophe followed by
    a lower-case letter (e.g. "It's") are not counted.
    """
    tokens = x.split()
    return np.sum([token == token.title() for token in tokens])

def count_stopword(x):
    """Return how many cleaned tokens of ``x`` are in the module-level ``stopwords`` set."""
    tokens = clean_string(x).split()
    return np.sum([int(token in stopwords) for token in tokens])

def count_stemwords(x):
    """Return how many cleaned tokens of ``x`` differ from their Porter stem."""
    stemmer = PorterStemmer()
    tokens = clean_string(x).split()
    # Stem each token once and compare it with the token it came from.
    return np.sum([token != stemmer.stem(token) for token in tokens])

def _count_pos_tag(x, tag_fragment):
    """Count tokens of ``x`` whose POS tag contains ``tag_fragment``.

    The text is cleaned with ``clean_string``, tokenized with
    ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``. The match is a
    substring test on the tag, so a fragment also matches longer tags that
    contain it.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(clean_string(x)))
    return np.sum([1 if tag_fragment in tag else 0 for _, tag in tagged_tokens])


def count_noun(x):
    """Return the number of tokens whose POS tag contains 'NN'."""
    return _count_pos_tag(x, 'NN')


def count_adj(x):
    """Return the number of tokens whose POS tag contains 'JJ'."""
    return _count_pos_tag(x, 'JJ')


def count_det(x):
    """Return the number of tokens whose POS tag contains 'DT'."""
    # NOTE(review): the substring test also matches tags such as PDT/WDT --
    # confirm that counting them as determiners is intended.
    return _count_pos_tag(x, 'DT')


def count_verb(x):
    """Return the number of tokens whose POS tag contains 'VB'."""
    return _count_pos_tag(x, 'VB')


def count_pronoun(x):
    """Return the number of tokens whose POS tag contains 'PRP'."""
    # NOTE(review): the substring test also matches PRP$ -- confirm that
    # possessive pronouns are meant to be included.
    return _count_pos_tag(x, 'PRP')

def count_chars(x):
    """Return the length of the raw text ``x``, punctuation and whitespace included."""
    total_characters = len(x)
    return total_characters

In [3]:
# Count features for the training frame: (column name, extractor) pairs, listed
# in the order the columns are appended. Each extractor runs once per row of
# the `text` column via Series.apply.
train_count_features = (
    ('count_word', count_word),
    ('count_word_unique', count_word_unique),
    ('word_lenght', word_lenght),
    ('count_punct', count_punct),
    ('count_upper', count_upper),
    ('count_stemwords', count_stemwords),
    ('count_stopword', count_stopword),
    ('count_noun', count_noun),
    ('count_pronoun', count_pronoun),
    ('count_det', count_det),
    ('count_adj', count_adj),
    ('count_verb', count_verb),
    ('count_chars', count_chars),
)
for feature_name, extractor in train_count_features:
    train_data[feature_name] = train_data.text.apply(extractor)

In [4]:
# Count features for the test frame: (column name, extractor) pairs, listed in
# the order the columns are appended. Each extractor runs once per row of the
# `text` column via Series.apply.
test_count_features = (
    ('count_word', count_word),
    ('count_word_unique', count_word_unique),
    ('word_lenght', word_lenght),
    ('count_punct', count_punct),
    ('count_upper', count_upper),
    ('count_stemwords', count_stemwords),
    ('count_stopword', count_stopword),
    ('count_noun', count_noun),
    ('count_pronoun', count_pronoun),
    ('count_det', count_det),
    ('count_adj', count_adj),
    ('count_verb', count_verb),
    ('count_chars', count_chars),
)
for feature_name, extractor in test_count_features:
    test_data[feature_name] = test_data.text.apply(extractor)

In [9]:
# Ratio features for the training frame: (ratio column, numerator, denominator).
# Punctuation is normalised by character count, everything else by word count.
train_ratio_specs = (
    ('ratio_punct', 'count_punct', 'count_chars'),
    ('ratio_upper', 'count_upper', 'count_word'),
    ('ratio_stemwords', 'count_stemwords', 'count_word'),
    ('ratio_stopword', 'count_stopword', 'count_word'),
    ('ratio_noun', 'count_noun', 'count_word'),
    ('ratio_pronoun', 'count_pronoun', 'count_word'),
    ('ratio_det', 'count_det', 'count_word'),
    ('ratio_adj', 'count_adj', 'count_word'),
    ('ratio_verb', 'count_verb', 'count_word'),
)
for ratio_column, numerator_column, denominator_column in train_ratio_specs:
    train_data[ratio_column] = train_data[numerator_column] / train_data[denominator_column]

In [10]:
# Ratio features for the test frame: (ratio column, numerator, denominator).
# Punctuation is normalised by character count, everything else by word count.
test_ratio_specs = (
    ('ratio_punct', 'count_punct', 'count_chars'),
    ('ratio_upper', 'count_upper', 'count_word'),
    ('ratio_stemwords', 'count_stemwords', 'count_word'),
    ('ratio_stopword', 'count_stopword', 'count_word'),
    ('ratio_noun', 'count_noun', 'count_word'),
    ('ratio_pronoun', 'count_pronoun', 'count_word'),
    ('ratio_det', 'count_det', 'count_word'),
    ('ratio_adj', 'count_adj', 'count_word'),
    ('ratio_verb', 'count_verb', 'count_word'),
)
for ratio_column, numerator_column, denominator_column in test_ratio_specs:
    test_data[ratio_column] = test_data[numerator_column] / test_data[denominator_column]

In [12]:
# Remove the raw text (and, for train, the author column) from the frames.
# The drop is in place, so re-running this cell without reloading the data
# raises KeyError because the columns are already gone.
train_data.drop(columns=['text', 'author'], inplace=True)
test_data.drop(columns=['text'], inplace=True)

In [13]:
# Summary statistics of the training features, one row per feature column.
train_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count_word,19579.0,26.702487,19.044772,2.0,15.0,23.0,34.0,861.0
count_word_unique,19579.0,22.716073,13.096218,1.0,14.0,21.0,29.0,410.0
word_lenght,19579.0,4.472716,0.577331,1.888889,4.111111,4.444444,4.804348,11.0
count_punct,19579.0,3.759283,3.009744,1.0,2.0,3.0,5.0,71.0
count_upper,19579.0,2.203994,1.996668,0.0,1.0,2.0,3.0,46.0
count_stemwords,19579.0,7.42755,5.794203,0.0,4.0,6.0,10.0,246.0
count_stopword,19579.0,13.61377,10.006386,0.0,7.0,12.0,18.0,435.0
count_noun,19579.0,6.452832,5.029767,0.0,3.0,5.0,8.0,206.0
count_pronoun,19579.0,1.99336,2.117201,0.0,1.0,2.0,3.0,74.0
count_det,19579.0,3.306298,2.980991,0.0,1.0,3.0,5.0,105.0


In [14]:
# Summary statistics of the test features, one row per feature column.
test_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count_word,8392.0,26.453051,19.532626,3.0,15.0,23.0,34.0,818.0
count_word_unique,8392.0,22.542302,13.217536,3.0,14.0,21.0,29.0,377.0
word_lenght,8392.0,4.47844,0.576544,2.25,4.111111,4.459459,4.8125,8.6
count_punct,8392.0,3.682674,2.927275,1.0,2.0,3.0,5.0,48.0
count_upper,8392.0,2.168613,1.9454,0.0,1.0,2.0,3.0,54.0
count_stemwords,8392.0,7.398832,5.983552,0.0,4.0,6.0,10.0,227.0
count_stopword,8392.0,13.529194,10.253577,0.0,7.0,12.0,17.0,430.0
count_noun,8392.0,6.352955,4.981991,0.0,3.0,5.0,8.0,164.0
count_pronoun,8392.0,1.977836,2.0918,0.0,1.0,1.0,3.0,70.0
count_det,8392.0,3.26418,2.933078,0.0,1.0,3.0,5.0,85.0


In [15]:
# Write both feature frames to CSV without the DataFrame index.
feature_exports = (
    (train_data, '../../data/spooky-author/data/train_text_feats.csv'),
    (test_data, '../../data/spooky-author/data/test_text_feats.csv'),
)
for feature_frame, destination in feature_exports:
    feature_frame.to_csv(destination, index=False)