In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.set_option('mode.chained_assignment', None)

# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to a plain set of English stop words; the reader is no longer reachable
# under this name afterwards.
stopwords = set(stopwords.words('english'))

# Raw train / test tables for the spooky-author data set.
train_data = pd.read_csv(filepath_or_buffer='../../data/spooky-author/download/train.csv')
test_data = pd.read_csv(filepath_or_buffer='../../data/spooky-author/download/test.csv')

In [2]:
def clean_string(x):
    """Normalise one piece of text for vectorisation.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, and each token is passed
    through a Porter stemmer.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string when no
        tokens remain).
    """
    stemmer = PorterStemmer()
    # Mapping every punctuation character to None makes translate() delete it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    tokens = x.lower().translate(drop_punctuation).split()
    return ' '.join(map(stemmer.stem, tokens))

### Vectorizer + SVD

In [3]:
# Word-level features: TF-IDF over 1- to 3-grams of the cleaned text, reduced
# to 20 dense components with truncated SVD.
#
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and accept only 'english', a list, or None (a set is rejected).
# NOTE(review): the stop words are unstemmed while the text below is stemmed
# and punctuation-stripped, so some stop words may not match -- confirm
# whether that is intended.
vectorizer = TfidfVectorizer(stop_words=sorted(stopwords), ngram_range=(1,3), min_df=5)
svd = TruncatedSVD(n_components=20, algorithm='arpack', random_state=123)

train_text = [clean_string(doc) for doc in train_data.text.values]
test_text = [clean_string(doc) for doc in test_data.text.values]

# Vocabulary, IDF weights and the SVD basis are fitted on train + test text
# together (no labels are involved in this step).
full_text = vectorizer.fit_transform(train_text + test_text)
svd.fit(full_text)

train_word = svd.transform(vectorizer.transform(train_text))
test_word = svd.transform(vectorizer.transform(test_text))
print('train:', train_word.shape)
print('test:', test_word.shape)

train: (19579, 20)
test: (8392, 20)


In [4]:
# Character-level features: TF-IDF over character 1- to 3-grams of the cleaned
# text, reduced to 20 dense components with truncated SVD.
#
# stop_words is intentionally not passed: scikit-learn only applies it when
# analyzer == 'word', so it had no effect on these features, and recent
# releases reject a set for that parameter.
vectorizer = TfidfVectorizer(ngram_range=(1,3), min_df=5, analyzer='char')
svd = TruncatedSVD(n_components=20, algorithm='arpack', random_state=123)

# train_text / test_text are the cleaned documents built in the word-level
# cell above; they are reused here instead of re-running the stemmer.
full_text = vectorizer.fit_transform(train_text + test_text)

# The TF-IDF matrix is already floating point, so it is fitted directly
# (same as the word-level cell) without an explicit float conversion.
svd.fit(full_text)

train_char = svd.transform(vectorizer.transform(train_text))
test_char = svd.transform(vectorizer.transform(test_text))
print('train:', train_char.shape)
print('test:', test_char.shape)

train: (19579, 20)
test: (8392, 20)


In [5]:
# Put the word-level and char-level SVD components side by side
# (20 + 20 columns per row).
combined_train = np.concatenate((train_word, train_char), axis=1)
combined_test = np.concatenate((test_word, test_char), axis=1)

# Min-max scale every column using ranges learned from the training rows only;
# the same fitted scaler is then applied to the test rows.
# Presumably this keeps the training features non-negative for the
# MultinomialNB model used later -- confirm.
scaler = MinMaxScaler()
scaler.fit(combined_train)
train_feats = scaler.transform(combined_train)
test_feats = scaler.transform(combined_test)

for label, feats in (('train:', train_feats), ('test:', test_feats)):
    print(label, feats.shape)

train: (19579, 40)
test: (8392, 40)


### Naive Bayes classifier

In [6]:
# Shuffled 5-fold splitter with a fixed seed, shared by the CV loop below.
folds = KFold(n_splits=5, random_state=2017, shuffle=True)

# Integer class code for each author label.
mapper = {'EAP':0, 'HPL':1, 'MWS':2}

# Encode the author column in place. Values that are already class codes are
# passed through unchanged, so re-running this cell is harmless; any other
# unknown label still raises KeyError.
known_codes = set(mapper.values())
train_data['author'] = train_data['author'].map(
    lambda x: x if x in known_codes else mapper[x]
)

In [7]:
# Cross-validated Naive Bayes scores.
#
# For every fold a fresh MultinomialNB is fitted on the training part and used
# to produce
#   * out-of-fold class probabilities for the held-out rows -> valid_score
#     (columns: id, author, scores_0, scores_1, scores_2), and
#   * class probabilities for the full test set              -> test_score
#     (columns: id, then scores_<class>_<fold> for every fold).
valid_parts = []  # one out-of-fold frame per fold, concatenated once at the end
test_score = pd.DataFrame([])
test_score['id'] = test_data['id']

for fold, (train_id, test_id) in enumerate(folds.split(train_feats)):
    X_train = train_feats[train_id,:]
    y_train = train_data.iloc[train_id,:]['author']
    X_valid = train_feats[test_id,:]

    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Predict once per matrix and slice per class afterwards.
    valid_proba = model.predict_proba(X_valid)
    test_proba = model.predict_proba(test_feats)

    # Keeps the original row index of train_data for the held-out rows.
    valid_temp = train_data.iloc[test_id,:][['id', 'author']].copy()
    for cls in range(valid_proba.shape[1]):
        valid_temp['scores_' + str(cls)] = valid_proba[:,cls]
        test_score['scores_' + str(cls) + '_' + str(fold)] = test_proba[:,cls]
    valid_parts.append(valid_temp)

# DataFrame.append no longer exists in pandas >= 2.0; build the frame in one go.
valid_score = pd.concat(valid_parts)

In [8]:
# Average each class's test-set probability over the CV folds.
# Per-fold columns are named 'scores_<class>_<fold>'; they are discovered from
# the frame so the number of folds is not hard-coded here.
for cls in range(3):
    fold_prefix = 'scores_' + str(cls) + '_'
    fold_cols = [col for col in test_score.columns if col.startswith(fold_prefix)]
    if not fold_cols:
        raise KeyError(
            "no per-fold columns starting with '" + fold_prefix
            + "' in test_score; run the cross-validation cell first"
        )
    # skipna=False: a missing value in any fold yields NaN, as plain addition would.
    test_score['scores_' + str(cls)] = test_score[fold_cols].mean(axis=1, skipna=False)

# Keep only the id and the averaged scores; copy so later column assignments
# act on an independent frame rather than a slice.
test_score = test_score[['id','scores_0', 'scores_1','scores_2']].copy()

In [9]:
# Min-max scale the three score columns. The scaler is fitted on the
# out-of-fold training scores and then applied unchanged to the averaged
# test scores. Columns are selected by name rather than by position so the
# result does not depend on column order.
score_cols = ['scores_0', 'scores_1', 'scores_2']

scaler = MinMaxScaler()
valid_scaled = scaler.fit_transform(valid_score[score_cols])

# The label column is not part of the exported features. errors='ignore'
# keeps a re-run of this cell from failing once the column is already gone.
valid_score = valid_score.drop(columns='author', errors='ignore')
valid_score[score_cols] = valid_scaled

# Work on an independent copy so the assignment never targets a slice of
# another frame.
test_score = test_score.copy()
test_score[score_cols] = scaler.transform(test_score[score_cols])

print('train:', valid_score.shape)
print('test:', test_score.shape)

train: (19579, 4)
test: (8392, 4)


In [10]:
# Persist the Naive Bayes score features. to_csv is called with its defaults,
# so each frame's index is written as the first column of the file.
nb_feature_files = (
    ('../../data/spooky-author/data/train_nb_feats.csv', valid_score),
    ('../../data/spooky-author/data/test_nb_feats.csv', test_score),
)
for csv_path, score_frame in nb_feature_files:
    score_frame.to_csv(csv_path)