# Spooky Author Identification
![](https://image.freepik.com/free-photo/halloween-pumpkin-on-a-wooden-table_1048-3118.jpg )


Да погледнем правилата на състезанието:

https://www.kaggle.com/c/spooky-author-identification


In [2]:
# Dataset
import pandas as pd
import numpy as np
import matplotlib as plt

train = pd.read_csv("data/train.csv", index_col=['id'])
test = pd.read_csv("data/test.csv", index_col=['id'])

print(train.shape, test.shape)
print(set(train.columns) - set(test.columns))

(19579, 2) (8392, 1)
{'author'}


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [8]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', MultinomialNB(alpha=0.01))
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.83195466  0.83466135  0.83187739]
[-0.42530307 -0.418245   -0.42500535]


Ще опитам невронна мрежа с 1 скрит layer и 20 неврона

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Normalizer
  

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('normalizer', Normalizer()),
    ('clf', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=26))
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[-3.57745484 -2.79645442 -3.21379609]


Нека да запишем какво пробваме
* 
    StandardScaller
    MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(15,), random_state=26)
    
    [ 0.8359375   0.83558075  0.83478927]
    [-0.46610875 -0.47354423 -0.4626415 ]
     


*   
    ('normalizer', Normalizer()),
    ('clf', MLPClassifier(solver='adam', alpha=1e-5, random_state=26))
    
    [ 0.83287377  0.83343549  0.83417625]
    [-0.48226704 -0.49633421 -0.47903389]
    

*   
    ('normalizer', Normalizer()),
    ('clf', MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(20,), random_state=26))

    [ 0.83501838  0.83757279  0.83632184]
    [-0.46630463 -0.47214645 -0.46082846]
    

*    
    ('normalizer', Normalizer()),
    ('clf', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=26))
    [-2.64142297 -1.5838596  -2.74649462]


Невронната мрежа не се справи out of the box по-добре от naivebs, ще пробвам svm и след това стакинг

In [11]:
# from sklearn.base import TransformerMixin

# class DenseTransformer(TransformerMixin):

#     def transform(self, X, y=None, **fit_params):
#         return X.todense()

#     def fit_transform(self, X, y=None, **fit_params):
#         self.fit(X, y, **fit_params)
#         return self.transform(X)

#     def fit(self, X, y=None, **fit_params):
#         return self


# from sklearn.naive_bayes import GaussianNB
# pipeline = Pipeline([
#     ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
#                                  max_df=0.8, lowercase=False)),
#     ('normalizer', Normalizer()),
#     ('dense', DenseTransformer()),
#     ('clf', GaussianNB())
# ])

# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
#                       scoring='neg_log_loss'))

Това не беше добра идея (todense), почти ми свърши паметта

In [14]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[
                                      ('mnb', MultinomialNB()),
                                      ('mnbTuned', MultinomialNB(alpha=0.01)),
                                    ], voting="soft")

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', voting)
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, scoring='neg_log_loss'))

[-0.47992097 -0.4751916  -0.47437537]


VotingClassifier с MultinomialNb, LogisticRegression, RandomForest -> [-0.5660945  -0.56260754 -0.5608995 ]

VotingClassifier с MultinomialNb, LogisticRegression -> [-0.48491232 -0.48166042 -0.47702683]

VotingClassifier с MultinomialNb,MultinomialNb(alpha=0.01 -> [-0.48093573 -0.47613605 -0.47296766]

VotingClassifier с MultinomialNb,MultinomialNb(alpha=0.01 -> [-0.48093573 -0.47613605 -0.47296766]
    TfidfVectorizer : 
        lowercase = True [-0.48491232 -0.48166042 -0.47702683]
        lowercase = False [-0.47992097 -0.4751916  -0.47437537]

In [16]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier(base_estimator=MultinomialNB(), n_estimators=50, random_state=26)

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', adaboost)
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, scoring='neg_log_loss'))

[-1.09784003 -1.09784169 -1.09783464]


Adaboost не работи добре (поне така казват), когато модела, който му се подава е достатъчно добър. Това потвърждава че nb wins here

In [17]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='hinge', penalty='l2', learning_rate='optimal', random_state=26)
calibrated_clf = CalibratedClassifierCV(clf, cv=5, method='sigmoid')

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', calibrated_clf)
])

print(cross_val_score(pipeline, train.text, train.author, cv=3,scoring='neg_log_loss'))



[-0.46200346 -0.47072398 -0.45530661]


Горното е нещо като svc + predict_probа
penalty=elasticnet се справя по-зле отколкотко l2 регуларизация


### Нека да пробваме word2vec
Ползвам претраниран модел от https://github.com/stanfordnlp/GloVe


In [25]:
# wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove.6B.zip

with open("/home/radoslav/ML/glove.6B.50d.txt", "rb") as lines:
    w2v = {line.split()[0].decode("utf-8"): np.array( list( map(float,line.split()[1:]) ))
           for line in lines}
    
w2v

{'subtracting': array([ 0.48731 , -0.89852 ,  0.58628 , -0.77398 ,  0.84084 ,  0.27935 ,
         0.66807 , -0.34919 , -0.53451 ,  0.28834 ,  0.074242,  0.14796 ,
         0.069395, -0.52741 ,  0.54256 , -0.72247 , -0.43349 , -0.91146 ,
        -0.11995 , -0.7727  ,  0.82568 , -1.3249  ,  1.2737  , -0.36607 ,
        -0.39372 ,  0.76102 , -0.45219 , -0.16458 ,  0.24185 ,  0.4754  ,
         0.34141 ,  0.78964 ,  0.37028 ,  0.1219  , -0.255   , -0.34079 ,
        -0.66942 ,  0.89218 ,  0.38361 ,  0.27842 ,  0.65074 , -0.021613,
         0.19973 ,  0.80647 , -1.1465  , -1.595   ,  0.66827 ,  0.82666 ,
         1.0648  ,  0.095217]),
 'shenango': array([-0.22581 , -0.38694 ,  0.13432 , -0.19649 , -0.68066 , -0.44579 ,
        -0.2773  ,  0.14105 ,  0.72439 , -0.7158  ,  0.15072 , -0.50614 ,
         0.72751 , -0.88268 , -1.788   ,  0.38935 ,  0.13587 ,  0.27328 ,
         0.78929 ,  0.43543 , -0.43378 , -0.59479 ,  0.28599 , -0.38478 ,
        -0.096505,  0.81908 ,  0.027636,  0.86593 , -

In [26]:
from sklearn.svm import SVC

class MeanEmbeddingVectorizer(TransformerMixin):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = 50

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in CountVectorizer().fit([words]).vocabulary_.keys() if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
  


In [27]:
MeanEmbeddingVectorizer(w2v).fit_transform(["text of me", "more text", "maskata"])

array([[  2.96476667e-01,   4.16796667e-01,  -1.09083500e-01,
         -2.77376667e-01,   6.15723333e-01,   1.46513333e-01,
         -8.28600000e-02,  -3.65036667e-01,  -3.01563333e-01,
          1.56476667e-01,  -1.79856000e-01,   3.94356667e-01,
         -1.13476667e-01,  -3.02346667e-01,   5.51693333e-01,
         -5.70900000e-02,  -2.46901667e-01,  -3.69876333e-01,
          1.55863333e-01,  -3.41371333e-01,  -5.01066667e-02,
          2.22073333e-01,   3.97290333e-01,   1.74400000e-01,
          7.90640000e-01,  -1.57374333e+00,  -8.32650000e-01,
          4.13666667e-02,   1.53416667e-01,  -7.50936667e-01,
          3.30063333e+00,  -1.70573333e-01,  -4.49246667e-01,
         -2.60870000e-01,  -1.49656667e-01,  -8.16168667e-02,
          3.06500000e-01,   3.47900000e-03,  -1.34223333e-01,
          2.95000000e-03,   5.02686667e-01,   4.27100000e-02,
         -6.22240000e-02,   4.67173333e-02,  -9.41233333e-02,
          3.33340000e-01,   2.18053333e-01,   6.93633333e-02,
        

In [28]:
from collections import defaultdict

class TfidfEmbeddingVectorizer(TransformerMixin):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = 50

    def fit(self, X, y=None):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [29]:
TfidfEmbeddingVectorizer(w2v).fit_transform(["text of me", "more text", "maskata"])

array([[ -1.13205672e-02,   1.27625442e+00,   1.24842331e+00,
          1.03171819e+00,   2.17858951e-01,  -4.91140561e-01,
          4.32793429e-01,  -1.22902120e+00,  -1.29422748e-02,
         -1.28656898e-01,   3.61159958e-02,   5.49338191e-01,
         -5.49898992e-01,  -6.56790241e-01,   1.40432426e-01,
         -2.11383637e-01,  -7.90840410e-01,  -2.72199364e-01,
         -5.13745638e-01,  -4.65160727e-01,  -6.12493489e-01,
         -3.30666930e-01,   1.01837622e+00,   5.91328477e-01,
         -3.18106756e-01,  -4.65992807e-01,  -4.82740028e-01,
         -1.92993260e-01,   2.00519810e-01,  -5.45932113e-01,
          2.97634048e+00,   2.37669383e-02,  -2.87231767e-01,
          6.21834005e-01,  -8.36678487e-02,  -8.67400588e-01,
          7.39098942e-01,  -7.40250955e-01,   3.15137431e-01,
          2.39207474e-01,   1.36172654e+00,   3.04427250e-02,
          2.07133651e-01,   3.29205059e-02,  -4.60512630e-02,
          2.60976993e-01,   4.41704417e-01,  -6.38602747e-01,
        

In [30]:
pipeline = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("SVM", SVC(kernel="linear", probability=True))])

print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

[-0.8596242  -0.86446726 -0.86176207]


In [31]:
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("NB", BernoulliNB())])
print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

[-1.04477902 -1.00216502 -1.02352759]


In [33]:
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("Logistic",LogisticRegression())])
print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

[-0.86196739 -0.86640521 -0.86253987]


In [34]:
pipeline = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("SVM", SVC(kernel="linear", probability=True))])

print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))


[-0.97906212 -0.98452975 -0.97140394]


In [None]:
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("NB", BernoulliNB())])
print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

In [None]:

pipeline = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("Logistic",LogisticRegression(C=100))])

print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

Хмм общо взето word2vec концепцията не дава по-добри резултати засега

### Cheat it with spacy

Ще пробвам LDA & NMF да открия разни сродни теми

In [4]:
from sklearn.decomposition import NMF
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(train.text)

nmf = NMF(n_components=3, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)


In [5]:
def display_topics(model, vectorizer, bag_words, no_top_words=10, no_top_documents=10):
    W = model.transform(bag_words)

    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic {}:".format (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print ("-------And top documents:-------------")
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(train.text[doc_index])
            print()
        print("#####################################")

In [6]:
display_topics(nmf, tfidf_vectorizer, tfidf)

Topic 0:
man time old saw night say life great came day
-------And top documents:-------------
Never saw the old man.

Thus man is individualized.

For it had been a man.

The old man was dead.

In short, I became a new man, and lived a man's life.

And for this reason when a man's diddled we say he's "done."

The man was great I say it even now, knowing as much as I do.

"Great man" said Bentley.

After a time the man left me alone in the attic room.

So true it is, that man's mind alone was the creator of all that was good or great to man, and that Nature herself was only his first minister.

#####################################
Topic 1:
said dupin let sir little yes dear quite mean pounds
-------And top documents:-------------
"Bête" said the first.

That was what she said.

What it said of myself was.

"Get out o' that" said a third.

"It is nothing," he said, at last.

This will I do, I said.

"Proceed," said I. "Or not," said Dupin.

What he said tallied up with what I'd heard.


In [7]:
from sklearn.decomposition import LatentDirichletAllocation

tf_vectorizer = CountVectorizer(max_df=0.85, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(train.text)

lda = LatentDirichletAllocation(n_components=3, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

display_topics(lda, tf_vectorizer, tf)

Topic 0:
life night day time did shall came know death love
-------And top documents:-------------
I dared, I conquered them all, till now I have sold myself to death, with the sole condition that thou shouldst follow me Fire, and war, and plague, unite for thy destruction O my Raymond, there is no safety for thee" With an heavy heart I listened to the changes of her delirium; I made her a bed of cloaks; her violence decreased and a clammy dew stood on her brow as the paleness of death succeeded to the crimson of fever, I placed her on the cloaks.

"O Aira, city of marble and beryl, how many are thy beauties How loved I the warm and fragrant groves across the hyaline Nithra, and the falls of the tiny Kra that flowed through the verdant valley In those groves and in that vale the children wove wreaths for one another, and at dusk I dreamed strange dreams under the yath trees on the mountain as I saw below me the lights of the city, and the curving Nithra reflecting a ribbon of stars.

A

В topic 0: има и 3те автори, т.е не успяхме да ги клъстеризираме по автори (ако бяхме успели щеше да е много наивно, но трябваше да се пробва)

Ще използвам намерените топици като feature-и.
Мога да пробвам само по индекс на топика, после мога да включа k-те срещани думи, а накрая от срещаните думи може
word2vec

In [None]:
def fill_topics(key, model, vectorizer, bag_words):
    train[key] = ""
    W = model.transform(bag_words)
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1]
        for doc_index in top_doc_indices:
            train[key][doc_index] = topic_idx
            
        

In [None]:
# fill_topics("lda", lda, tf_vectorizer, tf)
# fill_topics("nmf", nmf, tfidf_vectorizer, tfidf)

# train.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
# train["lda_0"] = train["lda"] == 0 + 0
# train["lda_1"] = train["lda"] == 1 + 0
# train["lda_2"] = train["lda"] == 2 + 0

# train["nmf_0"] = train["nmf"] == 0 
# train["nmf_1"] = train["nmf"] == 1 
# train["nmf_2"] = train["nmf"] == 2 

# train = train.drop(["lda","nmf"], axis=1)
# train.head(5)

KeyError: 'lda'

In [None]:
# from sklearn.naive_bayes import MultinomialNB

# pipeline = Pipeline([
#     ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
#                                  max_df=0.8, lowercase=False)),
#     ('clf', MultinomialNB(alpha=0.01))
# ])




    

# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
#                       scoring='neg_log_loss'))