# Spooky Author Identification
![](https://image.freepik.com/free-photo/halloween-pumpkin-on-a-wooden-table_1048-3118.jpg )


Да погледнем правилата на състезанието:

https://www.kaggle.com/c/spooky-author-identification


In [1]:
# Dataset
import pandas as pd
import numpy as np
import matplotlib as plt

train = pd.read_csv("data/train.csv", index_col=['id'])
test = pd.read_csv("data/test.csv", index_col=['id'])

print(train.shape, test.shape)
print(set(train.columns) - set(test.columns))

(19579, 2) (8392, 1)
{'author'}


In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

params_count_word = {"features__ngram_range": [(1,1), (1,2), (1,3)],
                      "features__analyzer": ['word'],
                      "features__max_df":[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
                      "features__min_df":[2, 3, 5, 10],
                      "features__lowercase": [False, True],
                      "features__stop_words": [None, stopwords]}

params_count_char = {"features__ngram_range": [(1,4), (1,5), (1,6)],
                      "features__analyzer": ['char'],
                      "features__max_df":[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
                      "features__min_df":[2, 3, 5, 10],
                      "features__lowercase": [False, True],
                      "features__stop_words": [None, stopwords]}

In [39]:
def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [40]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
def random_search():
    params = {
        "clf__C": [0.01, 0.1, 0.3, 1, 3, 10],
        "clf__class_weight": [None, 'balanced']
    }

    params.update(params_count_word)

    pipeline = Pipeline([
        ('features', CountVectorizer()),
        ('clf', LogisticRegression())
    ])

    random_search = RandomizedSearchCV(pipeline, param_distributions=params, 
                                       scoring='neg_log_loss',
                                       n_iter=20, cv=3, n_jobs=4)

    random_search.fit(train.text, train.author)
    report(random_search.cv_results_)

# random_search()

Model with rank: 1
Mean validation score: -0.475 (std: 0.002)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 1), 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': None, 'clf__C': 1}

Model with rank: 2
Mean validation score: -0.482 (std: 0.002)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': None, 'clf__C': 1}

Model with rank: 3
Mean validation score: -0.486 (std: 0.001)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 1), 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 3}

Model with rank: 4
Mean validation score: -0.508 (std: 0.004)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__lowercase': False, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 0.3}

Model with rank: 5
Mean validation score: -0.525 (std: 0.004)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 0.3}


Търсенето отнеме много време, заради това за `char-grams` ще пусна само едно трениране и оценяване с по-стандартни стойности на хипер параметрите.

In [41]:
pipeline = Pipeline([
    ('features', CountVectorizer(ngram_range=(3,5), analyzer='char')),
    ('clf', LogisticRegression())
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, scoring='neg_log_loss'))

[ 0.81648284  0.81152314  0.82022989]
[-0.57632589 -0.5762516  -0.54685003]


По-лоши резултати с chars - няма да го изследваме.

За сметка на това ще пробваме да заменим `CountVectorizer` с по-големия му батко `Tfidf`.

# Tfidf = Term-frequency inverse document-frequency

* Идеята е да сложи тежести и значимост на всички думи или n-grams.
* Напр. "новина" е доста често срещана дума и може да бъде в различни контексти. 
* За разлика от "електроенцефалограф", която е много по-рядко срещана и директно дава медицински контекст.


* TF брои колко пъти се среща думата в текущия текст (пасаж, изречение, документ, семпъл).
* IDF брои колко пъти тази дума се среща изцяло в корпуса с които тренираме.

# Има и формула 

$$
    \operatorname {tfidf} (w,d) = \operatorname{tf} * \log \Big(  \frac{n+1}{n_w + 1} \Big )  + 1
$$

където:

* $w$ - конкретна дума
* $d$ - документ на които правим трансформация
* $n$ - бр. на документите в трейн сета
* $n_w$ - бр. на документите в които $w$ се среща
* $\operatorname{tf}$ - бр. срещанията на думата $w$ в документа $d$

# Примерно

* "екстраполирам" се среща 10 пъти в 1000 документа корпус.
* "за" се среща в 900 от 1000.


$$
\text{tfidf("екстраполирам", "екстраполирам нещо си")} = 1 * log(1001 / 11) + 1 = 5.51
$$

$$
\text{tfidf("за", "отиде да тича за нещо си... за да му дойде акъла")} = 2 * log(1001 / 901) + 1 = 1.21
$$

In [42]:
print(1 * np.log(1001 / 11) + 1)
print(2 * np.log(1001 / 901) + 1)

5.51085950652
1.21049904341


In [43]:
tfidf = TfidfVectorizer()
print(tfidf.fit_transform(corpus).todense())
print(tfidf.vocabulary_)

[[ 0.39505606  0.          0.79011212  0.          0.39505606  0.          0.
   0.          0.          0.25215917  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.64450299  0.          0.64450299  0.41137791  0.          0.          0.
   0.        ]
 [ 0.          0.40104275  0.          0.          0.          0.40104275
   0.          0.50867187  0.          0.          0.          0.
   0.50867187  0.40104275]
 [ 0.          0.34336615  0.          0.43551643  0.          0.34336615
   0.          0.          0.          0.27798449  0.43551643  0.43551643
   0.          0.34336615]]
{'make': 7, 'going': 5, 'dollars': 4, 'ot': 10, 'take': 11, 'to': 12, 'are': 1, 'lot': 6, 'care': 3, 'and': 0, 'money': 8, 'we': 13, 'of': 9, 'billions': 2}


In [44]:
CountVectorizer().fit(corpus).vocabulary_ == TfidfVectorizer().fit(corpus).vocabulary_ 

True

Горното сравнение ще рече, че `CountVectorizer` и `TfidfVectorizer` намирът един и същи речник или "торбата с думи".

Това е така защото `TfidfVectorizer` вътрешно ползва `CountVectorizer` а отгоре само добавя idf функционалността.

In [45]:
print(tfidf.idf_)

[ 1.91629073  1.51082562  1.91629073  1.91629073  1.91629073  1.51082562
  1.91629073  1.91629073  1.91629073  1.22314355  1.91629073  1.91629073
  1.91629073  1.51082562]


In [46]:
def random_search():
    params = {
        "clf__C": [0.01, 0.1, 0.3, 1, 3, 10],
        "clf__class_weight": [None, 'balanced']
    }

    params.update(params_count_word)

    pipeline = Pipeline([
        ('features', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ])

    random_search = RandomizedSearchCV(pipeline, param_distributions=params, 
                                       scoring='neg_log_loss',
                                       n_iter=20, cv=3, n_jobs=4)

    random_search.fit(train.text, train.author)
    report(random_search.cv_results_)

# random_search() # предишния най-добър резултат:  -0.475 

Model with rank: 1
Mean validation score: -0.469 (std: 0.005)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 1.0, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 10}

Model with rank: 2
Mean validation score: -0.471 (std: 0.006)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 3, 'features__max_df': 0.5, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': None, 'clf__C': 10}

Model with rank: 3
Mean validation score: -0.483 (std: 0.008)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 5, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 10}

Model with rank: 4
Mean validation score: -0.495 (std: 0.002)
Parameters: {'features__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'], 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.6, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 10}

Model with rank: 5
Mean validation score: -0.522 (std: 0.005)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 10, 'features__max_df': 0.5, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 10}


Има леко подобрение в `LogLoss`.

Да пробваме да сменим и класификатора с друг класически за класификация на текст: `Naive Bayes`

In [47]:
def random_search():
    params = {
        "clf__alpha": [0.01, 0.1, 0.5, 1, 2]
    }

    params.update(params_count_word)

    pipeline = Pipeline([
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB())
    ])

    random_search = RandomizedSearchCV(pipeline, param_distributions=params, 
                                       scoring='neg_log_loss',
                                       n_iter=20, cv=3, n_jobs=4)

    random_search.fit(train.text, train.author)
    report(random_search.cv_results_)

# random_search()  # Предишния най-добър резултат: -0.469

Model with rank: 1
Mean validation score: -0.423 (std: 0.003)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}

Model with rank: 2
Mean validation score: -0.465 (std: 0.003)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 1), 'features__min_df': 3, 'features__max_df': 0.9, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__alpha': 0.01}

Model with rank: 3
Mean validation score: -0.469 (std: 0.004)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 5, 'features__max_df': 0.9, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__alpha': 0.1}

Model with rank: 4
Mean validation score: -0.495 (std: 0.002)
Parameters: {'features__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'], 'features__ngram_range': (1, 3), 'features__min_df': 5, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.1}

Model with rank: 5
Mean validation score: -0.496 (std: 0.004)
Parameters: {'features__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'], 'features__ngram_range': (1, 3), 'features__min_df': 5, 'features__max_df': 0.6, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}


Тук има още подобрение в метриката.

Искам да го пробвам и със stemming.

Освен това се вижда, че избира най-ниската предоставена стойност за `alpha`, може би трябва да пробвам с още по-ниски.

In [48]:
def random_search():
    params = {
        "clf__alpha": [0.001, 0.005, 0.01, 0.05, 0.1, 0.3]
    }

    params.update(params_count_word)

    pipeline = Pipeline([
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB())
    ])

    random_search = RandomizedSearchCV(pipeline, param_distributions=params, 
                                       scoring='neg_log_loss',
                                       n_iter=20, cv=3, n_jobs=4)

    random_search.fit(explore.stemmed, train.author)
    report(random_search.cv_results_)
    
# random_search()  # -0.423

Model with rank: 1
Mean validation score: -0.438 (std: 0.002)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.6, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}

Model with rank: 2
Mean validation score: -0.443 (std: 0.004)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 3, 'features__max_df': 0.6, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__alpha': 0.05}

Model with rank: 3
Mean validation score: -0.453 (std: 0.002)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 2, 'features__max_df': 1.0, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}

Model with rank: 4
Mean validation score: -0.471 (std: 0.003)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 5, 'features__max_df': 1.0, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}

Model with rank: 5
Mean validation score: -0.472 (std: 0.004)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 5, 'features__max_df': 0.5, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.05}


# Откри приблизително същите параметри, но не успя да стигне напълно до същия резултат.

Ще използвам следния модел:

TfIdf + MultinomialNB, без стеминг на текста.

Mean validation score: -0.423 (std: 0.003)

Ще ползвам и следните параметри:

Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}


Последна проверка на този модел за `LogLoss` и `Accuracy`

In [62]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', MultinomialNB(alpha=0.01))
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.83195466  0.83466135  0.83187739]
[-0.42530307 -0.418245   -0.42500535]


In [73]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Normalizer
  

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('normalizer', Normalizer()),
    ('clf', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=26))
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[-2.64142297 -1.5838596  -2.74649462]


Нека да запишем какво пробваме
* 
    StandardScaller
    MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(15,), random_state=26)
    
    [ 0.8359375   0.83558075  0.83478927]
    [-0.46610875 -0.47354423 -0.4626415 ]
     


*   
    ('normalizer', Normalizer()),
    ('clf', MLPClassifier(solver='adam', alpha=1e-5, random_state=26))
    
    [ 0.83287377  0.83343549  0.83417625]
    [-0.48226704 -0.49633421 -0.47903389]
    

*   
    ('normalizer', Normalizer()),
    ('clf', MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(20,), random_state=26))

    [ 0.83501838  0.83757279  0.83632184]
    [-0.46630463 -0.47214645 -0.46082846]
    

*    
    ('normalizer', Normalizer()),
    ('clf', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,), random_state=26))
    [-2.64142297 -1.5838596  -2.74649462]


Невронната мрежа не се справи out of the box по-добре от naivebs, ще пробвам svm и след това стакинг

In [91]:
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


from sklearn.naive_bayes import GaussianNB
pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('normalizer', Normalizer()),
    ('dense', DenseTransformer()),
    ('clf', GaussianNB())
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[-8.87015964 -9.0501544  -8.90862233]


Това не беше добра идея (todense), почти ми свърши паметта

In [113]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[
                                      ('mnb', MultinomialNB()),
                                      ('mnbTuned', MultinomialNB(alpha=0.01)),
                                    ], voting="soft")

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', voting)
])


print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.83808211  0.83803249  0.83923372]
[-0.47992097 -0.4751916  -0.47437537]


VotingClassifier с MultinomialNb, LogisticRegression, RandomForest -> [-0.5660945  -0.56260754 -0.5608995 ]

VotingClassifier с MultinomialNb, LogisticRegression -> [-0.48491232 -0.48166042 -0.47702683]

VotingClassifier с MultinomialNb,MultinomialNb(alpha=0.01 -> [-0.48093573 -0.47613605 -0.47296766]

VotingClassifier с MultinomialNb,MultinomialNb(alpha=0.01 -> [-0.48093573 -0.47613605 -0.47296766]
    TfidfVectorizer : 
        lowercase = True [-0.48491232 -0.48166042 -0.47702683]
        lowercase = False [-0.47992097 -0.4751916  -0.47437537]

In [127]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier(base_estimator=MultinomialNB(), n_estimators=50, random_state=26)

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', adaboost)
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[-1.09784003 -1.09784169 -1.09783464]


Adaboost не работи добре, когато модела, който му се подава е достатъчно добър. Това потвърждава че nb wins here

In [365]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='hinge', penalty='l2', learning_rate='optimal', random_state=26)
calibrated_clf = CalibratedClassifierCV(clf, cv=5, method='sigmoid')

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', calibrated_clf)
])

print(cross_val_score(pipeline, train.text, train.author, cv=3,scoring='neg_log_loss'))



[-0.46200346 -0.47072398 -0.45530661]


Горното е нещо като svc + predict_probа
penalty=elasticnet се справя по-зле отколкотко l2 регуларизация


### Нека да пробваме word2vec

In [280]:
# wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove.6B.zip

with open("/home/radoslav/ML/glove.6B.50d.txt", "rb") as lines:
    w2v = {line.split()[0].decode("utf-8"): np.array( list( map(float,line.split()[1:]) ))
           for line in lines}
    
w2v

{'arden': array([ 0.55923  ,  0.62112  , -0.078735 , -0.10042  , -0.41103  ,
         0.31946  , -0.81441  , -1.0055   ,  0.31992  ,  0.0028103,
        -0.59399  , -0.0068218, -0.77508  , -1.1985   ,  0.13629  ,
         0.87051  ,  0.56806  ,  0.26125  ,  0.10008  ,  0.21765  ,
         0.24066  ,  0.50583  , -0.24744  ,  0.26792  ,  0.040942 ,
         0.16253  , -1.0125   , -0.035844 , -1.1926   , -0.39672  ,
        -0.78138  , -0.38101  , -0.37356  ,  0.14182  ,  0.65277  ,
        -0.37912  ,  0.13964  , -0.038398 ,  0.88246  , -0.22861  ,
         0.13582  ,  0.039775 , -0.93759  , -0.4371   ,  0.63446  ,
         0.57258  ,  0.60536  , -0.61115  ,  0.0516   , -0.10785  ]),
 'julieto': array([-1.2128  , -0.41596 ,  0.77763 ,  1.9324  , -0.16217 , -0.6702  ,
        -0.45819 ,  1.3288  , -1.4395  , -2.0331  ,  0.11579 ,  0.012424,
        -0.60048 , -0.49659 ,  0.036671, -0.51767 ,  1.2788  ,  1.312   ,
         0.65585 ,  0.50776 ,  0.18814 , -2.0575  , -1.5989  , -0.79537 ,
  

In [316]:
from sklearn.svm import SVC

class MeanEmbeddingVectorizer(TransformerMixin):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = 50

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in CountVectorizer().fit([words]).vocabulary_.keys() if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
  


In [326]:
MeanEmbeddingVectorizer(w2v).fit_transform(["text of me", "more text", "maskata"])

array([[  2.96476667e-01,   4.16796667e-01,  -1.09083500e-01,
         -2.77376667e-01,   6.15723333e-01,   1.46513333e-01,
         -8.28600000e-02,  -3.65036667e-01,  -3.01563333e-01,
          1.56476667e-01,  -1.79856000e-01,   3.94356667e-01,
         -1.13476667e-01,  -3.02346667e-01,   5.51693333e-01,
         -5.70900000e-02,  -2.46901667e-01,  -3.69876333e-01,
          1.55863333e-01,  -3.41371333e-01,  -5.01066667e-02,
          2.22073333e-01,   3.97290333e-01,   1.74400000e-01,
          7.90640000e-01,  -1.57374333e+00,  -8.32650000e-01,
          4.13666667e-02,   1.53416667e-01,  -7.50936667e-01,
          3.30063333e+00,  -1.70573333e-01,  -4.49246667e-01,
         -2.60870000e-01,  -1.49656667e-01,  -8.16168667e-02,
          3.06500000e-01,   3.47900000e-03,  -1.34223333e-01,
          2.95000000e-03,   5.02686667e-01,   4.27100000e-02,
         -6.22240000e-02,   4.67173333e-02,  -9.41233333e-02,
          3.33340000e-01,   2.18053333e-01,   6.93633333e-02,
        

In [357]:
from collections import defaultdict

class TfidfEmbeddingVectorizer(TransformerMixin):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = 50

    def fit(self, X, y=None):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [358]:
TfidfEmbeddingVectorizer(w2v).fit_transform(["text of me", "more text", "maskata"])

array([[ -1.13205672e-02,   1.27625442e+00,   1.24842331e+00,
          1.03171819e+00,   2.17858951e-01,  -4.91140561e-01,
          4.32793429e-01,  -1.22902120e+00,  -1.29422748e-02,
         -1.28656898e-01,   3.61159958e-02,   5.49338191e-01,
         -5.49898992e-01,  -6.56790241e-01,   1.40432426e-01,
         -2.11383637e-01,  -7.90840410e-01,  -2.72199364e-01,
         -5.13745638e-01,  -4.65160727e-01,  -6.12493489e-01,
         -3.30666930e-01,   1.01837622e+00,   5.91328477e-01,
         -3.18106756e-01,  -4.65992807e-01,  -4.82740028e-01,
         -1.92993260e-01,   2.00519810e-01,  -5.45932113e-01,
          2.97634048e+00,   2.37669383e-02,  -2.87231767e-01,
          6.21834005e-01,  -8.36678487e-02,  -8.67400588e-01,
          7.39098942e-01,  -7.40250955e-01,   3.15137431e-01,
          2.39207474e-01,   1.36172654e+00,   3.04427250e-02,
          2.07133651e-01,   3.29205059e-02,  -4.60512630e-02,
          2.60976993e-01,   4.41704417e-01,  -6.38602747e-01,
        

In [359]:
pipeline = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("SVM", SVC(kernel="linear", probability=True))])

print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

[-0.85990018 -0.86448015 -0.86179917]


In [315]:
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("NB", BernoulliNB())])
print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

[-1.0756865  -1.08785424 -1.09047044]


In [324]:
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("Logistic",LogisticRegression())])
print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

[-0.86196739 -0.86640587 -0.86254262]


In [362]:
pipeline = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("SVM", SVC(kernel="linear", probability=True))])

print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))


[-0.97894637 -0.98463594 -0.97154666]


In [363]:
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("NB", BernoulliNB())])
print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

[-1.08016871 -1.09355578 -1.10567065]


In [364]:
from sklearn.naive_bayes import BernoulliNB

pipeline = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("Logistic",LogisticRegression(C=100))])

print(cross_val_score(pipeline, train.text, train.author, cv=3, 
                      scoring='neg_log_loss'))

[-0.96339548 -0.96941171 -0.96472623]


Хмм общо взето word2vec концепцията не дава по-добри резултати досега.

### Cheat it with spacy

# Трениране на модел и събмит


In [366]:
pipeline = pipeline.fit(train.text, train.author)

In [367]:
print(pipeline.predict_proba(test[:10].text))

[[  9.80873572e-02   8.95385915e-01   6.52672778e-03]
 [  8.85668383e-01   5.17430254e-03   1.09157314e-01]
 [  1.46774149e-02   1.57320473e-02   9.69590538e-01]
 [  6.67320465e-01   1.42424532e-02   3.18437081e-01]
 [  4.49129692e-01   4.95108960e-01   5.57613479e-02]
 [  8.94724594e-01   4.19147305e-03   1.01083933e-01]
 [  9.41997712e-01   4.02293290e-04   5.75999944e-02]
 [  8.04493073e-03   3.65583704e-01   6.26371365e-01]
 [  9.85298781e-01   1.25687745e-03   1.34443411e-02]
 [  9.39694577e-01   5.86861060e-02   1.61931744e-03]]


In [368]:
test_predictions = pipeline.predict_proba(test.text)

In [369]:
print(pipeline.classes_)

['Едгар' 'Мери' 'Хауърд']


In [370]:
submit_file = pd.DataFrame(test_predictions, columns=['EAP', 'MWS', 'HPL'], index=test.index)
submit_file.head(10)

Unnamed: 0_level_0,EAP,MWS,HPL
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.098087,0.895386,0.006527
id24541,0.885668,0.005174,0.109157
id00134,0.014677,0.015732,0.969591
id27757,0.66732,0.014242,0.318437
id04081,0.44913,0.495109,0.055761
id27337,0.894725,0.004191,0.101084
id24265,0.941998,0.000402,0.0576
id25917,0.008045,0.365584,0.626371
id04951,0.985299,0.001257,0.013444
id14549,0.939695,0.058686,0.001619


In [371]:
submit_file.to_csv("submit_SGDClassifier.csv")