In [73]:
from pprint import pprint
from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, NuSVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import TruncatedSVD, PCA

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy
from spacy.tokenizer import Tokenizer

In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [4]:
train.shape, test.shape

((2586, 3), (288, 2))

In [5]:
test.head()

Unnamed: 0,id,description
0,955,"Think carnival aromas—the good ones, anyway—me..."
1,3532,"A blend of three bourbons, between 6 and 12 ye..."
2,1390,"The nose is focused on cereal, hints of fresh ..."
3,1024,Swiss-based Chapter 7 released this 19 year ol...
4,1902,Valkyrie replaces the current Dark Origins exp...


In [6]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [7]:
train['category'].value_counts()

1    1637
2     449
3     300
4     200
Name: category, dtype: int64

# BASSline

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train['description'], train['category'], test_size=0.33, random_state=42)
X_train.shape, X_test.shape

((1732,), (854,))

In [7]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge', penalty='l2',
        alpha=1e-3, random_state=42,
        max_iter=20, tol=1e-3
    )),
])

In [8]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [9]:
predicted = text_clf.predict(X_test)

In [10]:
np.mean(predicted == y_test)

0.9285714285714286

### Tuning

In [11]:
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3)
}

In [12]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [13]:
gs_clf = gs_clf.fit(X_train[:400], y_train[:400])



In [14]:
gs_clf.best_score_

0.885

### Try randomforrest

In [15]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(
        max_depth=None, random_state=42,
        n_estimators=500
    )),
])

In [16]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [17]:
predicted = text_clf.predict(X_test)

In [18]:
np.mean(predicted == y_test)

0.8454332552693209

### LinearSVC

In [19]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

In [20]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                     

In [21]:
predicted = text_clf.predict(X_test)

In [22]:
np.mean(predicted == y_test)

0.9473067915690867

### NuSVC

In [25]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', NuSVC(nu=0.005, kernel='linear',
                 random_state=42)),
])

In [26]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 NuSVC(break_ties=False, cache_size=200, class_weight=None,
                       coef0=0.0, decision_function_shape='ovr', degree=3,
  

In [27]:
predicted = text_clf.predict(X_test)

In [28]:
np.mean(predicted == y_test)
# 0.944015444015444 - .005

0.9484777517564403

# Clean data and rerun NuSVC

In [69]:
def get_lemmas(text):

    lemmas = []
    
    doc = nlp(text)
    
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON') and (token.pos_ != 'NUM'):
            lemmas.append(token.lemma_.lower().strip())
    
    return lemmas

In [65]:
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', sublinear_tf=True)),
    ('clf', NuSVC(nu=0.005, kernel='linear',
                 random_state=42)),
])

In [66]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 NuSVC(break_ties=False, cache_size=200, class_weight=None,
                       coef0=0.0, decision_function_shape='ovr', de

In [67]:
predicted = text_clf.predict(X_test)

In [68]:
np.mean(predicted == y_test)
# 0.9536679536679536 0.005

0.9473067915690867

### get most popular words

In [70]:
tokens = X_train.apply(get_lemmas)

In [74]:
def counts(tokens):
    word_counts = Counter()

    for _, token in tokens.iteritems():
        word_counts.update(token)

    return dict(word_counts)

In [121]:
tokens_df = pd.DataFrame(tokens)

In [122]:
tokens_df['category'] = y_train

In [123]:
tokens_df.head()

Unnamed: 0,description,category
1414,"[progress, ultimate, goal, year, old, expressi...",1
1904,"[nose, mild, pleasant, sweet, rock, candy, cin...",3
1721,"[good, pittyvaich, special, release, roster, h...",1
115,"[distil, weller, wheated, bourbon, collection,...",2
1055,"[nuts, plum, dry, red, fruit, plump, berry, oa...",3


In [124]:
cat1_tokens = tokens_df[tokens_df['category'] == 1]
cat2_tokens = tokens_df[tokens_df['category'] == 2]
cat3_tokens = tokens_df[tokens_df['category'] == 3]
cat4_tokens = tokens_df[tokens_df['category'] == 4]

In [125]:
cat1_word_counts = pd.Series(counts(cat1_tokens['description'])).sort_values()
cat2_word_counts = pd.Series(counts(cat2_tokens['description'])).sort_values()
cat3_word_counts = pd.Series(counts(cat3_tokens['description'])).sort_values()
cat4_word_counts = pd.Series(counts(cat4_tokens['description'])).sort_values()
cat1_word_counts.shape, cat2_word_counts.shape, cat3_word_counts.shape, cat4_word_counts.shape

((4526,), (2022,), (1889,), (1708,))

In [126]:
word_counts_all = pd.concat([cat1_word_counts, cat2_word_counts, cat3_word_counts, cat4_word_counts], axis=1, sort=True)

In [127]:
word_counts_all = word_counts_all.fillna(0)
word_counts_all = word_counts_all.astype(int)

In [128]:
word_counts_all

Unnamed: 0,0,1,2,3
,242,35,9,25
$,13,5,0,9
+,3,0,0,0
-and,0,1,0,0
-at,1,0,0,0
-especially,1,0,0,0
-with,1,0,0,0
100ml,1,0,0,0
12-month,1,0,0,0
125th,1,0,0,0


In [129]:
word_counts_all['total'] = word_counts_all.sum(axis=1)

In [130]:
all_cols = [0,1,2,3]

In [131]:
for num in all_cols:
    word_counts_all[f'pct_total_col_{num}'] = word_counts_all.iloc[:, num] / word_counts_all['total']

In [132]:
word_counts_all.head()

Unnamed: 0,0,1,2,3,total,pct_total_col_0,pct_total_col_1,pct_total_col_2,pct_total_col_3
,242,35,9,25,311,0.778135,0.11254,0.028939,0.080386
$,13,5,0,9,27,0.481481,0.185185,0.0,0.333333
+,3,0,0,0,3,1.0,0.0,0.0,0.0
-and,0,1,0,0,1,0.0,1.0,0.0,0.0
-at,1,0,0,0,1,1.0,0.0,0.0,0.0


### what seperates categories

In [140]:
all_cols_set = set(all_cols)

for col in all_cols:
    other_cols = list(all_cols_set - set([col]))
    
    others = word_counts_all.iloc[:, other_cols]
    
    current = word_counts_all.iloc[:, col]
    
    word_counts_all[f'pct_diff_from_others_{col}'] = (current - others.sum(axis=1)) / word_counts_all['total'] 

In [239]:
word_counts_most = word_counts_all[word_counts_all['total'] > 5]
word_counts_most.shape

(1572, 13)

In [240]:
word_counts_most.sort_values(by='total', ascending=False).head(20)

Unnamed: 0,0,1,2,3,total,pct_total_col_0,pct_total_col_1,pct_total_col_2,pct_total_col_3,pct_diff_from_others_0,pct_diff_from_others_1,pct_diff_from_others_2,pct_diff_from_others_3
finish,797,248,148,68,1261,0.632038,0.196669,0.117367,0.053925,0.264076,-0.606661,-0.765266,-0.892149
note,642,169,88,74,973,0.659815,0.17369,0.090442,0.076053,0.31963,-0.652621,-0.819116,-0.847893
palate,694,97,93,55,939,0.739084,0.103301,0.099042,0.058573,0.478168,-0.793397,-0.801917,-0.882854
fruit,645,156,52,79,932,0.69206,0.167382,0.055794,0.084764,0.38412,-0.665236,-0.888412,-0.830472
oak,480,192,143,56,871,0.551091,0.220436,0.164179,0.064294,0.102181,-0.559127,-0.671642,-0.871412
sweet,451,120,165,117,853,0.528722,0.14068,0.193435,0.137163,0.057444,-0.71864,-0.61313,-0.725674
whisky,618,8,9,170,805,0.767702,0.009938,0.01118,0.21118,0.535404,-0.980124,-0.97764,-0.57764
nose,535,54,130,54,773,0.692109,0.069858,0.168176,0.069858,0.384217,-0.860285,-0.663648,-0.860285
vanilla,436,203,38,48,725,0.601379,0.28,0.052414,0.066207,0.202759,-0.44,-0.895172,-0.867586
year,463,143,37,41,684,0.676901,0.209064,0.054094,0.059942,0.353801,-0.581871,-0.891813,-0.880117


In [241]:
y_train_value_counts = y_train.value_counts(normalize=True)
y_train_value_counts

1    0.629908
2    0.176097
3    0.114319
4    0.079677
Name: category, dtype: float64

In [264]:
mask = []
for _, row in word_counts_most.iterrows():
    row_value_counts = row[[0,1,2,3]] / row['total']
    diff = sum(abs(row_value_counts.values - y_train_value_counts.values) > .1)
    if diff >= 1:
        mask.append(True)
    else:
        mask.append(False)

In [265]:
word_counts_most[mask]

Unnamed: 0,0,1,2,3,total,pct_total_col_0,pct_total_col_1,pct_total_col_2,pct_total_col_3,pct_diff_from_others_0,pct_diff_from_others_1,pct_diff_from_others_2,pct_diff_from_others_3
,242,35,9,25,311,0.778135,0.112540,0.028939,0.080386,0.556270,-0.774920,-0.942122,-0.839228
$,13,5,0,9,27,0.481481,0.185185,0.000000,0.333333,-0.037037,-0.629630,-1.000000,-0.333333
1980,5,0,0,1,6,0.833333,0.000000,0.000000,0.166667,0.666667,-1.000000,-1.000000,-0.666667
375ml,0,4,6,0,10,0.000000,0.400000,0.600000,0.000000,-1.000000,-0.200000,0.200000,-1.000000
aberfeldy,7,0,0,0,7,1.000000,0.000000,0.000000,0.000000,1.000000,-1.000000,-1.000000,-1.000000
abv,8,7,2,2,19,0.421053,0.368421,0.105263,0.105263,-0.157895,-0.263158,-0.789474,-0.789474
accent,14,0,0,1,15,0.933333,0.000000,0.000000,0.066667,0.866667,-1.000000,-1.000000,-0.866667
accentuate,13,1,0,0,14,0.928571,0.071429,0.000000,0.000000,0.857143,-0.857143,-1.000000,-1.000000
accompany,10,5,1,0,16,0.625000,0.312500,0.062500,0.000000,0.250000,-0.375000,-0.875000,-1.000000
acetone,2,0,0,4,6,0.333333,0.000000,0.000000,0.666667,-0.333333,-1.000000,-1.000000,0.333333


In [266]:
word_counts_most[mask].shape

(1363, 13)

In [268]:
good_words = word_counts_most[mask].index.tolist()

### Try with only good words

In [269]:
def get_lemmas(text):

    lemmas = []
    
    doc = nlp(text)
    
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON') and (token.pos_ != 'NUM'):
            fixed = token.lemma_.lower().strip()
            if fixed in good_words:
                lemmas.append(fixed)
    
    return lemmas

In [288]:
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=get_lemmas, sublinear_tf=True)),
    ('clf', NuSVC(nu=0.1, kernel='poly',
                 random_state=42)),
])

In [289]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function get_lemmas at 0x13cd60050>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 NuSVC(break_ties=False, cache_size=200, class_weight=None,
                       coef0=0.0, decisi

In [290]:
predicted = text_clf.predict(X_test)

In [291]:
np.mean(predicted == y_test)
# 0.8114754098360656 0.005
# 0.8266978922716628 0.01
# 0.860655737704918 0.05
# 0.9297423887587822 0.09
# 0.9262295081967213 0.05
# 0.9215456674473068 0.15

0.6639344262295082

# Do nothing else and submit

In [285]:
text_clf.fit(train['description'], train['category'])

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ter=-1, nu=0.005, probability=False,
   random_state=42, shrinking=True, tol=0.001, verbose=False))])

In [43]:
test_pred = text_clf.predict(test['description'])

In [44]:
test['category'] = test_pred

In [45]:
test[['id', 'category']].to_csv("./data/submission3.csv", index=None)

# try and figure out what i'm getting wrong

In [141]:
X_test[predicted != y_test]

43      Glenfarclas has a proven track record for agin...
565     Surprisingly lacking in oak intensity, given i...
2400    Hot oak and bubble gum in the nose, with a sli...
2072    The base bottling of the “new” I.W. Harper fro...
2377    Pale, almost peach color. Light aromas of cara...
866     Consisting of whiskies aged between 6 to 10 ye...
239     Amber color. This particular bottling (and rem...
1739    Aged 10 years in icewine barrels — a Canadian ...
233     Rich, multi-layered nose: vanilla, cornmeal, b...
1937    There’s a reason why bourbon distillers don’t ...
1462    An unabashedly spicy rye nose backed by cinnam...
2192    Deep golden in color with a pinkish cast, this...
869     Joining the core Pendleton lineup, with a new ...
1211    A blend of bourbon, rye, and peated single mal...
819     This high-end whiskey exhibits restrained oak,...
1650    Traditionally, this release has been a showcas...
353     Many distilleries have released bourbon finish...
2492    Big Bo

In [142]:
print(classification_report(y_test, predicted, target_names=["1", "2", "3", "4"]))

              precision    recall  f1-score   support

           1       0.98      1.00      0.99       325
           2       0.90      0.91      0.90        87
           3       0.88      0.86      0.87        66
           4       0.97      0.85      0.91        40

    accuracy                           0.95       518
   macro avg       0.93      0.90      0.92       518
weighted avg       0.95      0.95      0.95       518

