In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 300
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

### Importing the data set from the Exploratory Analysis

In [2]:
#Import the cleaned dataset from the exploration analysis.
df = pd.read_csv('/Users/rsylvetsky/Desktop/df_cln.csv')
#When you use read_csv the cln_words Series is read as a string
df.dtypes

Unnamed: 0     int64
variety       object
cln_words     object
dtype: object

In [3]:
#Quick check to see how many observations I'm using
len(df)


108644

### Creating a new feature from Descriptions with stemmed words

#### In order to combine similar words like "fruit" and "fruity," I'm incorporating stemming into my dataset


In [4]:
stemmer = SnowballStemmer("english")
df['stemmed'] = df.cln_words.map(lambda x: [stemmer.stem(y) for y in eval(x)])


In [5]:
df.head(1)
#Quick peek at the stemmed descriptions

Unnamed: 0.1,Unnamed: 0,variety,cln_words,stemmed
0,0,white blend,"['aromas', 'include', 'tropical', 'fruit', 'broom', 'brimstone', 'dried', 'herb', 'palate', 'overly', 'expressive', 'offering', 'unripened', 'apple', 'citrus', 'dried', 'sage', 'alongside', 'brisk', 'acidity', '']","[aroma, includ, tropic, fruit, broom, brimston, dri, herb, palat, over, express, offer, unripen, appl, citrus, dri, sage, alongsid, brisk, acid, ]"


In [6]:
df['stemstr'] = df.stemmed.apply(' '.join)
#Now we have a string of stemmed descriptions per observation to fit the model.

In [7]:

df.head(1)
#Quick look at the dataset we're not working with 

Unnamed: 0.1,Unnamed: 0,variety,cln_words,stemmed,stemstr
0,0,white blend,"['aromas', 'include', 'tropical', 'fruit', 'broom', 'brimstone', 'dried', 'herb', 'palate', 'overly', 'expressive', 'offering', 'unripened', 'apple', 'citrus', 'dried', 'sage', 'alongside', 'brisk', 'acidity', '']","[aroma, includ, tropic, fruit, broom, brimston, dri, herb, palat, over, express, offer, unripen, appl, citrus, dri, sage, alongsid, brisk, acid, ]",aroma includ tropic fruit broom brimston dri herb palat over express offer unripen appl citrus dri sage alongsid brisk acid


### Splitting up the data so we can train and test on separate data to avoid overfitting.


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['stemstr'], df['variety'], test_size=0.2)

X_train.shape
X_test.shape



(21729,)

With my first model, I'm going to train my models on both CountVectorized and TfIdf Vectorized descriptions. Since the descriptions are so short, maybe TfIdf isn't really necessary. It could help weight more frequent keywords (eg 'taste') accordingly which could be interesting.

In [9]:
tfidfv = TfidfVectorizer()
cv = CountVectorizer()

In [10]:
X_train_idf = tfidfv.fit_transform(X_train)
X2_train_cv = cv.fit_transform(X_train)
X_train_idf.shape, X2_train_cv.shape

((86915, 19374), (86915, 19374))

In [11]:
y_train.shape

(86915,)

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score

model_MNB_idf = MultinomialNB()
model_MNB_cv = MultinomialNB()


model_MNB_idf.fit(X_train_idf,y_train)
model_MNB_cv.fit(X2_train_cv,y_train)




MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
MNB_idf_scores = cross_val_score(model_MNB_idf, X_train_idf,y_train, cv=5)

#MNB_idf_scores
MNB_idf_scores.mean()
#Cross Val Accuracy score with Training Data using TFIDF

0.38603549768088607

In [14]:
MNB_cv_scores = cross_val_score(model_MNB_cv, X2_train_cv, y_train, cv=5)
#MNB_cv_scores
MNB_cv_scores.mean()
#Cross Val Accuracy score with Training Data using Count Vectorization

0.50631214940424663

In [15]:
X2_test_cv = cv.transform(X_test)
#Transforming the Test data sets 

In [16]:
model_MNB_cv.score(X2_test_cv,y_test)
#Accuracy score on Test data set for CV. 

0.51659073128077682

The first iteration using Multinomial Naive Bayes was pretty interesting. I learned that Count Vectorization performs better as a way to vectorize my description data.


### Exploring the coefficients from the Count Vectorized Multinominal Naive Bayes model.

Similar to the analysis we did with News Stories, I'm curious to see which words per wine have the largest coefficients and therefore contribute the most weight to the wine variety's classification. Perhaps the words with the smallest coefficients should be stripped out or cleaned.

In [17]:
y_names = y_train.unique()
coef = pd.DataFrame(model_MNB_cv.coef_, columns=cv.get_feature_names(),index=y_names).T

In [19]:
top = 4
for wine_variety in coef:
    s = coef[[wine_variety]].sort_values(wine_variety)
    print "%-20s  (+) %s" % (wine_variety, ", ".join(s.iloc[-top:].index))
    print "%-20s  (-) %s" % ("", ", ".join(s.iloc[:top].index))
    

chardonnay
chardonnay            (+) wine, tannin, aroma, black
                      (-) 000, pepella, pepe, pep
pinot noir
pinot noir            (+) appl, finish, aroma, flavor
                      (-) zy, overabund, overachiev, weinviertel
sangiovese
sangiovese            (+) fruit, cherri, aroma, wine
                      (-) 000, pea, pdx, pdi
tempranillo blend
tempranillo blend     (+) cherri, fresh, wine, fruit
                      (-) 000, perdiz, perdig, percol
tempranillo
tempranillo           (+) flavor, tannin, fruit, wine
                      (-) zy, moncayo, monchiero, moncontour
rosé


UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)

I ran into an encoding issue with the accents on my wine. I unfortunately wasn't able to get this fixed but I still think the words for each wine variety I can see above align with what I might expect for a tempranillo or pinot noir. I realized that even after removing wine names & stop words, I still need to remove more wine-domain type vocabulary that I think may be hurting my prediction accuracy score. I'm going to run through this process another time with an improved description set. 

In [None]:
#uploading a secondary df file with some add'l words stripped out already.
df2 = pd.read_csv('/Users/rsylvetsky/Desktop/df_cln2.csv')
df2['stemmed'] = df2.cln_words.map(lambda x: [stemmer.stem(y) for y in eval(x)])
df2['stemstr'] = df2.stemmed.apply(' '.join)

In [21]:

bad_lst = ['flavor', 'learn','000']
for (x,y) in df2.stemstr.iteritems():
    y = y.split()
    for word in y:
        if word in bad_lst:
            word = word.replace(word,'')
        else: 
            pass

In [22]:



X_train2, X_test2, y_train2, y_test2 = train_test_split(df2['stemstr'], df2['variety'], test_size=0.2)

X_test_cv2 = cv.transform(X_test2)

X_train_cv2 = cv.transform(X_train2)

model_MNB_cv2 = MultinomialNB()

model_MNB_cv2.fit(X_train_cv2,y_train2)

MNB_cv_scores2 = cross_val_score(model_MNB_cv2, X_train_cv2,y_train2, cv=5, scoring ='accuracy')
MNB_cv_scores2.mean()

0.50253880428063158

In [23]:
top = 4
coef = pd.DataFrame(model_MNB_cv2.coef_, columns=cv.get_feature_names(),index=y_names).T
for wine_variety in coef:
    s = coef[[wine_variety]].sort_values(wine_variety)
    print "%-20s  (+) %s" % (wine_variety, ", ".join(s.iloc[-top:].index))
    print "%-20s  (-) %s" % ("", ", ".join(s.iloc[:top].index))

chardonnay            (+) spice, cherri, tannin, black
                      (-) 000, pepperd, pepperbridg, pepin
pinot noir            (+) acid, fruit, appl, finish
                      (-) zy, overarch, overblown, overburden
sangiovese            (+) acid, black, fruit, cherri
                      (-) 000, peac, pea, pdx
tempranillo blend     (+) cherri, tannin, fresh, fruit
                      (-) 000, pereni, peregrin, pere
tempranillo           (+) drink, black, tannin, fruit
                      (-) zy, treborc, meeker, trebbiano


UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)

There's definitely still some improvements I want to make to the original text. For now, I'm going to continue experimenting with models and parameter optimization. If there's still time, I'll try to improve the keywords again.

### Gaussian Naive Bayes model

While Gaussian is better for 

In [24]:

from sklearn.naive_bayes import GaussianNB

model_GNB = GaussianNB()
#model_GNB.fit(X_train_cv2.toarray(),y_train2)

cross_val_score(model_GNB, X_train_cv2.toarray(),y_train2, cv=5)


array([ 0.19991957,  0.20294354,  0.2038539 ,  0.20389132,  0.20706343])

### Support Vector Model 

In [141]:
from sklearn import svm

#model_svm = svm.SVC()
#model_svm.fit(X_train_cv2,y_train2)
#smv_cv_scores = cross_val_score(model_svm, X_train_cv2,y_train2, cv=5)
#smv_cv_scores.mean()

While I had read that Support Vector Machines work well with text features, the SVM took way too long to compute. I wasn't able to get an accuracy score in the timeframe I needed and unfortunately had to move on.

### Playing around with a Random Forest Classifier to see what the accuracy look like

In [80]:
from sklearn.ensemble import RandomForestClassifier


model_rfc = RandomForestClassifier()
model_rfc.fit(X_train_cv2,y_train2)
RFC_cv_scores = cross_val_score(model_rfc, X_train_cv2,y_train2, cv=5)
RFC_cv_scores.mean()

array([ 0.42196565,  0.42159117,  0.42173388,  0.42237062,  0.4218714 ])

### Also wanted to try a Gradient Boosting Classifier to see what the accuracy look like


This model unfortunately didn't finish in time for me to evaluate its performance.

In [25]:
from sklearn.ensemble import GradientBoostingClassifier


model_GBC = GradientBoostingClassifier(n_estimators=1000)
model_GBC.fit(X_train_cv2,y_train2)
GBD_cv_scores = cross_val_score(model_GBC, X_train_cv2,y_train2, cv=5)
GBD_cv_scores.mean()

KeyboardInterrupt: 

### I'm going to try this Stochastic Gradient Classifier (an optimized Logistic Regression), which I read on another kaggle exercise may be successful

In [175]:
from sklearn.linear_model import SGDClassifier

model_SGD = SGDClassifier()
model_SGD.fit(X_train_cv2,y_train2)
SGD_cv_scores = cross_val_score(model_SGD, X_train_cv2,y_train2, cv=5, scoring='accuracy')
SGD_cv_scores.mean()

0.5600197855980934

This is the highest accuracy score yet. Still not great but I'm going to explore the hyper parameter optimization to see how much I can improve it. 

### Hyper Parameter Optimization using GridSearchCV & SGDClassifer

The following parameter optimization produced the best accuracy score. The below code was from an sklearn documentation page example.

In [39]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


# #############################################################################
# Load some categories from the training set
#]
# Uncomment the following to do the analysis on all the categories
categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = X_train2
print("%d documents" % len(X_train2))
print("%d categories" % len(y_train2))
print()

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.50,0.75,1),
    #'vect__min_df': (0.10,0.20,0.30),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50,80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X_train2, y_train2
)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
Loading 20 newsgroups dataset for categories:
None
86915 documents
86915 categories

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 20.7min finished


done in 1265.151s

Best score: 0.596
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)


Okay okay, this is better.. One thing I'm not sure about here is that the above optimization script currently seems to output 86,915 categories. Does this mean that it's not recognizing each 'malbec' as part of the general 'malbec' category. I'm also a little unsure how to combine the CV & TfIdf vectorization techniques.

### Fitting the final optimized model and testing the accuracy on my test data 

After the gridsearch paramter optimizations from both a CV perspective and SGD Classifier perspective, I fit my CV & SGD Classifer.

In [49]:
cv_opt = CountVectorizer(ngram_range =(1,2),max_df = 0.75,)
cv_opt_fit = cv_opt.fit(X_train2)
X_trained = cv_opt_fit.transform(raw_documents =X_train2)
#X_trained = cv_opt.transform(X_train2)
model_SGD_optimized = SGDClassifier(alpha=0.00001, penalty = 'l2')
model_SGD_optimized.fit(X_trained,y_train2)





SGDClassifier(alpha=1e-05, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

After optimizing with the preferred parameters, this is performing a bit worse  compared to the non-optimized SGD & the optimized SGD during hyper parameter optimization. 

In [50]:
X_test3 = cv_opt_fit.transform(X_test2)

model_SGD_optimized.score(X_test3,y_test2)
#Accuracy score on the Test data

0.55828616135118969