In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv('review_data.csv')

In [3]:
dataset.head()

Unnamed: 0,review_description,variety,wine_type
0,"Classic Chardonnay aromas of apple, pear and h...",17,Chardonnay
1,This wine is near equal parts Syrah and Merlot...,20,Red Blend
2,Barolo Conca opens with inky dark concentratio...,26,Nebbiolo
3,It's impressive what a small addition of Sauvi...,15,Bordeaux-style White Blend
4,"This ripe, sweet wine is rich and full of drie...",13,Malbec


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import re
ps = PorterStemmer()
corpus = []

for _ in range(len(dataset['review_description'])):
    review = re.sub('[^a-zA-Z]',' ',dataset['review_description'][_])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [17]:
corpus

['classic chardonnay aroma appl pear hay lead palat mark decent intens also bit sweet orang candi note run rather short finish',
 'wine near equal part syrah merlot balanc cabernet sauvignon aroma blue fruit vanilla cherri herb lead full bodi pit fruit flavor bring sens delici hard resist',
 'barolo conca open inki dark concentr sooth aroma black fruit spice cola plum prune dri lavend bud nose present ether delic aroma mouth deliv thicker flavor chocol mocha drink',
 'impress small addit sauvignon gri muscadel sauvignon millon blend turn exot wine spice lyche flavor',
 'ripe sweet wine rich full dri fresh fruit flavor spici touch spirit palat well bold black plum fruit dens tannin sweet aftertast drink',
 'strident aroma licoric herb berri mint make interest palat light fresh raspberri red currant flavor lot heft extract depth wine likabl unhind excess oak enjoy due unchalleng simplic',
 'half syrah half cabernet sauvignon new french oak wine draw note blackberri milk chocol anis broad

In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
cv = CountVectorizer(max_features = 5000, ngram_range=(1,3))
X = cv.fit_transform(corpus).toarray()

In [19]:
TfidfTransformer().fit(X)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [20]:
y = dataset.iloc[:,-2]

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.33, random_state = 0)

## Multinonimal NB

In [22]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha = 0.1)

### Model Accuracy Check

In [23]:
from sklearn import metrics
import itertools
from sklearn.metrics import accuracy_score 

In [25]:
previous_score = 0
for alpha in np.arange(0,1.1,0.1):
    sub_classifier = MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,y_train)
    y_pred = sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test,y_pred)
    if score > previous_score:
        classifier = sub_classifier
        previous_score = score
    print('Alpha: {}   Score: {}%'.format(alpha,score*100))

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.0   Score: 55.13826743930169%
Alpha: 0.1   Score: 57.9219540820069%
Alpha: 0.2   Score: 57.80092422797624%
Alpha: 0.30000000000000004   Score: 57.767916085967876%
Alpha: 0.4   Score: 57.90728379666985%
Alpha: 0.5   Score: 57.95129465268099%
Alpha: 0.6000000000000001   Score: 58.04665150737181%
Alpha: 0.7000000000000001   Score: 58.16768136140248%
Alpha: 0.8   Score: 58.33272207144429%
Alpha: 0.9   Score: 58.457419496809216%
Alpha: 1.0   Score: 58.53443849482872%


### Saving the Model

In [26]:
classifier.fit(X,y.astype(int))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
from sklearn.externals import joblib
joblib_file = 'NBclassifier_model'
joblib.dump(classifier,joblib_file)

['NBclassifier_model']