In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection, metrics
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import lime
from lime import lime_text
stop = stopwords.words('english')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [2]:
all_wines = pd.read_csv('Data/winemag-data-130k-v2.csv')
all_wines.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
clean_wines = all_wines.drop(columns=['Unnamed: 0', 'taster_twitter_handle', 'region_2', 'price', 'province',
                                     'region_1', 'taster_name', 'title', 'variety', 'winery', 'country', 'designation'])
clean_wines = clean_wines.dropna()
clean_wines

Unnamed: 0,description,points
0,"Aromas include tropical fruit, broom, brimston...",87
1,"This is ripe and fruity, a wine that is smooth...",87
2,"Tart and snappy, the flavors of lime flesh and...",87
3,"Pineapple rind, lemon pith and orange blossom ...",87
4,"Much like the regular bottling from 2012, this...",87
...,...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...,90
129967,Citation is given as much as a decade of bottl...,90
129968,Well-drained gravel soil gives this wine its c...,90
129969,"A dry style of Pinot Gris, this is crisp with ...",90


In [4]:
clean_wines = clean_wines.drop_duplicates('description')
clean_wines 

Unnamed: 0,description,points
0,"Aromas include tropical fruit, broom, brimston...",87
1,"This is ripe and fruity, a wine that is smooth...",87
2,"Tart and snappy, the flavors of lime flesh and...",87
3,"Pineapple rind, lemon pith and orange blossom ...",87
4,"Much like the regular bottling from 2012, this...",87
...,...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...,90
129967,Citation is given as much as a decade of bottl...,90
129968,Well-drained gravel soil gives this wine its c...,90
129969,"A dry style of Pinot Gris, this is crisp with ...",90


In [5]:
clean_wines['grade'] = clean_wines['points'].apply(lambda x: 'A' if x >= 91 else
                                                             'B' if x >= 88 else
                                                             'C' if x >= 86 else
                                                             'D' if x >= 83 else 'F')
clean_wines

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,description,points,grade
0,"Aromas include tropical fruit, broom, brimston...",87,C
1,"This is ripe and fruity, a wine that is smooth...",87,C
2,"Tart and snappy, the flavors of lime flesh and...",87,C
3,"Pineapple rind, lemon pith and orange blossom ...",87,C
4,"Much like the regular bottling from 2012, this...",87,C
...,...,...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...,90,B
129967,Citation is given as much as a decade of bottl...,90,B
129968,Well-drained gravel soil gives this wine its c...,90,B
129969,"A dry style of Pinot Gris, this is crisp with ...",90,B


In [6]:
clean_wines['good_bad'] = clean_wines['points'].apply(lambda x: 'Good' if x >= 88 else
                                                     'Poor')
clean_wines

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,description,points,grade,good_bad
0,"Aromas include tropical fruit, broom, brimston...",87,C,Poor
1,"This is ripe and fruity, a wine that is smooth...",87,C,Poor
2,"Tart and snappy, the flavors of lime flesh and...",87,C,Poor
3,"Pineapple rind, lemon pith and orange blossom ...",87,C,Poor
4,"Much like the regular bottling from 2012, this...",87,C,Poor
...,...,...,...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...,90,B,Good
129967,Citation is given as much as a decade of bottl...,90,B,Good
129968,Well-drained gravel soil gives this wine its c...,90,B,Good
129969,"A dry style of Pinot Gris, this is crisp with ...",90,B,Good


In [7]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [8]:
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [9]:
clean_wines["description_clean"] = clean_wines["description"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))
clean_wines.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,description,points,grade,good_bad,description_clean
0,"Aromas include tropical fruit, broom, brimston...",87,C,Poor,aroma include tropical fruit broom brimstone d...
1,"This is ripe and fruity, a wine that is smooth...",87,C,Poor,ripe fruity wine smooth still structured firm ...
2,"Tart and snappy, the flavors of lime flesh and...",87,C,Poor,tart snappy flavor lime flesh rind dominate gr...
3,"Pineapple rind, lemon pith and orange blossom ...",87,C,Poor,pineapple rind lemon pith orange blossom start...
4,"Much like the regular bottling from 2012, this...",87,C,Poor,much like regular bottling 2012 come across ra...


In [10]:
list_of_texts = clean_wines['description_clean']
def no_number_preprocessor(tokens):
    r = re.sub('(\d)+', '', tokens.lower())
    # This alternative just removes numbers:
    # r = re.sub('(\d)+', '', tokens.lower())
    return r
no_num_txts = []
for t in list_of_texts:
    no_num_t = no_number_preprocessor(t)
    no_num_txts.append(no_num_t)

clean_wines['description_cleaner'] = no_num_txts
clean_wines

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,description,points,grade,good_bad,description_clean,description_cleaner
0,"Aromas include tropical fruit, broom, brimston...",87,C,Poor,aroma include tropical fruit broom brimstone d...,aroma include tropical fruit broom brimstone d...
1,"This is ripe and fruity, a wine that is smooth...",87,C,Poor,ripe fruity wine smooth still structured firm ...,ripe fruity wine smooth still structured firm ...
2,"Tart and snappy, the flavors of lime flesh and...",87,C,Poor,tart snappy flavor lime flesh rind dominate gr...,tart snappy flavor lime flesh rind dominate gr...
3,"Pineapple rind, lemon pith and orange blossom ...",87,C,Poor,pineapple rind lemon pith orange blossom start...,pineapple rind lemon pith orange blossom start...
4,"Much like the regular bottling from 2012, this...",87,C,Poor,much like regular bottling 2012 come across ra...,much like regular bottling come across rather...
...,...,...,...,...,...,...
129966,Notes of honeysuckle and cantaloupe sweeten th...,90,B,Good,note honeysuckle cantaloupe sweeten deliciousl...,note honeysuckle cantaloupe sweeten deliciousl...
129967,Citation is given as much as a decade of bottl...,90,B,Good,citation given much decade bottle age prior re...,citation given much decade bottle age prior re...
129968,Well-drained gravel soil gives this wine its c...,90,B,Good,welldrained gravel soil give wine crisp dry ch...,welldrained gravel soil give wine crisp dry ch...
129969,"A dry style of Pinot Gris, this is crisp with ...",90,B,Good,dry style pinot gris crisp acidity also weight...,dry style pinot gris crisp acidity also weight...


In [None]:
list_of_descriptions = clean_wines['description_cleaner']
adjs = []
for t in list_of_descriptions:
    is_adj = lambda pos: pos[:2] == 'JJ'
    tokenized = nltk.word_tokenize(t)
    adj = [word for (word, pos) in nltk.pos_tag(tokenized) if is_adj(pos)]
    adjs.append(adj)
clean_wines['adjs'] = adjs

In [None]:
clean_wines

In [None]:
clean_wines.to_csv('Data/clean_wines_text.csv')

In [None]:
adj_list = []
adjs = clean_wines['adjs']
for t in adjs:
    arr = np.array(clean_wines['adjs'])
    adj_list.append(arr)
clean_wines['adjectives'] = adj_list
clean_wines

In [None]:
clean_wines.to_csv('Data/clean_wines_text.csv')

In [None]:
clean_wines = pd.read_csv('Data/clean_wines_text.csv')
clean_wines

In [12]:
dtf = clean_wines[['grade', 'description_clean', 'description_cleaner', 'good_bad']]
dtf

Unnamed: 0,grade,description_clean,description_cleaner,good_bad
0,C,aroma include tropical fruit broom brimstone d...,aroma include tropical fruit broom brimstone d...,Poor
1,C,ripe fruity wine smooth still structured firm ...,ripe fruity wine smooth still structured firm ...,Poor
2,C,tart snappy flavor lime flesh rind dominate gr...,tart snappy flavor lime flesh rind dominate gr...,Poor
3,C,pineapple rind lemon pith orange blossom start...,pineapple rind lemon pith orange blossom start...,Poor
4,C,much like regular bottling 2012 come across ra...,much like regular bottling come across rather...,Poor
...,...,...,...,...
129966,B,note honeysuckle cantaloupe sweeten deliciousl...,note honeysuckle cantaloupe sweeten deliciousl...,Good
129967,B,citation given much decade bottle age prior re...,citation given much decade bottle age prior re...,Good
129968,B,welldrained gravel soil give wine crisp dry ch...,welldrained gravel soil give wine crisp dry ch...,Good
129969,B,dry style pinot gris crisp acidity also weight...,dry style pinot gris crisp acidity also weight...,Good


In [13]:
dtf_train, dtf_test = model_selection.train_test_split(dtf, test_size=0.3)
## get target
y_train = dtf_train["good_bad"].values
y_test = dtf_test["good_bad"].values

In [14]:
vectorizer = feature_extraction.text.CountVectorizer(max_features=10000, ngram_range=(1,2))

In [None]:
corpus = dtf_train["description_cleaner"]
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [None]:
#sns.heatmap(X_train.todense()[:,np.random.randint(0,X_train.shape[1],100)]==0, vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample')

In [None]:
y = dtf_train["good_bad"]
X_names = vectorizer.get_feature_names()
p_value_limit = 0.95
dtf_features = pd.DataFrame()
for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    dtf_features = dtf_features.append(pd.DataFrame(
                   {"feature":X_names, "score":1-p, "good_bad":cat}))
    dtf_features = dtf_features.sort_values(["g00","score"], 
                    ascending=[True,False])
    dtf_features = dtf_features[dtf_features["score"]>p_value_limit]
X_names = dtf_features["feature"].unique().tolist()

In [None]:
for cat in np.unique(y):
   print("# {}:".format(cat))
   print("  . selected features:",
         len(dtf_features[dtf_features["grade"]==cat]))
   print("  . top features:", ",".join(
dtf_features[dtf_features["grade"]==cat]["feature"].values[:30]))
   print(" ")

In [None]:
vectorizer = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [None]:
classifier = naive_bayes.MultinomialNB()

In [None]:
## pipeline
model = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifier)])
## train classifier
model["classifier"].fit(X_train, y_train)
## test
X_test = dtf_test["description_cleaner"].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [None]:
classes = np.unique(y_test)
y_test_array = pd.get_dummies(y_test, drop_first=False).values
y_test_array

In [None]:
accuracy = metrics.accuracy_score(y_test, predicted)
auc = metrics.roc_auc_score(y_test_array, predicted_prob, 
                            multi_class="one_vs_rest")
print("Accuracy:",  round(accuracy,2))
print("Auc:", round(auc,2))
print("Detail:")
print(metrics.classification_report(y_test, predicted))

In [None]:
cm = metrics.confusion_matrix(y_test, predicted)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, 
            cbar=False)
ax.set(xlabel="Pred", ylabel="True", xticklabels=classes, 
       yticklabels=classes, title="Confusion matrix")
plt.yticks(rotation=0)


In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2)
for i in range(len(classes)):
    fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i],  
                           predicted_prob[:,i])
    ax[0].plot(fpr, tpr, lw=3, 
              label='{0} (area={1:0.2f})'.format(classes[i], 
                              metrics.auc(fpr, tpr))
               )
ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05], 
          xlabel='False Positive Rate', 
          ylabel="True Positive Rate (Recall)", 
          title="Receiver operating characteristic")
ax[0].legend(loc="lower right")
ax[0].grid(True)

for i in range(len(classes)):
    precision, recall, thresholds = metrics.precision_recall_curve(
                 y_test_array[:,i], predicted_prob[:,i])
    ax[1].plot(recall, precision, lw=3, 
               label='{0} (area={1:0.2f})'.format(classes[i], 
                                  metrics.auc(recall, precision))
              )
ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall', 
          ylabel="Precision", title="Precision-Recall curve")
ax[1].legend(loc="best")
ax[1].grid(True)
plt.show()

In [None]:
## select observation
i = 17
txt_instance = dtf_test["description_cleaner"].iloc[i]
## check true value and predicted value
print("True:", y_test[i], "--> Pred:", predicted[i], "| Prob:", round(np.max(predicted_prob[i]),2))
## show explanation
explainer = lime_text.LimeTextExplainer(class_names=
             np.unique(y_train))
explained = explainer.explain_instance(txt_instance, 
             model.predict_proba, num_features=5)
explained.show_in_notebook(text=txt_instance, predict_proba=False)

In [None]:
d = {'prediction' : predicted, 
     'actual' : y_test}

In [None]:
text_predictions_df = pd.DataFrame(data=d)
text_predictions_df

In [None]:
text_predictions_df.to_csv('Data/text_predictions_binary.csv')