# Wine Review NLP Model

## Load and explore data

In [48]:
import gensim.models
import pandas as pd
import re
import gensim

# Read the wine review export into a DataFrame and preview the first rows
# to confirm that the file parsed as expected.
df = pd.read_csv(filepath_or_buffer='data/winemag-data-130k-v2.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [49]:
# Print column dtypes and non-null counts, then show how often each score occurs.
df.info()
df['points'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


88     17207
87     16933
90     15410
86     12600
89     12226
91     11359
92      9613
85      9530
93      6489
84      6480
94      3758
83      3025
82      1836
95      1535
81       692
96       523
80       397
97       229
98        77
99        33
100       19
Name: points, dtype: int64

In [50]:
# Derive a binary quality label from the score: 90 points and above is
# 'good', everything below is 'bad'.
df['label'] = df['points'].gt(89).map({True: 'good', False: 'bad'})

df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,label
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,bad
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,bad
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,bad
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,bad
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,bad


# Classification by Quality (Good / Bad)

## TF/IDF Approach

In [51]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts, targets are the good/bad labels.
X, y = df['description'], df['label']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Chain TF-IDF vectorization and a multinomial Naive Bayes classifier so both
# steps are fitted together on the training texts.
pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('model', MultinomialNB()),
    ]
)
# Pipeline.fit returns the fitted pipeline itself, so `predictor` and `pipe`
# refer to the same object.
predictor = pipe.fit(X_train, y_train)

In [53]:
from sklearn import metrics

# Run the fitted pipeline over the held-out texts once; the original cell
# computed the same predictions twice.
y_pred = pipe.predict(X_test)
# Keep the second name as an alias in case other cells refer to it.
predictions = y_pred

# Per-class precision/recall/F1 followed by the overall accuracy.
print(metrics.classification_report(y_test, y_pred))
print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

              precision    recall  f1-score   support

         bad       0.80      0.92      0.85     16201
        good       0.82      0.62      0.71      9794

    accuracy                           0.81     25995
   macro avg       0.81      0.77      0.78     25995
weighted avg       0.81      0.81      0.80     25995

Accuracy: 0.8061165608770917


The accuracy achieved with TF-IDF vectorization is around 81%, which is pretty good. Let's see whether word embeddings can achieve a better score.

## Word Embedding Approach

### Self-trained Word Embedding

In [54]:
# Matches every run of characters that are not lowercase ASCII letters.
_NON_LETTER_RUN = re.compile('[^a-z]+')


def _clean_description(text):
    """Lowercase ``text`` and replace each run of non-letter characters with one space.

    This yields the same string as the previous three-step cleaning: once every
    non-letter had been replaced by a space, the tag pattern could no longer
    match and the digit/special-character pattern only collapsed runs of spaces.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text, containing only ``a``-``z`` and single spaces. A
        leading or trailing space remains when the input starts or ends with
        a non-letter (e.g. a final full stop).
    """
    return _NON_LETTER_RUN.sub(' ', text.lower())


# TODO: find better cleaning approach!
# Iterate over the values rather than looking rows up by label, so the cell
# also works when the index is not the default 0..n-1 range.
clean_txt = [_clean_description(text) for text in df['description']]

df['clean_desc'] = clean_txt
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,label,clean_desc
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,bad,aromas include tropical fruit broom brimstone ...
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,bad,this is ripe and fruity a wine that is smooth ...
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,bad,tart and snappy the flavors of lime flesh and ...
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,bad,pineapple rind lemon pith and orange blossom s...
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,bad,much like the regular bottling from this comes...


In [104]:
# Tokenize each cleaned description on whitespace. split() without an argument
# drops the empty strings that split(' ') produced for leading/trailing
# spaces, so '' no longer ends up as a token in the vocabulary.
corpus = [desc.split() for desc in df.clean_desc]

# generate vectors from corpus
modelMinCountOne = gensim.models.Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
modelMinCountTwo = gensim.models.Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=2, workers=4)

## Evaluate Vocab Size diff based on min_count 1 or 2
print(f'With min_count=1 the Word Embedding has a vocabulary size of {len(modelMinCountOne.wv)} words.')
print(f'With min_count=2 the Word Embedding has a vocabulary size of {len(modelMinCountTwo.wv)} words.')


# let's go with min_count 1 for now. Since it has a 50% bigger vocabulary
# The backslash is escaped explicitly: '\d' is not a valid escape sequence and
# triggers a warning on recent Python versions. The resulting path is unchanged.
# NOTE(review): this is a Windows-style separator; consider os.path.join /
# pathlib together with the cell that loads the file.
modelMinCountOne.save('embeddings\\description_emb.bin')
# modelMinCountTwo.save('embeddings\description_emb.model')

With min_count=1 the Word Embedding has a vocabulary size of 30463 words.
With min_count=2 the Word Embedding has a vocabulary size of 20243 words.


In [108]:
from MeanEmbeddingVectorizer import MeanEmbeddingVectorizer

# The file is written with Word2Vec.save(), so load it back through the
# matching Word2Vec.load(). The backslash is escaped explicitly ('\d' is not a
# valid escape sequence); the resulting path is unchanged.
model = gensim.models.Word2Vec.load('embeddings\\description_emb.bin', mmap='r')
# NOTE(review): the recorded AttributeError ("'KeyedVectors' object has no
# attribute 'itervalues'") is presumably raised inside MeanEmbeddingVectorizer,
# which lives outside this notebook. It looks like it expects a Python 2 style
# dict of word -> vector; confirm and fix it there.
vectorizer = MeanEmbeddingVectorizer(model.wv)

X = df['clean_desc']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# TODO: what about cross validation?

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

# TODO: what different models can be used? Are there substantial performance differences?
# MultinomialNB rejects negative feature values, and averaged word vectors
# presumably contain them, so a Gaussian Naive Bayes model is used for the
# dense embedding features instead.
pipe = Pipeline([('wordEmbeddingVectorizer', vectorizer), ('model', GaussianNB())])
predictor = pipe.fit(X_train, y_train)

AttributeError: 'KeyedVectors' object has no attribute 'itervalues'

In [None]:
from sklearn import metrics

# Run the fitted pipeline over the held-out texts once; the original cell
# computed the same predictions twice.
y_pred = pipe.predict(X_test)
# Keep the second name as an alias in case other cells refer to it.
predictions = y_pred

# Per-class precision/recall/F1 followed by the overall accuracy.
print(metrics.classification_report(y_test, y_pred))
print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

### Using the Google News Embedding

In [None]:
# TODO: continue this later, following https://machinelearningmastery.com/develop-word-embeddings-python-gensim
# Other Pre Trained Embeddings: https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

#from gensim.models import KeyedVectors

#filename = 'GoogleNews-vectors-negative300.bin'
#model = KeyedVectors.load_word2vec_format(filename, binary=True)

# Classification by Reviewers

## Data Preparation

In [58]:
# Count reviews per taster (most frequent first) to pick the two most active reviewers.
df['taster_name'].value_counts()

Roger Voss            25514
Michael Schachner     15134
Kerin O’Keefe         10776
Virginie Boone         9537
Paul Gregutt           9532
Matt Kettmann          6332
Joe Czerwinski         5147
Sean P. Sullivan       4966
Anna Lee C. Iijima     4415
Jim Gordon             4177
Anne Krebiehl MW       3685
Lauren Buzzeo          1835
Susan Kostrzewa        1085
Mike DeSimone           514
Jeff Jenssen            491
Alexander Peartree      415
Carrie Dykes            139
Fiona Adams              27
Christina Pickard         6
Name: taster_name, dtype: int64

In [59]:
# Keep only the reviews written by the two most active tasters.
filtered_df = df[df['taster_name'].isin(['Roger Voss', 'Michael Schachner'])]

filtered_df

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,label,clean_desc
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,bad,this is ripe and fruity a wine that is smooth ...
5,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,bad,blackberry and raspberry aromas show a typical...
7,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,bad,this dry and restrained wine offers spice in p...
9,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,bad,this has great depth of flavor with its fresh ...
11,11,France,"This is a dry wine, very spicy, with a tight, ...",,87,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Leon Beyer 2012 Gewurztraminer (Alsace),Gewürztraminer,Leon Beyer,bad,this is a dry wine very spicy with a tight tau...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129964,129964,France,"Initially quite muted, this wine slowly develo...",Domaine Saint-Rémy Herrenweg,90,,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Ehrhart 2013 Domaine Saint-Rémy Herren...,Gewürztraminer,Domaine Ehrhart,good,initially quite muted this wine slowly develop...
129965,129965,France,"While it's rich, this beautiful dry wine also ...",Seppi Landmann Vallée Noble,90,28.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Rieflé-Landmann 2013 Seppi Landmann Va...,Pinot Gris,Domaine Rieflé-Landmann,good,while it s rich this beautiful dry wine also o...
129968,129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser,good,well drained gravel soil gives this wine its c...
129969,129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss,good,a dry style of pinot gris this is crisp with s...


## TF/IDF Approach