# Mental Health NLP Model

## Load and explore data

In [4]:
import gensim.models
import numpy as np
import pandas as pd
import re
import gensim

df = pd.read_csv('data/winemag-data-130k-v2.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [5]:
df.info()
df.points.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


88     17207
87     16933
90     15410
86     12600
89     12226
91     11359
92      9613
85      9530
93      6489
84      6480
94      3758
83      3025
82      1836
95      1535
81       692
96       523
80       397
97       229
98        77
99        33
100       19
Name: points, dtype: int64

In [6]:
# split points into binary label (80-89 = bad, 90-99 = good)
df['label'] = df['points'].apply(lambda x: 'good' if x > 89 else 'bad')

df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,label
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,bad
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,bad
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,bad
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,bad
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,bad


# Classification by Quality (Good / Bad)

## TF/IDF Approach

In [7]:
from sklearn.model_selection import train_test_split

X = df['description']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([('tfidf', TfidfVectorizer()), ('model', MultinomialNB())])
predictor = pipe.fit(X_train, y_train)

In [9]:
from sklearn import metrics

predictions = pipe.predict(X_test)

y_pred = pipe.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

              precision    recall  f1-score   support

         bad       0.80      0.92      0.85     16201
        good       0.82      0.62      0.71      9794

    accuracy                           0.81     25995
   macro avg       0.81      0.77      0.78     25995
weighted avg       0.81      0.81      0.80     25995

Accuracy: 0.8061165608770917


The achieved Accuracy with the TF/IDF Vectorization is around 81% which is pretty good. But let's see if Word Embeddings can help to achieve a better score.

## Word Embedding Approach

### Self created Word Embedding

In [10]:
clean_txt = []

# TODO: find a better cleaning approach!
for w in range(len(df.description)):
    # make text lower case
    desc = df['description'][w].lower()

    #remove punctuation
    desc = re.sub('[^a-zA-Z]', ' ', desc)

    #remove tags
    desc = re.sub('&lt;/?.*?&gt;', ' &lt;&gt; ', desc)

    #remove digits and special chars
    desc = re.sub('(\\d|\\W)+', ' ', desc)
    clean_txt.append(desc)

df['clean_desc'] = clean_txt
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,label,clean_desc
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,bad,aromas include tropical fruit broom brimstone ...
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,bad,this is ripe and fruity a wine that is smooth ...
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,bad,tart and snappy the flavors of lime flesh and ...
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,bad,pineapple rind lemon pith and orange blossom s...
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,bad,much like the regular bottling from this comes...


In [49]:
corpus = []
for col in df.clean_desc:
    word_list = col.split(' ')
    corpus.append(word_list)

In [51]:
model = gensim.models.Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
print(f'The word embedding has a vocabulary size of {len(model.wv)} words.')

model.save('embeddings\description_emb.bin')

The word embedding has a vocabulary size of 30463 words.


In [45]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

X = df['clean_desc']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def sentence_to_vector(sentence, word_embedding):
    vector = []
    for word in sentence.split():
        if word in word_embedding.wv:
            vector.append(word_embedding.wv[word])
    return np.mean(vector, axis=0)

def train_bernoulli(word_embedding_path):
    word_embedding = gensim.models.KeyedVectors.load(word_embedding_path)

    train_vectors = [sentence_to_vector(sentence, word_embedding) for sentence in X_train]
    test_vectors = [sentence_to_vector(sentence, word_embedding) for sentence in X_test]

    nb_model = BernoulliNB()
    nb_model.fit(train_vectors, y_train)

    y_pred = nb_model.predict(test_vectors)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')
    return nb_model

In [48]:
model_min1_vs100 = train_bernoulli('embeddings\description_emb.bin')

Accuracy: 0.6700134641277169


So we have an accuracy of around 67%. This is worse than the tfidf vectorizer. We have multiple approaches to fix this. We could try to finetune the parameters of our word embedding, we could use other prebuilt word embeddings or we could also use other models than the BernoulliNB. Since the word embedding that we just built was rather simple we should start by improving on it first.

In [52]:
# Let's create word embeddings with different parameters

modelMinCountTwo100 = gensim.models.Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=2, workers=4)
modelMinCountTwo100.save('embeddings\description_emb_min2_vs100.bin')

modelMinCountThree100 = gensim.models.Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=3, workers=4)
modelMinCountThree100.save('embeddings\description_emb_min3_vs100.bin')

modelMinCountOne300 = gensim.models.Word2Vec(sentences=corpus, vector_size=300, window=5, min_count=1, workers=4)
modelMinCountOne300.save('embeddings\description_emb_min1_vs300.bin')

modelMinCountTwo300 = gensim.models.Word2Vec(sentences=corpus, vector_size=300, window=5, min_count=2, workers=4)
modelMinCountTwo300.save('embeddings\description_emb_min2_vs300.bin')

modelMinCountThree300 = gensim.models.Word2Vec(sentences=corpus, vector_size=300, window=5, min_count=3, workers=4)
modelMinCountThree300.save('embeddings\description_emb_min3_vs300.bin')

In [53]:
model_min2_vs100 = train_bernoulli('embeddings\description_emb_min2_vs100.bin')

Accuracy: 0.6835545297172533


In [54]:
model_min3_vs100 = train_bernoulli('embeddings\description_emb_min3_vs100.bin')

Accuracy: 0.6814772071552222


In [55]:
model_min1_vs300 = train_bernoulli('embeddings\description_emb_min1_vs300.bin')

Accuracy: 0.6795537603385267


In [56]:
model_min2_vs300 = train_bernoulli('embeddings\description_emb_min2_vs300.bin')

Accuracy: 0.6846316599346028


In [57]:
model_min3_vs300 = train_bernoulli('embeddings\description_emb_min3_vs300.bin')

Accuracy: 0.6825158684362377


The best performing word embedding seems to be the one with min_count=2 and vector_size=300. It's accuracy is around 68.5% which still is way worse than the tfidf vectorizer.
It seems like we cannot improve this any further with our own word embedding. Let's test some prebuilt word embeddings then.

### Using the Google News Embedding

In [17]:
# TODO: Continue that later https://machinelearningmastery.com/develop-word-embeddings-python-gensim
# Other Pre Trained Embeddings: https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

#from gensim.models import KeyedVectors

#filename = 'GoogleNews-vectors-negative300.bin'
#model = KeyedVectors.load_word2vec_format(filename, binary=True)

# Classification by Reviewers

## Data Preparation

In [18]:
# find top 2 reviewers
df.taster_name.value_counts()

Roger Voss            25514
Michael Schachner     15134
Kerin O’Keefe         10776
Virginie Boone         9537
Paul Gregutt           9532
Matt Kettmann          6332
Joe Czerwinski         5147
Sean P. Sullivan       4966
Anna Lee C. Iijima     4415
Jim Gordon             4177
Anne Krebiehl MW       3685
Lauren Buzzeo          1835
Susan Kostrzewa        1085
Mike DeSimone           514
Jeff Jenssen            491
Alexander Peartree      415
Carrie Dykes            139
Fiona Adams              27
Christina Pickard         6
Name: taster_name, dtype: int64

In [19]:
filtered_df = df.query('taster_name == "Roger Voss" | taster_name == "Michael Schachner"')

filtered_df['taster_name']

1                Roger Voss
5         Michael Schachner
7                Roger Voss
9                Roger Voss
11               Roger Voss
                ...        
129964           Roger Voss
129965           Roger Voss
129968           Roger Voss
129969           Roger Voss
129970           Roger Voss
Name: taster_name, Length: 40648, dtype: object

## TF/IDF Approach