In [75]:
import numpy as np
import pandas as pd
import string
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


 ## Getting the train and test data

In [None]:
train_data = pd.read_csv("train.csv")

In [None]:
test_data = pd.read_csv('test.csv')

## Preparing the data

In [78]:
train_data.head()

Unnamed: 0,user_name,country,review_title,review_description,designation,points,price,province,region_1,region_2,winery,variety
0,,Australia,Andrew Peace 2007 Peace Family Vineyard Chardo...,"Classic Chardonnay aromas of apple, pear and h...",Peace Family Vineyard,83,10.0,Australia Other,South Eastern Australia,,Andrew Peace,Chardonnay
1,@wawinereport,US,North by Northwest 2014 Red (Columbia Valley (...,This wine is near equal parts Syrah and Merlot...,,89,15.0,Washington,Columbia Valley (WA),Columbia Valley,North by Northwest,Red Blend
2,,Italy,Renato Ratti 2007 Conca (Barolo),Barolo Conca opens with inky dark concentratio...,Conca,94,80.0,Piedmont,Barolo,,Renato Ratti,Nebbiolo
3,@vossroger,France,Domaine l'Ancienne Cure 2010 L'Abbaye White (B...,It's impressive what a small addition of Sauvi...,L'Abbaye,87,22.0,Southwest France,Bergerac Sec,,Domaine l'Ancienne Cure,Bordeaux-style White Blend
4,@vossroger,France,Château du Cèdre 2012 Le Cèdre Vintage Malbec ...,"This ripe, sweet wine is rich and full of drie...",Le Cèdre Vintage,88,33.0,France Other,Vin de Liqueur,,Château du Cèdre,Malbec


In [None]:
train_data = train_data.dropna(axis=0)
train_data = train_data.reset_index(drop=True)

In [None]:
train_data = train_data[['review_description', 'variety']]


In [None]:
X = train_data['review_description']
y = train_data['variety']

In [None]:
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(y)

In [None]:
X = X.str.lower()

In [None]:
X_mod = []
for phase_word in X:
    X_mod.append(" ".join([re.sub('[0-9\W_]', '', word) for word in phase_word.split() if not word in sw]))

In [None]:
test_data = test_data['review_description']

In [None]:
test_data = test_data.str.lower()
test_data_mod = []
for phase_word in test_data:
    test_data_mod.append(" ".join([re.sub('[0-9\W_]', '', word) for word in phase_word.split() if not word in sw]))

In [None]:
X_mod = X_mod + test_data_mod

## Text to vector using countvectorizer

In [None]:
countVectorizer = CountVectorizer()
X_mod = countVectorizer.fit_transform(X_mod).todense()

In [None]:
X_mod_train, X_mod_test = X_mod[:15986, :], X_mod[15986:, :]

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_mod_train, y, test_size=0.2) 

## Training the model

In [91]:
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=len(countVectorizer.get_feature_names())))
model.add(Dense(units=y.max()+1, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
hist = model.fit(X_train, y_train, epochs=2, validation_data=(X_val, y_val))

Epoch 1/2
Epoch 2/2


In [103]:
model.evaluate(X_train, y_train)



[0.3825018107891083, 0.9116359353065491]

91 percent training accuracy

In [92]:
scores = model.evaluate(X_val, y_val, verbose=1)
print ('The accuracy of the model is %s' % scores[1])

The accuracy of the model is 0.7163852453231812


 approx 72 percent validation accuracy

In [93]:
labelEncoder.classes_

array(['Bordeaux-style Red Blend', 'Bordeaux-style White Blend',
       'Cabernet Franc', 'Cabernet Sauvignon', 'Champagne Blend',
       'Chardonnay', 'Gamay', 'Gewürztraminer', 'Grüner Veltliner',
       'Malbec', 'Merlot', 'Nebbiolo', 'Pinot Grigio', 'Pinot Gris',
       'Pinot Noir', 'Red Blend', 'Rhône-style Red Blend', 'Riesling',
       'Rosé', 'Sangiovese', 'Sauvignon Blanc', 'Sparkling Blend',
       'Syrah', 'Tempranillo', 'White Blend', 'Zinfandel'], dtype=object)

## Predictions

In [None]:
predictions = model.predict(X_mod_test)

In [95]:
len(predictions)

20665

In [None]:
predicted_variety = []
for prediction in predictions:
  predicted_variety.append(labelEncoder.classes_[prediction.argmax()])

## Creating the csv file

In [None]:
df = pd.read_csv('test.csv')

In [None]:
df['predicted_variety'] = predicted_variety

In [None]:
df.to_csv('predictions.csv')