## The following notebook consists of a predictive model for predicting the wine "variety" after analyzing the review description

### Importing all the required libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import MultinomialNB

### Reading the train and test dataset

In [2]:
df = pd.read_csv('train.csv')

In [32]:
df_test = pd.read_csv('test.csv')

In [3]:
#Examining the input data dimensions
df.shape

(82657, 12)

In [33]:
#examining the test data shape
df_test.shape

(20665, 11)

In [5]:
df_test.head(5)

Unnamed: 0,user_name,country,review_title,review_description,designation,points,price,province,region_1,region_2,winery
0,@paulgwine,US,Boedecker Cellars 2011 Athena Pinot Noir (Will...,Nicely differentiated from the companion Stewa...,Athena,88,35.0,Oregon,Willamette Valley,Willamette Valley,Boedecker Cellars
1,@wineschach,Argentina,Mendoza Vineyards 2012 Gran Reserva by Richard...,"Charred, smoky, herbal aromas of blackberry tr...",Gran Reserva by Richard Bonvin,90,60.0,Mendoza Province,Mendoza,,Mendoza Vineyards
2,@vboone,US,Prime 2013 Chardonnay (Coombsville),"Slightly sour and funky in earth, this is a re...",,87,38.0,California,Coombsville,Napa,Prime
3,@wineschach,Argentina,Bodega Cuarto Dominio 2012 Chento Vineyard Sel...,"This concentrated, midnight-black Malbec deliv...",Chento Vineyard Selection,91,20.0,Mendoza Province,Mendoza,,Bodega Cuarto Dominio
4,@kerinokeefe,Italy,SassodiSole 2012 Brunello di Montalcino,"Earthy aromas suggesting grilled porcini, leat...",,90,49.0,Tuscany,Brunello di Montalcino,,SassodiSole


In [6]:
df.head(5)

Unnamed: 0,user_name,country,review_title,review_description,designation,points,price,province,region_1,region_2,winery,variety
0,,Australia,Andrew Peace 2007 Peace Family Vineyard Chardo...,"Classic Chardonnay aromas of apple, pear and h...",Peace Family Vineyard,83,10.0,Australia Other,South Eastern Australia,,Andrew Peace,Chardonnay
1,@wawinereport,US,North by Northwest 2014 Red (Columbia Valley (...,This wine is near equal parts Syrah and Merlot...,,89,15.0,Washington,Columbia Valley (WA),Columbia Valley,North by Northwest,Red Blend
2,,Italy,Renato Ratti 2007 Conca (Barolo),Barolo Conca opens with inky dark concentratio...,Conca,94,80.0,Piedmont,Barolo,,Renato Ratti,Nebbiolo
3,@vossroger,France,Domaine l'Ancienne Cure 2010 L'Abbaye White (B...,It's impressive what a small addition of Sauvi...,L'Abbaye,87,22.0,Southwest France,Bergerac Sec,,Domaine l'Ancienne Cure,Bordeaux-style White Blend
4,@vossroger,France,Château du Cèdre 2012 Le Cèdre Vintage Malbec ...,"This ripe, sweet wine is rich and full of drie...",Le Cèdre Vintage,88,33.0,France Other,Vin de Liqueur,,Château du Cèdre,Malbec


### Removing the null entries for accurate prediction

In [26]:
df = df[pd.notnull(df.price)]
df = df[pd.notnull(df.variety)]
df = df[pd.notnull(df.points)]

In [35]:
df_test = df_test[pd.notnull(df_test.price)]
df_test = df_test[pd.notnull(df_test.points)]
df_test = df_test[pd.notnull(df_test.winery)]
df_test = df_test[pd.notnull(df_test.province)]

### Displaying the new shapes of the input data after removing null entries

In [37]:
print(df.shape)

(77057, 12)


In [36]:
print(df_test.shape)

(19267, 11)


### Storing the province and winery column into 2 seperate columns

In [8]:
df = df[pd.notnull(df.province)]
output_data = df['province']

In [9]:
df = df[pd.notnull(df.winery)]
new_winery = df['winery']

### Label encoding the winery and province column for input into Prediction model

In [11]:
labelEncoder = LabelEncoder()
df['winery'] = labelEncoder.fit_transform(new_winery)

In [12]:
df['province'] = labelEncoder.fit_transform(output_data)

### Creating x and y columns from the training dataset for selecting the best model based on training accuracy

In [28]:
x = df.drop(['user_name','country','designation','review_title','region_1','region_2','variety', 'winery', 'province'], axis = 1)
y = df.variety

### Splitting the train dataset further into train and test for model selection

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state = 1)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(57792, 3) (19265, 3) (57792,) (19265,)


### Creating the xtest column for final prediction after model selection

In [30]:
xtest = df_test.drop(['user_name','country','designation','review_title','region_1','region_2', 'winery', 'province'], axis = 1)

In [31]:
print(xtest.shape)

(19267, 3)


In [16]:
print(X_train.columns.values)
print(X_test.columns.values)
print(xtest.columns.values)

['review_description' 'points' 'price']
['review_description' 'points' 'price']
['review_description' 'points' 'price']


### Since wine variety names were also present in the description, hence for accurate prediction and realistic model creation, we add those words into our stop-words list.

In [17]:
wine =df.variety.unique().tolist()
wine.sort()
wine[:10]

['Bordeaux-style Red Blend',
 'Bordeaux-style White Blend',
 'Cabernet Franc',
 'Cabernet Sauvignon',
 'Champagne Blend',
 'Chardonnay',
 'Gamay',
 'Gewürztraminer',
 'Grüner Veltliner',
 'Malbec']

### Creating a variety-list which would be appended to our stop-word list.

In [18]:
output = set()
for x in df.variety:
    x = x.lower()
    x = x.split()
    for y in x:
        output.add(y)

variety_list =sorted(output)
variety_list[:10]

['blanc',
 'blend',
 'bordeaux-style',
 'cabernet',
 'champagne',
 'chardonnay',
 'franc',
 'gamay',
 'gewürztraminer',
 'grigio']

### Removing any further useless characters from the description

In [19]:
extras = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', 'cab',"%"]
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
stop.update(variety_list)
stop.update(extras)

### Finally creating a sparse matrix of train, test and final test dataset for input into our model algorithms

In [20]:
from scipy.sparse import hstack

vect = CountVectorizer(stop_words = stop)
X_train_dtm = vect.fit_transform(X_train.review_description)
price = X_train.price.values[:,None]
points = X_train.points.values[:,None]
X_train_dtm = hstack((X_train_dtm, price, points))
X_train_dtm

  'stop_words.' % sorted(inconsistent))


<57792x22608 sparse matrix of type '<class 'numpy.float64'>'
	with 1501568 stored elements in COOrdinate format>

In [21]:
X_test_dtm = vect.transform(X_test.review_description)
price_test = X_test.price.values[:,None]
point_test = X_test.points.values[:,None]
X_test_dtm = hstack((X_test_dtm, price_test, point_test))
X_test_dtm

<19265x22608 sparse matrix of type '<class 'numpy.float64'>'
	with 498206 stored elements in COOrdinate format>

In [40]:
x_test_final = vect.transform(xtest.review_description)
price_test_final = xtest.price.values[:,None]
point_test_final = xtest.points.values[:,None]
x_test_final = hstack((x_test_final, price_test_final, point_test_final))
x_test_final

<19267x22608 sparse matrix of type '<class 'numpy.float64'>'
	with 498054 stored elements in COOrdinate format>

### First, I chose Multinomial Naive Bayes Algorithm for building the model

In [57]:
models = {}
for z in wine:
    model = MultinomialNB()
    y = Y_train == z
    model.fit(X_train_dtm, y)
    models[z] = model
    
testing_probs = pd.DataFrame(columns = wine)

### But I recieved quite poor accuracy on training data, only 52%, hence I did not select it for final prediction.

In [59]:
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]
    
predicted_wine = testing_probs.idxmax(axis=1)

comparison = pd.DataFrame({'actual':Y_test.values, 'predicted':predicted_wine.values})   

from sklearn.metrics import accuracy_score
print('Accuracy Score:',accuracy_score(comparison.actual, comparison.predicted)*100,"%")
comparison.head()

Accuracy Score: 51.7259278484298 %


Unnamed: 0,actual,predicted
0,Red Blend,Red Blend
1,Merlot,Cabernet Sauvignon
2,Pinot Noir,Pinot Noir
3,Rhône-style Red Blend,Red Blend
4,Red Blend,Red Blend


### Next, I chose Logistic Regression for making the predictions

In [48]:
from sklearn.linear_model import LogisticRegression
models = {}
for z in wine:
    model = LogisticRegression()
    y = Y_train == z
    model.fit(X_train_dtm, y)
    models[z] = model

testing_probs = pd.DataFrame(columns = wine)

### Applying logistic regression, I recieved quite improved accuracy of almost 62.5% on training data

In [47]:
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]
    
predicted_wine = testing_probs.idxmax(axis=1)

comparison = pd.DataFrame({'actual':Y_test.values, 'predicted':predicted_wine.values})   

from sklearn.metrics import accuracy_score
print('Accuracy Score:',accuracy_score(comparison.actual, comparison.predicted)*100,"%")
comparison.head()

Accuracy Score: 62.48637425382818 %


Unnamed: 0,actual,predicted
0,Red Blend,Cabernet Sauvignon
1,Merlot,Merlot
2,Pinot Noir,Pinot Noir
3,Rhône-style Red Blend,Rhône-style Red Blend
4,Red Blend,Red Blend


### To be on the safer side, I also tried the predicition using Neural Networks which is shown in the attached notebook titled: "KNIGHT_ML_ASSIGNMENT_1.1"

### Using neural networks too, I achieved an accuracy of almost 61% on training data

### Hence, finally I made the predictions using Logistic Regression Model since it gave highest accuracy out of all 33 models on training data.

In [50]:
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(x_test_final)[:,1]
    
predicted_wine = testing_probs.idxmax(axis=1)

#comparison = pd.DataFrame({'actual':Y_test.values, 'predicted':predicted_wine.values})   

#from sklearn.metrics import accuracy_score
#print('Accuracy Score:',accuracy_score(comparison.actual, comparison.predicted)*100,"%")
#comparison.head()

In [54]:
predicted_wine.shape

(19267,)

### The final predictions are saved in the csv file titled "prediction.csv" which can be found attached in the given folder.

In [56]:
prediction = pd.DataFrame(predicted_wine).to_csv('prediction.csv', index = None)