In [361]:
import math
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from catboost import Pool, CatBoostRegressor, cv
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS
def truncate(f, n):
    """Cut ``f`` down to ``n`` decimal places without rounding.

    Args:
        f: Number to truncate.
        n: Number of decimal places to keep.

    Returns:
        ``f`` with every digit past the ``n``-th decimal place discarded.
    """
    scale = 10 ** n
    # math.trunc drops digits toward zero; math.floor would push negative
    # values away from zero (-1.239 -> -1.24), which is not a truncation.
    return math.trunc(f * scale) / scale

In [362]:
# Load the raw wine-review table; the CSV is expected in the working directory.
data = pd.read_csv(filepath_or_buffer='winemag-data-130k-v2.csv')

In [363]:
# A review counts as a duplicate when both its description and title repeat an earlier row.
dedup_keys = ['description', 'title']
print("Total number of examples: ", len(data))
print("Number of examples with the same title and description: ",
      int(data.duplicated(dedup_keys).sum()))

# Keep the first occurrence of each (description, title) pair and renumber the rows.
data = data.drop_duplicates(dedup_keys).reset_index(drop=True)

Total number of examples:  129971
Number of examples with the same title and description:  9983


In [364]:
# Rows without a price are excluded from modelling; drop them and renumber the index.
data = data.dropna(subset=['price']).reset_index(drop=True)

In [None]:
# Replace missing values in the non-numeric (categorical/text) columns with "UNKNOWN"
# so they can be encoded as a regular category.
# Numeric columns are left untouched on purpose: writing a string into them would turn
# the column into object dtype, and a missing price would no longer be seen as missing
# if this cell were run before the price-dropping cell.
non_numeric_columns = data.select_dtypes(exclude='number').columns
data = data.fillna({column: "UNKNOWN" for column in non_numeric_columns})

In [365]:
# Lower-case every description, then blank out each character that is not an ASCII letter.
non_letter = re.compile('[^a-zA-Z]')
data['description'] = (
    data['description']
    .str.lower()
    .map(lambda text: non_letter.sub(' ', text))
)

In [366]:
# Split each description into word tokens; the tokens feed the TF-IDF/Count
# vectorizer features used for the regression.
tokenizer = RegexpTokenizer(r'\w+')
words_descriptions = data['description'].apply(tokenizer.tokenize)

# Token count per review, kept as an extra numeric feature.
data['description_lengths'] = list(map(len, words_descriptions))

# Corpus-level statistics: every token, and the sorted set of distinct tokens.
all_words = [token for review_tokens in words_descriptions for token in review_tokens]
VOCAB = sorted(set(all_words))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))

4624968 words total, with a vocabulary size of 29486


In [367]:
# Removing stop words and stemming
stopword_list = stopwords.words('english')
# Membership is tested once per token (millions of tokens), so look words up in a
# set instead of scanning the stop-word list every time.
stopword_set = set(stopword_list)
ps = PorterStemmer()

# Drop stop words first, then stem the survivors (same order as filtering and
# stemming in two separate passes).
# NOTE: this overwrites words_descriptions, so re-running the cell stems the
# already-stemmed tokens again; re-run the tokenizing cell first.
words_descriptions = words_descriptions.apply(
    lambda tokens: [ps.stem(word) for word in tokens if word not in stopword_set]
)
data['description_cleaned'] = words_descriptions.apply(' '.join)

In [368]:
# The 100 most frequent stems once stop words are removed and stemming is applied.
all_words = [stem for review_tokens in words_descriptions for stem in review_tokens]
count_all_words = Counter(all_words)
# The counter's keys are exactly the distinct stems, so sorting them gives the vocabulary.
VOCAB = sorted(count_all_words)
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
count_all_words.most_common(100)

2822364 words total, with a vocabulary size of 21073


[('wine', 69125),
 ('flavor', 62686),
 ('fruit', 53836),
 ('finish', 35863),
 ('aroma', 35564),
 ('palat', 33674),
 ('acid', 33330),
 ('cherri', 29505),
 ('drink', 28905),
 ('tannin', 27717),
 ('black', 24963),
 ('ripe', 24037),
 ('dri', 22844),
 ('note', 21892),
 ('spice', 20040),
 ('red', 18821),
 ('rich', 18382),
 ('fresh', 18095),
 ('berri', 16569),
 ('oak', 16557),
 ('show', 15940),
 ('nose', 14976),
 ('plum', 14252),
 ('sweet', 13919),
 ('full', 13729),
 ('offer', 13698),
 ('blackberri', 13395),
 ('textur', 13370),
 ('blend', 13280),
 ('appl', 13155),
 ('balanc', 13005),
 ('bodi', 13003),
 ('soft', 12045),
 ('age', 11719),
 ('crisp', 11409),
 ('well', 11328),
 ('white', 11150),
 ('light', 11149),
 ('dark', 10653),
 ('structur', 10643),
 ('citru', 10109),
 ('raspberri', 9909),
 ('cabernet', 9858),
 ('vanilla', 9829),
 ('hint', 9750),
 ('herb', 9717),
 ('miner', 9669),
 ('fruiti', 9653),
 ('bright', 9380),
 ('give', 9222),
 ('pepper', 9131),
 ('touch', 8885),
 ('lemon', 8666),
 ('y

In [370]:
def prepare_dataframe(vect, data, features=True):
    """Build the feature matrix and target used to predict review points.

    Args:
        vect: Text vectorizer exposing ``fit_transform`` and either
            ``get_feature_names_out`` or the legacy ``get_feature_names``.
        data: Review table containing at least ``description_cleaned`` and
            ``points``; with ``features=True`` it must also contain
            ``Unnamed: 0`` and ``description``.
        features: When true, the remaining columns of ``data`` are placed in
            front of the text features; otherwise only the text features are used.

    Returns:
        Tuple ``(X, y, categorical_features_indices)`` where the last item lists
        the column positions in ``X`` that hold non-numeric values.
    """
    # NOTE(review): the vectorizer is fitted on every row of `data`, i.e. before
    # any train/test split performed by the caller.
    vectorized = pd.DataFrame(vect.fit_transform(data['description_cleaned']).toarray())

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the legacy method on older versions.
    feature_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    vectorized.columns = [f"{name}_nlp" for name in feature_names()]

    if features:
        X = data.drop(columns=['points', 'Unnamed: 0', 'description', 'description_cleaned'])
        X = X.fillna(-1)
        print(X.columns)
        # Derive the categorical positions from the dtypes instead of hard-coding
        # them, so the list stays correct if the column set or order changes.
        non_numeric = set(X.select_dtypes(exclude='number').columns)
        categorical_features_indices = [
            position for position, column in enumerate(X.columns) if column in non_numeric
        ]
        # The text features are appended after the original columns, so the
        # positions computed above remain valid.
        X = pd.concat([X.reset_index(drop=True), vectorized.reset_index(drop=True)], axis=1)
    else:
        X = vectorized
        categorical_features_indices = []
    y = data['points']
    return X, y, categorical_features_indices

## Hyperparameter tuning for CatBoostRegressor using the built-in grid_search -- it takes a long time (2-3 hrs) to run

In [None]:
# This tuning code is commented out because it currently takes about 2 hours to run; uncomment it (once `vect` has been created) to run the grid search again
# def hyperparamtuning(X_train, Y_train, categorical_features_indices):
#     model = CatBoostRegressor(cat_features=categorical_features_indices, loss_function = 'RMSE')
#     grid = {'learning_rate': [0.03, 0.02, 0.01],'depth': [4, 6, 10],'l2_leaf_reg': [1, 3, 5,]}
#     search_results = model.grid_search(grid,X=X_train,y=Y_train,plot=True)
#     return search_results

# X, y , categorical_features_indices = prepare_dataframe(vect, data)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
# hyperparamtuning_res = hyperparamtuning(X_train, y_train, categorical_features_indices)

In [None]:
# `hyperparamtuning_res` is only created by the commented-out grid-search code,
# so guard the report instead of failing with a NameError when tuning was skipped.
if 'hyperparamtuning_res' in globals():
    print("Optimal parameters from hyperparam grid search:", hyperparamtuning_res['params'])
else:
    print("Hyperparameter grid search was not run; no optimal parameters to report.")

In [371]:
def perform_model(X_train, y_train,X_valid, y_valid,X_test, y_test,categorical_features_indices,name="TFIDF-based", depth=6, learning_rate=0.03, l2_leaf_reg=1,
                 optimal_param=True):
    """Fit a CatBoost RMSE regressor on the training split.

    Args:
        X_train, y_train: Training features and target.
        X_valid, y_valid: Validation split, handed to CatBoost as ``eval_set``.
        X_test, y_test: Accepted for call compatibility; not used here.
        categorical_features_indices: Column positions treated as categorical.
        name: Label for the feature technique; not used here.
        depth, learning_rate, l2_leaf_reg: Tuned hyperparameters, applied only
            when ``optimal_param`` is true.
        optimal_param: When false, CatBoost's own defaults are used for the
            three tuned hyperparameters.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # Settings shared by both configurations.
    settings = dict(random_seed=100, loss_function='RMSE', iterations=800)
    if optimal_param:
        settings.update(depth=depth, learning_rate=learning_rate, l2_leaf_reg=l2_leaf_reg)

    regressor = CatBoostRegressor(**settings)
    regressor.fit(
        X_train,
        y_train,
        cat_features=categorical_features_indices,
        verbose=False,
        eval_set=(X_valid, y_valid),
    )
    return regressor
    
def prepare_variable(vect, data, features_append=True):
    """Vectorize the reviews and split them into train/validation/test sets.

    10% of the rows are held out as the test set; 20% of the remainder becomes
    the validation set. Both splits use fixed seeds.

    Returns:
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test,
        categorical_features_indices)``, in the positional order expected by
        ``perform_model``.
    """
    features, target, cat_indices = prepare_dataframe(vect, data, features_append)

    # First carve off the held-out test set ...
    X_fit, X_test, y_fit, y_test = train_test_split(
        features, target, test_size=0.10, random_state=42
    )
    # ... then split what is left into training and validation parts.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_fit, y_fit, test_size=0.2, random_state=52
    )
    return X_train, y_train, X_valid, y_valid, X_test, y_test, cat_indices




## Uncomment this to plot test-RMSE-mean vs. iteration for the hyperparameter grid search.

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline
# sns.set_style("dark")
# plt.style.use(['seaborn-darkgrid'])
# plt.plot(hyperparamtuning_res['cv_results']['iterations'], np.log(hyperparamtuning_res['cv_results']['test-RMSE-mean']))
# plt.title("Test-RMSE-MEAN over iterations")
# plt.xlabel("Iterations")
# plt.ylabel("log(test-RMSE-mean)")
# plt.show()

In [372]:
# TF-IDF over the 500 most frequent terms, then a CatBoost model with the tuned hyperparameters.
vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w+', max_features=500)
training_variable = prepare_variable(vect, data)
tfidf_model = perform_model(*training_variable, name='TF-IDF')

Index(['country', 'designation', 'price', 'province', 'region_1', 'region_2',
       'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery',
       'description_lengths'],
      dtype='object')


In [373]:
# Test-set RMSE of the model. Positions 4 and 5 of `training_variable` hold
# X_test and y_test (see the return order of prepare_variable).
x_test, y_test = training_variable[4:6]
pred = tfidf_model.predict(x_test)
# mean_squared_error is documented as (y_true, y_pred); pass the arguments in that order.
rmse = math.sqrt(mean_squared_error(y_test, pred))
print(truncate(rmse, 4))

1.5594


In [374]:
# Feature importances as reported by CatBoost; show the first 11 rows.
pred_diff = tfidf_model.get_feature_importance(prettified=True)
pred_diff.head(11)

Unnamed: 0,Feature Id,Importances
0,price,22.713024
1,description_lengths,19.624077
2,winery,7.864938
3,taster_name,6.812793
4,taster_twitter_handle,2.923642
5,rich_nlp,1.529884
6,region_1,1.295969
7,beauti_nlp,1.208322
8,complex_nlp,1.036663
9,lack_nlp,0.996292


In [375]:
# Persist the fitted model to disk so it can be reloaded without retraining.
tfidf_model.save_model(fname='tfidf_wine_points_prediction.model')

In [376]:
# Same TF-IDF features, but with CatBoost's default depth, learning rate and L2
# regularisation. Note that this rebinds `tfidf_model`.
vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w+', max_features=500)
training_variable = prepare_variable(vect, data)
tfidf_model = perform_model(*training_variable, name='TF-IDF', optimal_param=False)

Index(['country', 'designation', 'price', 'province', 'region_1', 'region_2',
       'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery',
       'description_lengths'],
      dtype='object')


In [377]:
# Test-set RMSE of the default-parameter model. Positions 4 and 5 of
# `training_variable` hold X_test and y_test (see the return order of prepare_variable).
x_test, y_test = training_variable[4:6]
pred = tfidf_model.predict(x_test)
# mean_squared_error is documented as (y_true, y_pred); pass the arguments in that order.
rmse = math.sqrt(mean_squared_error(y_test, pred))
print(truncate(rmse, 4))

1.5595
