In [187]:
import pandas as pd
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import xgboost as xgb
# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import  pyLDAvis.sklearn

from tqdm import tqdm
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import string

# !python3 -m spacy download en_core_web_lg --user

In [2]:
nlp = spacy.load('en_core_web_lg')

In [18]:
data = pd.read_csv("./winemag-data_first150k.csv")

data = data[data.duplicated('description', keep=False)]
print("Length of dataframe after duplicates are removed:", len(data))

data.dropna(subset=['description', 'points'])
print("Length of dataframe after NaNs are removed:", len(data))

Length of dataframe after duplicates are removed: 92393
Length of dataframe after NaNs are removed: 92393


In [136]:
labels = data['points']
descriptions = data['description']

In [20]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

In [21]:
# Parser for reviews
parser = English()
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [22]:
tqdm.pandas()
data["processed_description"] = data["description"].progress_apply(spacy_tokenizer)

100%|██████████| 92393/92393 [02:07<00:00, 722.60it/s]


In [23]:
vectorizer = TfidfVectorizer()
data_vectorized = vectorizer.fit_transform(data["processed_description"])

In [37]:
lda = LatentDirichletAllocation(n_components=10, doc_topic_prior=.01, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [44]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [92]:
X = pd.DataFrame(data_lda)

In [145]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

------------------------------------------------------------------------------------

In [184]:
parameters = {'n_estimators': [500, 700, 1000], 'max_depth': [None, 10, 100]}

rf_cv = GridSearchCV(RandomForestRegressor(), parameters, cv=10, scoring="r2",
                   n_jobs=8, verbose=1)
rf_cv.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [186]:
pred = rfc.predict(X_test)
mean_squared_error(y_test, pred)

1.3122903462459368

--------------------------------------------------------------------------------------------------

In [146]:
parameters = {'max_depth': [None, 10, 100], 'n_estimators': [500, 700, 1000]}

xgb_cv = GridSearchCV(xgb.XGBRegressor(), parameters, cv=10, scoring="r2", 
                   n_jobs=8, verbose=1)
xgb_cv.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=100, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

---------------------------------------------------------------------------------

In [165]:
parameters = {alpha=[1, 0.1, 0.01, 0.001]}
ridge_cv = GridSearchCV(Ridge(), parameters, cv=10, scoring="r2", 
                   n_jobs=8, verbose=1)
ridge_cv.fit(X_train, y_train)

Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [166]:
from sklearn.metrics import mean_squared_error

In [167]:
mean_squared_error(y_test, lr.predict(X_test))

8.482839988033524