### Standard imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd 

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split

from sklearn.preprocessing import LabelEncoder, RobustScaler

from sklearn.linear_model import Lasso, Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# read data
# One CSV per wine style, each loaded into its own DataFrame from the
# `../input/wine-rating-and-price/` directory.
red = pd.read_csv('../input/wine-rating-and-price/Red.csv')
white = pd.read_csv('../input/wine-rating-and-price/White.csv')
sparkling = pd.read_csv('../input/wine-rating-and-price/Sparkling.csv')
rose = pd.read_csv('../input/wine-rating-and-price/Rose.csv')

In [None]:
# Preview the first rows of the red-wine table to see which columns are available.
red.head()

# 1) Data preparation

For the modeling step, I would like to work with the data as a single data frame.<br>
Let's concatenate the four tables and add a feature for the wine style.

In [None]:
# Tag each per-style table (in place) with its wine style, then stack the four
# tables into one frame with a fresh 0..n-1 index.
for style_frame, style_name in ((red, 'red'),
                                (white, 'white'),
                                (sparkling, 'sparkling'),
                                (rose, 'rose')):
    style_frame['WineStyle'] = style_name

wines = pd.concat([red, white, sparkling, rose], ignore_index=True)

In [None]:
# Column dtypes and non-null counts of the combined frame.
wines.info()

The data is quite clean; I only want to convert the Year feature from object to integer for plotting and modeling.

In [None]:
# "N.V." marks a non-vintage wine, usually a blend of the produce of two or more
# years, so it has no real year. It is encoded with the placeholder 2030, which
# must not collide with a genuine vintage already present in the data.
# After the replacement the column is cast to integer.
wines['Year'] = wines['Year'].replace('N.V.', 2030).astype('int')

# 2) EDA

First, let's look at some general information.

In [None]:
# Show (up to) five randomly chosen rows. Sampling the rows directly avoids
# shuffling the whole frame just to keep its first five rows.
wines.sample(min(5, len(wines)))

In [None]:
# (rows, columns) of the combined frame.
wines.shape

In [None]:
# Re-check the dtypes now that Year has been cast to integer.
wines.info()

In [None]:
# Summary statistics of the numerical columns.
wines.describe()

In [None]:
# Number of distinct countries in the data.
wines.Country.nunique()

In [None]:
# Number of wines listed per country.
wines.Country.value_counts()

Often we do not need information about every country (there are wines from 33 countries, as we can see).<br>
For example, we can analyze only the countries with the largest export volume (approximated here by the number of wines each country has in the dataset).

In [None]:
# Countries with the largest export volume: the 12 countries with the most
# wines in the dataset (index = country name, value = number of wines).
LEV_countries = wines['Country'].value_counts().head(12)

In [None]:
# Bar chart of the number of wines per country, restricted to the countries
# selected in LEV_countries.
plt.figure(figsize=(10,4))

graph = sns.countplot(x='Country',
                      data=wines[wines.Country.isin(LEV_countries.index.values)],
                      # Draw the bars in descending order of count so the chart
                      # reads as a ranking, which is what the title promises.
                      order=LEV_countries.index,
                      color='olive')
graph.set_title("Countries with the largest export volume", fontsize=20)
graph.set_xlabel("Country", fontsize=15)
graph.set_ylabel("Volume", fontsize=15)
graph.set_xticklabels(graph.get_xticklabels(),rotation=45)

plt.show()


### Rating

In [None]:
# Bar chart of how many wines received each rating value.
fig, ax = plt.subplots(figsize=(10, 4))
graph = sns.countplot(x='Rating', data=wines, color='mediumpurple', ax=ax)
graph.set_title("Rating Count distribuition ", fontsize=20)
graph.set_ylabel("Count", fontsize=15)
graph.set_xlabel("Rating", fontsize=15)
plt.show()

In [None]:
# Distribution of ratings per country, for the countries in LEV_countries only.
top_country_rows = wines[wines.Country.isin(LEV_countries.index.values)]

fig, ax = plt.subplots(figsize=(16, 6))
graph = sns.boxplot(x='Country', y='Rating',
                    data=top_country_rows,
                    color='mediumpurple',
                    ax=ax)
graph.set_title("Rating by Country", fontsize=20)
graph.set_ylabel("Rating", fontsize=15)
graph.set_xlabel("Country", fontsize=15)
graph.set_xticklabels(graph.get_xticklabels())

plt.show()

Let's find regions and wineries which produce quite a lot of bottles and have the highest average rating<br>
If you find wine from there in the store it's probably worth a try

In [None]:
# Step 1: keep the 100 regions that have the most wines in the dataset.
MP_regions = wines['Region'].value_counts()[:100].index #most productive regions
# Step 2: among those regions, print the 20 with the highest mean rating.
print(wines[wines['Region'].isin(MP_regions)].groupby('Region').Rating.mean().sort_values(ascending=False)[:20])
#Regions with the best rating among the most productive ones

In [None]:
# Step 1: keep the 100 wineries that have the most wines in the dataset.
MP_wineries = wines['Winery'].value_counts()[:100].index #most productive wineries
# Step 2: among those wineries, print the 20 with the highest mean rating.
print(wines[wines['Winery'].isin(MP_wineries)].groupby('Winery').Rating.mean().sort_values(ascending=False)[:20])
#Wineries with the best rating among the most productive ones

### Price

In [None]:
# Price distribution on the raw scale (top) and on a log scale (bottom).
plt.figure(figsize=(10,10))
plt.subplot(2,1,1)
graph = sns.distplot(wines['Price'], color='coral')
graph.set_title("Price distribuition", fontsize=20)
graph.set_xlabel("Price (EUR)", fontsize=15)
graph.set_ylabel("Frequency", fontsize=15)

plt.subplot(2,1,2)
graph1 = sns.distplot(np.log(wines['Price']) , color='coral')
graph1.set_title("Price Log distribuition", fontsize=20)
graph1.set_xlabel("Price(EUR)", fontsize=15)
graph1.set_ylabel("Frequency", fontsize=15)
# The axis holds log(price). A formatter converts each tick back to EUR when the
# figure is drawn, so the labels always belong to the ticks that are actually
# shown. Writing fixed label strings from get_xticks() does not pin the tick
# positions, so labels and ticks could drift apart if the ticks are re-computed.
graph1.xaxis.set_major_formatter(
    FuncFormatter(lambda value, pos: str(int(np.exp(value)))))

plt.subplots_adjust(hspace = 0.3,top = 0.9)
plt.show()

In [None]:
# Box plots of price (on a log scale) by vintage year, wine style and country.
plt.figure(figsize=(16,18))

# Keep log(price) as a real column so that x, y and any row filter all come
# from the same frame, instead of combining a filtered `data` frame with a
# full-length y vector and relying on index alignment.
wines_log = wines.assign(LogPrice=np.log(wines['Price']))

plt.subplot(3,1,1)
graph = sns.boxplot(x='Year', y='LogPrice',
                    data=wines_log,
                    color='coral')
graph.set_title("Price by Year", fontsize=20)
graph.set_xlabel("Year", fontsize=15)
graph.set_ylabel("Price(EUR)", fontsize=15)
# 2030 is the placeholder assigned to non-vintage wines when Year was converted
# to integer; label it "N.V." so it is not read as a real vintage.
graph.set_xticklabels(['N.V.' if label.get_text() == '2030' else label.get_text()
                       for label in graph.get_xticklabels()],
                      rotation=45)
# Convert the log-scale ticks back to EUR at draw time (see comment on wines_log:
# the axis holds log(price)); this keeps labels tied to the ticks actually drawn.
graph.yaxis.set_major_formatter(
    FuncFormatter(lambda value, pos: str(int(np.exp(value)))))

plt.subplot(3,1,2)
graph1 = sns.boxplot(x='WineStyle', y='LogPrice',
                     data=wines_log,
                     color='coral')
graph1.set_title("Price by WineStyle", fontsize=20)
graph1.set_xlabel("WineStyle", fontsize=15)
graph1.set_ylabel("Price(EUR)", fontsize=15)
graph1.set_xticklabels(graph1.get_xticklabels())
graph1.yaxis.set_major_formatter(
    FuncFormatter(lambda value, pos: str(int(np.exp(value)))))

plt.subplot(3,1,3)
graph2 = sns.boxplot(x='Country', y='LogPrice',
                     data=wines_log[wines_log.Country.isin(LEV_countries.index.values)],
                     color='coral')
graph2.set_title("Price by Country", fontsize=20)
graph2.set_xlabel("Country", fontsize=15)
graph2.set_ylabel("Price(EUR)", fontsize=15)
graph2.yaxis.set_major_formatter(
    FuncFormatter(lambda value, pos: str(int(np.exp(value)))))

plt.subplots_adjust(hspace = 0.3, top = 0.9)

plt.show()

### Other plots

In [None]:
# Rating-to-price ratio per country, for the countries in LEV_countries only.
plt.figure(figsize=(16,6))

# Build x and y from the same filtered frame. Previously y was computed on the
# full `wines` frame while `data` was filtered, which only lined up through
# index alignment.
top_country_rows = wines[wines.Country.isin(LEV_countries.index.values)]
graph = sns.boxplot(x=top_country_rows['Country'],
                    y=top_country_rows['Rating'] / top_country_rows['Price'],
                    color='olive')
graph.set_title("Rating/Price by Countries", fontsize=20)
graph.set_xlabel("Country", fontsize=15)
graph.set_ylabel("Rating/Price", fontsize=15)
graph.set_xticklabels(graph.get_xticklabels())

plt.show()

In [None]:
# Scatter plot of rating against price, with price on a log scale
# (fit_reg=False disables the regression line).
plt.figure(figsize=(13,5))

graph = sns.regplot(x=np.log(wines['Price']), y='Rating', 
                    data=wines, fit_reg=False, color='olive')
graph.set_title("Rating x Price Distribuition", fontsize=20)
graph.set_xlabel("Price(EUR)", fontsize= 15)
graph.set_ylabel("Rating", fontsize= 15)
# The x axis holds log(price). A formatter converts each tick back to EUR at
# draw time, so labels cannot get out of step with the tick positions the way
# fixed label strings taken from get_xticks() can.
graph.xaxis.set_major_formatter(
    FuncFormatter(lambda value, pos: str(int(np.exp(value)))))

plt.show()

Many people think that a good wine must be expensive. Of course, the average rating tends to increase with the price, but there are a lot of wines with a 4+ rating for just 7 euros and with a 4.4+ rating for 20 euros. So, you can find a good wine for any wallet.

In [None]:
# Heatmap of the pairwise correlations between the numerical columns.
numeric_columns = ['Rating', 'NumberOfRatings', 'Price', 'Year']
fig, ax = plt.subplots(figsize=(7, 5))
corrs = wines[numeric_columns].corr()

# Trailing semicolon keeps the notebook from echoing the Axes repr.
sns.heatmap(corrs, annot=True, linewidths=.6, cmap='YlGnBu', ax=ax);

In [None]:
# Distribution of NumberOfRatings: raw scale, log scale, and the raw scale
# restricted to wines with fewer than 1000 ratings.
plt.figure(figsize=(10,15))

plt.subplot(3,1,1)
graph = sns.distplot(wines['NumberOfRatings'], color='olive')
graph.set_title("Number Of Ratings distribuition", fontsize=20)
graph.set_xlabel("Number Of Ratings", fontsize=15)
graph.set_ylabel("Frequency", fontsize=15)

plt.subplot(3,1,2)
graph1 = sns.distplot(np.log(wines['NumberOfRatings']), color='olive')
graph1.set_title("Number Of Ratings Log distribuition", fontsize=20)
graph1.set_xlabel("Number Of Ratings", fontsize=15)
graph1.set_ylabel("Frequency", fontsize=15)
# The axis holds log(count). A formatter converts each tick back to a plain
# count at draw time, so labels stay tied to the ticks that are actually drawn
# (fixed label strings from get_xticks() do not pin the tick positions).
graph1.xaxis.set_major_formatter(
    FuncFormatter(lambda value, pos: str(int(np.exp(value)))))

plt.subplot(3,1,3)
graph = sns.distplot(wines[wines['NumberOfRatings']<1000]['NumberOfRatings'], color='olive')
graph.set_title("Number Of Ratings <1000 distribuition", fontsize=20)
graph.set_xlabel("Number Of Ratings", fontsize=15)
graph.set_ylabel("Frequency", fontsize=15)

plt.subplots_adjust(hspace = 0.3,top = 0.9)
plt.show()

From the last plots we can see that the Number of Ratings follows a roughly exponential distribution and that there are a lot of wines<br> with a low Number of Ratings (on Vivino, and in this dataset, a rating is only shown for wines that have more than 25 ratings).<br>
This is a real problem for the business, because the main purpose of the Vivino app is to give customers information about the quality of a wine,<br> but a huge number of wines have no rating at all. <br>
My idea is to apply ML methods to predict the rating of wines which have fewer than 25 ratings.

# 3) Feature Engineering

### Feature Generation

At first glance, there is no information about the grape variety in the dataset.<br>
Usually, however, a wine's name also contains its variety.<br>
I have scraped about 1500 wine varieties from wine-searcher.com and wiki.<br>
Let's look for occurrences of the varieties from that list in the Name column.<br>

In [None]:
# List of known wine varieties; its 'Variety' column is matched against wine names below.
varieties = pd.read_csv('../input/wine-rating-and-price/Varieties.csv')

In [None]:
# Tag every wine with the first variety (in the order of the varieties list)
# whose name occurs as a substring of the wine's Name. Wines without a match
# get NaN.
#
# The variety list is read out of the frame once, and the whole column is
# built in one pass over Name. The previous per-cell `.loc` reads/writes were
# slow and wrote strings into a float (NaN) column.
variety_names = varieties['Variety'].dropna().astype(str).tolist()


def _first_variety_in(name):
    """Return the first known variety contained in `name`, or NaN if none matches."""
    if not isinstance(name, str):
        # Missing / non-text names cannot contain a variety.
        return np.nan
    return next((variety for variety in variety_names if variety in name), np.nan)


wines['Variety'] = wines['Name'].map(_first_variety_in)

In [None]:
# Report how many wines got a variety, as a count and as a whole-number percentage.
wines_with_variety = wines.Variety.notna().sum()
coverage_pct = int(wines_with_variety / len(wines) * 100)
print('Now we have variety for', wines_with_variety, 'wines,',
      '{}%'.format(coverage_pct), 'of all')

In [None]:
# Wines without a matched variety are labelled 'unknown' instead of NaN.
wines['Variety'] = wines['Variety'].fillna('unknown')

In [None]:
# The 20 most frequent Variety values (including 'unknown').
wines.Variety.value_counts().head(20)

### Encode categorical features

In [None]:
# Independent copy of the data without the free-text Name column; all encoding
# below is applied to this copy so `wines` stays untouched.
wines_enc = wines.drop(columns=['Name']).copy()

In [None]:
#One-hot encoder for winestyle
# Replaces the WineStyle column with one indicator column per style.
wines_enc = pd.get_dummies(wines_enc, columns = ['WineStyle'])

In [None]:
# Check the result of the one-hot encoding.
wines_enc.head()

In [None]:
# Names of the columns that still have object dtype; these get label-encoded next.
is_object_column = wines_enc.dtypes == "object"
categorical_cols = list(wines_enc.columns[is_object_column])

In [None]:
# Apply label encoder
# Every remaining object column is replaced, in place on wines_enc, by the
# output of LabelEncoder.fit_transform for that column.
label_encoder = LabelEncoder()
for col in categorical_cols:
    # The same encoder instance is re-fitted on each column in turn.
    # NOTE(review): presumably it therefore only keeps the classes of the last
    # column afterwards, so decoding earlier columns would need a separate
    # encoder per column -- confirm if decoding is ever required.
    wines_enc[col] = label_encoder.fit_transform(wines_enc[col])

In [None]:
# Check the frame after label encoding.
wines_enc.head()

# 4) Modeling

In [None]:
# Features: every encoded column except the target. Target: the wine's rating.
X = wines_enc.drop(columns=['Rating'])
y = wines_enc['Rating']

In [None]:
# Shared 6-fold splitter (shuffled, fixed seed) used for every cross-validation below.
kfolds = KFold(n_splits=6, shuffle=True, random_state=0)

In [None]:
def cv_mae(model, X=None, y=None):
    """Cross-validated mean absolute error of `model`, one value per fold.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X, y : features and target. When omitted, the notebook-level `X` and `y`
        are looked up at call time. (Binding them as default values would
        freeze whatever `X`/`y` existed when this cell was run, so re-building
        the features later would silently keep scoring the old data.)

    Returns
    -------
    The per-fold scores from `cross_val_score` with the sign flipped, i.e. the
    MAE for each fold of the shared `kfolds` splitter.
    """
    if X is None:
        X = globals()['X']
    if y is None:
        y = globals()['y']
    # The scorer returns negated MAE (higher is better); flip it back.
    mae = -cross_val_score(model, X, y,
                           scoring="neg_mean_absolute_error",
                           cv=kfolds)
    return mae

In [None]:
# Three gradient-boosting regressors with hand-set hyperparameters. They are
# only constructed here; fitting happens in the cross-validation cells below.
# Note that the names `lightgbm`, `xgboost` and `catboost` refer to estimator
# instances, not to the libraries.
#
# NOTE(review): only the LightGBM model is given explicit seeds (bagging_seed,
# feature_fraction_seed); the XGBoost and CatBoost models get none here --
# confirm whether their defaults are reproducible enough for the comparison.
lightgbm = LGBMRegressor(objective='regression',
                         metric='mean_absolute_error',
                         num_leaves=10,
                         learning_rate=0.05,
                         n_estimators=3000,
                         max_depth=5,
                         max_bin=400,
                         bagging_fraction=0.75,                         
                         bagging_freq=5,
                         bagging_seed=7,
                         reg_alpha=0.7,
                         reg_lambda=1.2,
                         feature_fraction=0.6,
                         feature_fraction_seed=7,
                         verbose=-1,
                         min_data_in_leaf=3,
                         min_sum_hessian_in_leaf=11
                         )
xgboost = XGBRegressor(n_estimators=3000,
                       learning_rate=0.02,
                       max_depth=5, 
                       min_child_weight=2,
                       subsample=0.8,
                       colsample_bytree=0.7,
                       nthread=-1,
                       gamma=0,
                       reg_alpha=0.1,
                       reg_lambda=1.8
                       )
catboost = CatBoostRegressor(iterations=3000,
                             learning_rate=0.03,
                             depth=6,
                             l2_leaf_reg = 2,
                             verbose=0
                            )

In [None]:
# Per-fold cross-validated MAE of the LightGBM model on the full feature set.
maes_lgbm = cv_mae(lightgbm)

In [None]:
# Mean and spread of the LightGBM fold errors.
print('Average lightgbm mae:', maes_lgbm.mean(), ' Standard deviation: ', maes_lgbm.std())

In [None]:
# Per-fold cross-validated MAE of the XGBoost model on the full feature set.
maes_xgb = cv_mae(xgboost)

In [None]:
# Mean and spread of the XGBoost fold errors.
print('Average xgboost mae:', maes_xgb.mean(), ' Standard deviation: ', maes_xgb.std())

In [None]:
# Per-fold cross-validated MAE of the CatBoost model on the full feature set.
maes_catboost = cv_mae(catboost)

In [None]:
# Mean and spread of the CatBoost fold errors.
print('Average catboost mae:', maes_catboost.mean(), ' Standard deviation: ', maes_catboost.std())

In [None]:
# Grid search over the Lasso regularisation strength `alpha`, scored by
# (negated) MAE on the shared `kfolds` splitter; prints the best MAE and alpha.
# NOTE(review): X is passed in unscaled and with label-encoded categorical
# columns; RobustScaler is imported at the top of the notebook but not applied
# here -- confirm whether scaling was intended for the linear models.
lasso=Lasso()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100,150,200,250,300]}
lasso_regressor = GridSearchCV(lasso,parameters,scoring='neg_mean_absolute_error',cv=kfolds)
lasso_regressor.fit(X,y)
# best_score_ is the negated MAE, so flip the sign for reporting.
print('Best lasso mae:', -lasso_regressor.best_score_,'with',lasso_regressor.best_params_)

In [None]:
# Grid search over the Ridge regularisation strength `alpha`, scored by
# (negated) MAE on the shared `kfolds` splitter; prints the best MAE and alpha.
# NOTE(review): X is passed in unscaled and with label-encoded categorical
# columns; RobustScaler is imported at the top of the notebook but not applied
# here -- confirm whether scaling was intended for the linear models.
ridge=Ridge()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100,150,200,250,300]}
ridge_regressor=GridSearchCV(ridge,parameters,scoring='neg_mean_absolute_error',cv=kfolds)
ridge_regressor.fit(X,y)
# best_score_ is the negated MAE, so flip the sign for reporting.
print('Best ridge mae:', -ridge_regressor.best_score_,'with',ridge_regressor.best_params_)

We get a mean absolute error (MAE) of about 0.13 with the boosting models, and of course it could be improved with a bigger dataset and with the additional features that the Vivino company has.

Gradient boosting gives significantly better results than the linear regression models. From here on I will use LightGBM, because the results of the boosting models are very close, but LightGBM is faster than XGBoost and CatBoost. 

Now let's check our model predictions on wines with low, high and random Number of Ratings.

In [None]:
# NumberOfRatings sorted ascending, with the lowest 10% and the highest 10% of
# rows cut off; the first and last values shown are the boundaries of the
# middle 80%.
# NOTE(review): the hard-coded thresholds in the next cell presumably come from
# this output -- confirm.
wines_enc['NumberOfRatings'].sort_values()[int(0.1*(len(wines_enc))):int(0.9*(len(wines_enc)))]

In [None]:
# Split the wines into three bands by how many ratings they have:
#   low  : fewer than 35 ratings
#   high : more than 838 ratings
#   mid  : everything in between, boundaries included
LOW_RATINGS_BOUND = 35
HIGH_RATINGS_BOUND = 838
ratings_count = wines_enc['NumberOfRatings']

wines_low_NumberOfRatings = wines_enc[ratings_count < LOW_RATINGS_BOUND]
wines_high_NumberOfRatings = wines_enc[ratings_count > HIGH_RATINGS_BOUND]
# One mask, inclusive on both ends. The earlier chained form
# `df[a > 35][a < 838]` applied a full-length mask to an already filtered frame
# (pandas has to re-align it), and its strict comparisons left wines with
# exactly 35 or 838 ratings out of all three bands.
wines_mid_NumberOfRatings = wines_enc[ratings_count.between(LOW_RATINGS_BOUND, HIGH_RATINGS_BOUND)]

In [None]:
def _features_and_target(band):
    """Split one band of wines into (features, Rating target).

    NumberOfRatings is dropped from the features together with the target,
    because the bands themselves are defined by it.
    """
    return band.drop(['NumberOfRatings', 'Rating'], axis=1), band['Rating']


# The low and high bands are used purely as test sets.
X_low_NumberOfRatings_test, y_low_NumberOfRatings_test = _features_and_target(wines_low_NumberOfRatings)
X_high_NumberOfRatings_test, y_high_NumberOfRatings_test = _features_and_target(wines_high_NumberOfRatings)

# The mid band is split into the training set and a random test set that has
# as many rows as the low-band test set.
X_mid, y_mid = _features_and_target(wines_mid_NumberOfRatings)
X_train, X_random_test, y_train, y_random_test = train_test_split(
    X_mid, y_mid,
    test_size=len(X_low_NumberOfRatings_test),
    # Fixed seed (same value as the KFold splitter) so the reported MAEs are
    # reproducible from run to run.
    random_state=0)

In [None]:
# Row counts of the training set and of the three test sets (low, high, random mid).
print('Train data size:', len(X_train))
print('Test data sizes:', len(X_low_NumberOfRatings_test), len(X_high_NumberOfRatings_test), len(X_random_test))

In [None]:
# Fit the LightGBM regressor on the mid-band training split.
# NOTE(review): `fit` presumably returns the estimator itself, in which case
# `lgbm` and `lightgbm` are the same (now fitted) object -- confirm.
lgbm = lightgbm.fit(X_train, y_train)

In [None]:
# Predict ratings for the three test sets with the fitted model.
res_low_NumberOfRatings = lgbm.predict(X_low_NumberOfRatings_test)
res_high_NumberOfRatings = lgbm.predict(X_high_NumberOfRatings_test)
res_random_NumberOfRatings = lgbm.predict(X_random_test)

In [None]:
# Mean absolute error of the predictions on each test set.
print('MAE of predictions with low NumberOfRatings:   ', mean_absolute_error(y_low_NumberOfRatings_test, res_low_NumberOfRatings))
print('MAE of predictions with high NumberOfRatings:  ', mean_absolute_error(y_high_NumberOfRatings_test, res_high_NumberOfRatings))
# The "middle" test set is the random hold-out taken from the mid band.
print('MAE of predictions with middle NumberOfRatings:', mean_absolute_error(y_random_test, res_random_NumberOfRatings))

As we expected, the rating of wines with a low Number of Ratings is less representative, so the model's error is higher for them.

### If you have any questions or recommendations, please let me know. <br>
## I'm currently looking for a DS job, so if you liked this kernel, please upvote!