In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.metrics import make_scorer
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split

## cross validation
from  sklearn.linear_model import LassoCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

from xgboost import XGBClassifier
from sklearn.svm import SVR

from scipy import sparse as sp
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go


from sklearn.linear_model import LinearRegression

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
data_path = "../data/sample.csv"

# Read the whole file into a DataFrame; later cells pick the columns they need.
data = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Name of the column the model is asked to predict.
target = 'points'

# Columns used as model inputs.
features = [
    "country",
    "description",
    "price",
    "province",
    "region_1",
    "variety",
    "winery",
]

# Every column needed downstream: the inputs followed by the target.
cols = features + [target]

# Restrict to those columns and discard any row with a missing value in them,
# so that X and y below are complete and row-aligned.
data_ = data[cols].dropna()

X = data_[features]
y = data_[target]

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Renumber the rows 0..n-1 so that frames built later with a default index
# (e.g. the scaled price frame) line up row-for-row when concatenated column-wise.
# drop=True discards the old row labels instead of inserting them as a spurious
# "index" column, and the non-inplace form avoids mutating the split output.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [5]:
# process text data
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encoder with default settings.
vectorizer = TfidfVectorizer()

# Fit on the training descriptions only, then reuse the fitted vectorizer
# to encode the test descriptions.
description_train = vectorizer.fit_transform(X_train.loc[:, "description"])
description_test = vectorizer.transform(X_test.loc[:, "description"])

In [6]:
# one hot encoded columns from categorical features
def _one_hot(frame, column):
    """Return indicator (dummy) columns for ``frame[column]``.

    Each output column is named ``<column>_<value>``. The prefix keeps values
    that occur in more than one categorical field (for example a province and
    a region sharing a name) from producing identically named columns, which
    the later duplicate-column filter would otherwise collapse into one.
    """
    return pd.get_dummies(frame[column], prefix=column)


# Training-set indicator columns.
countrycols = _one_hot(X_train, 'country')
provincecols = _one_hot(X_train, 'province')
regioncols = _one_hot(X_train, 'region_1')
varietycols = _one_hot(X_train, 'variety')
winerycols = _one_hot(X_train, 'winery')

# Test-set indicator columns. These are encoded independently, so their column
# sets can differ from the training ones; later cells reconcile them.
countrycols_test = _one_hot(X_test, 'country')
provincecols_test = _one_hot(X_test, 'province')
regioncols_test = _one_hot(X_test, 'region_1')
varietycols_test = _one_hot(X_test, 'variety')
winerycols_test = _one_hot(X_test, 'winery')


In [7]:
# scaling numeric features
# copy=False lets the scaler work in place on the array it is handed.
scaler = StandardScaler(copy=False)

# Fit on the training prices only, then apply the fitted scaler to both splits.
scaler.fit(X_train[["price"]])
price = pd.DataFrame(scaler.transform(X_train[["price"]]), columns=["price"])
price_test = pd.DataFrame(scaler.transform(X_test[["price"]]), columns=["price"])

In [8]:
# joining categorical and numeric features
# Pieces are listed in the order their columns should appear in the result.
train_parts = [price, countrycols, provincecols, regioncols, varietycols, winerycols]
test_parts = [
    price_test,
    countrycols_test,
    provincecols_test,
    regioncols_test,
    varietycols_test,
    winerycols_test,
]

# Column-wise concatenation; rows are matched on the index labels of each piece.
X_train = pd.concat(train_parts, axis="columns")
X_test = pd.concat(test_parts, axis="columns")

In [9]:
# Keep only the first occurrence of each column name in either frame.
train_dup_mask = X_train.columns.duplicated(keep="first")
test_dup_mask = X_test.columns.duplicated(keep="first")

X_train = X_train.loc[:, ~train_dup_mask]
X_test = X_test.loc[:, ~test_dup_mask]

In [10]:
# Preserve the training frame's own column order. X_train is later converted
# to a matrix in this order, so the test frame must be arranged the same way;
# going through a set() would give an arbitrary order that only X_test follows.
features_to_train = list(X_train.columns)

# Training columns that never occurred in the test split, in training order.
test_columns = set(X_test.columns)
features_not_in_test = [
    feature for feature in features_to_train if feature not in test_columns
]

# Add the missing columns as all-zero indicators in one concat rather than
# inserting them one at a time.
if features_not_in_test:
    zero_block = pd.DataFrame(0, index=X_test.index, columns=features_not_in_test)
    X_test = pd.concat([X_test, zero_block], axis=1)

In [11]:
# Keep exactly the training features, in the order given by features_to_train;
# any column that exists only in the test frame is dropped.
X_test = X_test.loc[:, features_to_train]

In [12]:
# Convert the tabular features to CSR sparse matrices. The frames mix a float
# column with indicator columns whose dtype depends on the pandas version
# (and integer zero-fill columns in the test frame); casting to float first
# guarantees a numeric array instead of an object array, which scipy.sparse
# cannot store.
X_train = sp.csr_matrix(X_train.astype(float).values)
X_test = sp.csr_matrix(X_test.astype(float).values)

In [13]:
# Sanity check: both matrices should report the same number of columns.
for matrix in (X_train, X_test):
    print(matrix.shape)

(338, 524)
(85, 524)


In [14]:
# Append the TF-IDF description columns to the right of the tabular features.
# Note: this rebinds X_train / X_test, so re-running the cell stacks the text
# columns a second time.
X_train = sp.hstack((X_train, description_train))
X_test = sp.hstack((X_test, description_test))

In [18]:
# Candidate hyper-parameters; the grid search tries every combination.
param_dist = dict(
    C=[0.1, 1, 10],
    kernel=['rbf', 'poly'],
)

# 10-fold cross-validated search over SVR settings, scored by negative MSE
# and run on all available cores.
svr_cv_clf = GridSearchCV(
    estimator=SVR(),
    param_grid=param_dist,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

# Left as the cell's last expression so the fitted search object is displayed.
svr_cv_clf.fit(X_train, y_train)


The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.





GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [16]:
# Echo the grid-search object so its configuration is shown as the cell output.
svr_cv_clf

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'c': [0.1, 1, 10], 'kernel': ['rbf', 'poly']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)