In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR


# --- Configuration -----------------------------------------------------------
# The CSV location can be overridden with the WINE_REVIEWS_CSV environment
# variable so the notebook is runnable on machines other than the author's.
# The default is the original hard-coded path, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    'WINE_REVIEWS_CSV',
    '/Users/sabuhiaghayev/Downloads/winemag-data-130k-v2.csv',
)
RANDOM_STATE = 42   # seed for the train-sample draw
TARGET = 'points'   # regression target column

sample_fraction = 0.1  # fraction of rows used for training; the rest is the test set

numerical_features = ['price']
categorical_features = ['variety']
text_features = ['description']

# Columns that must be non-null for a row to be usable. Derived from the target
# and feature lists so the two cannot drift apart; the resulting order is
# ['points', 'price', 'description', 'variety'].
required_columns = [TARGET] + numerical_features + text_features + categorical_features

# --- Load and split ----------------------------------------------------------
wine_reviews = pd.read_csv(DATA_PATH)

# Train on a random sample and evaluate on every row that was not sampled.
train_df = wine_reviews.sample(frac=sample_fraction, random_state=RANDOM_STATE)
test_df = wine_reviews.drop(train_df.index)

# Rows with a missing target or feature are removed after the split, so the
# effective train/test sizes are smaller than the nominal split.
train_df = train_df.dropna(subset=required_columns)
test_df = test_df.dropna(subset=required_columns)

# --- Feature engineering -----------------------------------------------------
# Every transformer is fitted on the training rows only and then applied to the
# test rows, so no test-set statistics leak into the features.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(train_df[numerical_features])
X_test_num = scaler.transform(test_df[numerical_features])

# handle_unknown='ignore': a variety that appears only in the test rows does
# not raise; it is encoded without any active category column.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_cat = encoder.fit_transform(train_df[categorical_features])
X_test_cat = encoder.transform(test_df[categorical_features])

# TF-IDF over the review text, English stop words removed, vocabulary capped at
# 500 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=500)
X_train_text = tfidf.fit_transform(train_df[text_features[0]]).toarray()
X_test_text = tfidf.transform(test_df[text_features[0]]).toarray()

# NOTE(review): the one-hot and TF-IDF blocks are materialised as dense arrays
# here. With the test set being the large side of the split this can use a lot
# of memory; if that becomes a problem, consider keeping them sparse and
# stacking with scipy.sparse.hstack (confirm the score is unchanged first).
X_train_combined = np.hstack((X_train_num, X_train_cat, X_train_text))
X_test_combined = np.hstack((X_test_num, X_test_cat, X_test_text))

y_train = train_df[TARGET].values
y_test = test_df[TARGET].values

# --- Model -------------------------------------------------------------------
# Presumably chosen by an earlier hyperparameter search that is not part of
# this cell -- TODO confirm and link to it.
best_params = {'C': 10, 'epsilon': 0.2, 'gamma': 'scale'}

best_svr = SVR(**best_params)
best_svr.fit(X_train_combined, y_train)

# --- Evaluation --------------------------------------------------------------
y_pred = best_svr.predict(X_test_combined)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 3.2337963937665637
