# Read model data with pandas

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data_model.csv')

In [3]:
df.head(2)

Unnamed: 0,Ranking,Rating,Number of Reviews,branches,n_styles,negative,neutral,positive,high,low,...,Paris,Prague,Rome,Stockholm,Vienna,Warsaw,Zurich,Ranking_squared,NoR_squared,Rank_NoR_multi
0,5570.0,3.5,194.0,0,3,0,1,0,0,0,...,1,0,0,0,0,0,0,31024900.0,37636.0,1080580.0
1,1537.0,4.0,10.0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,2362369.0,100.0,15370.0


# Split the dataframe into parts necessary for training and testing the model

In [4]:
# X - data with information about restaurants, y - target variable (restaurant ratings)

X = df.drop(['Rating'], axis=1)
y = df['Rating']

In [5]:
# import tool for splitting:
from sklearn.model_selection import train_test_split

In [6]:
# Datasets labeled "train" will be used for training the model, "test" for testing.
# For testing we will use 25% of the original dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Create, train and test the model

In [7]:
# Import necessary libraries:
# tool for creating and training a model
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics  # tools for assessing model accuracy

In [8]:
# Create model
regr = RandomForestRegressor(n_estimators=100)

# Train the model on a test dataset
regr.fit(X_train, y_train)

# We use a trained model to predict restaurant ratings in a test sample.
# Predicted values are written to the y_pred variable
y_pred = regr.predict(X_test)

In [9]:
# Compare the predicted values (y_pred) with the real ones (y_test), and see how much they differ on average
# The metric is called the Mean Absolute Error (MAE) and shows the average deviation of the predicted values from the actual ones.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.15411599999999998
