In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the coffee review dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='coffee_analysis.csv')

In [8]:
# Preview the first rows of the loaded frame.
print(df.head())

# Summarise column dtypes and non-null counts. DataFrame.info() writes its
# report to stdout itself and returns None, so it must not be wrapped in
# print() (that would emit a stray "None" line after the report).
df.info()

                         name                    roaster         roast  \
0     “Sweety” Espresso Blend                     A.R.C.  Medium-Light   
1        Flora Blend Espresso                     A.R.C.  Medium-Light   
2    Ethiopia Shakiso Mormora               Revel Coffee  Medium-Light   
3          Ethiopia Suke Quto                Roast House  Medium-Light   
4  Ethiopia Gedeb Halo Beriti  Big Creek Coffee Roasters        Medium   

     loc_country        origin_1           origin_2  100g_USD  rating  \
0      Hong Kong          Panama           Ethiopia     14.32      95   
1      Hong Kong          Africa       Asia Pacific      9.05      94   
2  United States       Guji Zone  Southern Ethiopia      4.70      92   
3  United States       Guji Zone      Oromia Region      4.19      92   
4  United States  Gedeb District         Gedeo Zone      4.85      94   

     review_date                                             desc_1  \
0  November 2017  Evaluated as espresso. Swee

In [33]:
# Text-valued columns that are label-encoded below. The remaining columns
# shown in the preview (100g_USD and the rating target) are numeric.
categorical_features = ['name', 'roaster', 'roast', 'loc_country', 'origin_1', 'origin_2', 'desc_1', 'desc_2', 'desc_3', 'review_date']

# Handle missing values in the categorical columns only. Filling the whole
# frame with 'Unknown' would also write that string into numeric columns and
# the target whenever they contain NaN, turning them into object dtype.
# Numeric gaps are deliberately left as NaN so they surface instead of being
# hidden; impute or drop them explicitly if they occur.
df[categorical_features] = df[categorical_features].fillna('Unknown')

# Label encode categorical features: each distinct value becomes an integer code.
# NOTE: the encoded values overwrite the columns of `df`, so re-running this
# cell re-fits the encoders on integer codes rather than the original strings.
# Reload the CSV first if the encoders are needed for inverse_transform.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store the encoders if you need to reverse transform later

# Select features and target variable
X = df.drop('rating', axis=1)
y = df['rating']

# Split the data into training and testing sets (80/20, fixed seed so the
# split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nShape of training features:", X_train.shape)
print("Shape of testing features:", X_test.shape)
print("Shape of training target:", y_train.shape)
print("Shape of testing target:", y_test.shape)


Shape of training features: (1676, 11)
Shape of testing features: (419, 11)
Shape of training target: (1676,)
Shape of testing target: (419,)


In [34]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in one step (fit() returns the fitted estimator itself).
# Hyperparameters are untuned defaults apart from the two set here.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
).fit(X_train, y_train)

# Predict ratings for the held-out test rows.
y_pred = model.predict(X_test)

In [35]:
# Score the test-set predictions: mean squared error (lower is better) and
# R-squared (share of variance in the ratings explained by the model).
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics rounded to two decimals, one per line.
print(
    f"\nMean Squared Error on Test Set: {mse:.2f}",
    f"R-squared Score on Test Set: {r2:.2f}",
    sep="\n",
)


Mean Squared Error on Test Set: 1.52
R-squared Score on Test Set: 0.37
