In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [2]:
path = "Uganda Coffee_Cupping.xlsx"

# Load Training Data
training_data = pd.read_excel(path, sheet_name="Training Data")

# Load Test Data 1 (Kayunga)
test_data_kayunga = pd.read_excel(path, sheet_name="Test Data 1")

# Load Test Data 2 (Rwenzori)
test_data_rwenzori = pd.read_excel(path, sheet_name="Test Data 2")


In [3]:
# Display first few rows of the training data
print(training_data.head())

# Check the shape of the training data
print(training_data.shape)

# Check for missing values
print(training_data.isnull().sum())

# Get statistical summary
print(training_data.describe())


  ROBUSTA CUP ANNALYSIS RESULTS   26th, Feb, 2018 Unnamed: 1       Unnamed: 2  \
0                                          ORIGIN    VARIETY  FRAGRANCE/AROMA   
1                                          Ibanda        KR3                7   
2                                          Ibanda        KR3                8   
3                                          Ibanda        KR3             8.25   
4                                          Ibanda        KR3              7.5   

  Unnamed: 3  Unnamed: 4     Unnamed: 5  Unnamed: 6  Unnamed: 7 Unnamed: 8  \
0     FLAVOR  SALT/ ACID  BITTER/ SWEET  AFTERTASTE  MOUTH FEEL    BALANCE   
1          6         6.5           6.75         6.5        6.75          8   
2       7.75           7              7        7.25           7          7   
3       7.25           7              7        6.75        6.75          7   
4       7.75         7.5            7.5        7.25        7.75        7.5   

   Unnamed: 9 Unnamed: 10    Unnamed: 11  
0

In [5]:
print(training_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 12 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   ROBUSTA CUP ANNALYSIS RESULTS   26th, Feb, 2018  76 non-null     object
 1   Unnamed: 1                                       76 non-null     object
 2   Unnamed: 2                                       76 non-null     object
 3   Unnamed: 3                                       76 non-null     object
 4   Unnamed: 4                                       76 non-null     object
 5   Unnamed: 5                                       76 non-null     object
 6   Unnamed: 6                                       76 non-null     object
 7   Unnamed: 7                                       76 non-null     object
 8   Unnamed: 8                                       76 non-null     object
 9   Unnamed: 9                                   

In [6]:
print(test_data_kayunga.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 12 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   ROBUSTA CUP ANNALYSIS RESULTS   22nd, April, 2023  26 non-null     object
 1   Unnamed: 1                                         26 non-null     object
 2   Unnamed: 2                                         26 non-null     object
 3   Unnamed: 3                                         26 non-null     object
 4   Unnamed: 4                                         26 non-null     object
 5   Unnamed: 5                                         26 non-null     object
 6   Unnamed: 6                                         26 non-null     object
 7   Unnamed: 7                                         26 non-null     object
 8   Unnamed: 8                                         26 non-null     object
 9   Unnamed: 9             

In [7]:
print(test_data_rwenzori.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 12 columns):
 #   Column                                          Non-Null Count  Dtype 
---  ------                                          --------------  ----- 
 0   ROBUSTA CUP ANNALYSIS RESULTS   1st, May, 2023  26 non-null     object
 1   Unnamed: 1                                      26 non-null     object
 2   Unnamed: 2                                      26 non-null     object
 3   Unnamed: 3                                      26 non-null     object
 4   Unnamed: 4                                      26 non-null     object
 5   Unnamed: 5                                      26 non-null     object
 6   Unnamed: 6                                      26 non-null     object
 7   Unnamed: 7                                      26 non-null     object
 8   Unnamed: 8                                      26 non-null     object
 9   Unnamed: 9                                      26 non-n

In [8]:
# Check for missing values in the training data
print(training_data.isnull().sum())

# Check for missing values in the test data for Kayunga
print(test_data_kayunga.isnull().sum())

# Check for missing values in the test data for Rwenzori
print(test_data_rwenzori.isnull().sum())


ROBUSTA CUP ANNALYSIS RESULTS   26th, Feb, 2018    0
Unnamed: 1                                         0
Unnamed: 2                                         0
Unnamed: 3                                         0
Unnamed: 4                                         0
Unnamed: 5                                         0
Unnamed: 6                                         0
Unnamed: 7                                         0
Unnamed: 8                                         0
Unnamed: 9                                         0
Unnamed: 10                                        0
Unnamed: 11                                        0
dtype: int64
ROBUSTA CUP ANNALYSIS RESULTS   22nd, April, 2023     0
Unnamed: 1                                            0
Unnamed: 2                                            0
Unnamed: 3                                            0
Unnamed: 4                                            0
Unnamed: 5                                            0
Unnamed: 6     

In [9]:
# Drop rows with missing values in the training data
training_data = training_data.dropna()

# Drop rows with missing values in the test data for Kayunga
test_data_kayunga = test_data_kayunga.dropna()

# Drop rows with missing values in the test data for Rwenzori
test_data_rwenzori = test_data_rwenzori.dropna()


In [13]:
test_data_kayunga = test_data_kayunga.dropna()
print("Number of duplicate rows in test_data_kayunga:", duplicates.sum())

Number of duplicate rows in test_data_kayunga: 0


In [14]:
ttest_data_rwenzori = test_data_rwenzori.dropna()
print("Number of duplicate rows in test_data_rwenzori:", duplicates.sum())

Number of duplicate rows in test_data_rwenzori: 0


In [11]:
# Check for duplicate rows in the training data
duplicates = training_data.duplicated()
print("Number of duplicate rows in training data:", duplicates.sum())


Number of duplicate rows in training data: 0


In [15]:
import matplotlib.pyplot as plt

# Histogram of overall scores in the training data
plt.hist(training_data["OVERALL SCORE"], bins=10)
plt.xlabel("Overall Score")
plt.ylabel("Frequency")
plt.title("Distribution of Overall Scores")
plt.show()


KeyError: 'OVERALL SCORE'

In [16]:
import seaborn as sns

# Pairwise scatter plot of selected variables in the training data
selected_vars = ["FRAGRANCE/AROMA", "FLAVOR", "AFTERTASTE", "OVERALL SCORE"]
sns.pairplot(training_data[selected_vars])
plt.show()


KeyError: "None of [Index(['FRAGRANCE/AROMA', 'FLAVOR', 'AFTERTASTE', 'OVERALL SCORE'], dtype='object')] are in the [columns]"

In [17]:
# Box plot of the overall scores in the training data
sns.boxplot(x=training_data["OVERALL SCORE"])
plt.xlabel("Overall Score")
plt.title("Box Plot of Overall Scores")
plt.show()


KeyError: 'OVERALL SCORE'