In [None]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn import metrics

# Default size for matplotlib figures.
plt.rc("figure", figsize=[12, 7])
# Apply seaborn's theme. The rc override below writes the same 'figure.figsize'
# key again, so (7, 8) is the value in effect once this cell has run.
sns.set(rc={"figure.figsize": (7, 8)})

In [None]:
# Read the cleaned arabica table from the relative input directory and preview it.
csv_path = "../input/coffee-quality-database-from-cqi/arabica_data_cleaned.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Summarise the raw table: one line per column with its dtype and non-null count.
data.info()

In [None]:
# Draw missingno's matrix plot of the raw table to see where values are missing.
msno.matrix(data)

> For this classification problem, I use features covering geography, post-harvest information, defect categories, and the total cupping points. I want to know how these variables affect flavor quality (Total.Cup.Points, the grade given by a Coffee Q Grader).

In [None]:
# Columns kept for the analysis: origin, harvest/post-harvest info, defect counts,
# altitude and the total cupping score.
selected_columns = [
    "Country.of.Origin",
    "Harvest.Year",
    "Variety",
    "Processing.Method",
    "Category.One.Defects",
    "Category.Two.Defects",
    "Quakers",
    "altitude_mean_meters",
    "Total.Cup.Points",
]

# Drop every row with a missing value, then renumber the rows from 0
# (the old index is discarded rather than kept as a column).
df = data[selected_columns].dropna().reset_index(drop=True)
df.head()

In [None]:
# Frame that the preparation cell below works on; same nine columns as `df`.
analysis_columns = [
    "Country.of.Origin",
    "Harvest.Year",
    "Variety",
    "Processing.Method",
    "Category.One.Defects",
    "Category.Two.Defects",
    "Quakers",
    "altitude_mean_meters",
    "Total.Cup.Points",
]
cleaned_df = df[analysis_columns]

cleaned_df.head()
# One box plot per column on a 2x3 grid; axes are not shared, so every
# column keeps its own scale.
cleaned_df.plot.box(subplots=True, layout=(2, 3), sharex=False, sharey=False)
plt.show()

In [None]:
# All the data preparation goes here


def _group_rare_categories(values, max_count, other_label="Others"):
    """Return `values` with infrequent categories replaced by `other_label`.

    Parameters
    ----------
    values : pd.Series
        Column of category labels.
    max_count : int
        Labels that occur `max_count` times or fewer are replaced.
    other_label : str
        Replacement written in place of each rare label.

    Returns
    -------
    pd.Series
        A new series; `values` itself is not modified.
    """
    counts = values.value_counts()
    # Select rare labels by boolean mask on the index; this avoids integer
    # position lookups on a string-labelled Series.
    rare_labels = counts.index[counts <= max_count]
    return values.where(~values.isin(rare_labels), other_label)


# Work on an independent copy so none of the edits below can write through
# to the frame this one was selected from.
cleaned_df = cleaned_df.copy()

# Clean the Harvest.Year feature: collapse two-year spans to their final year.
HARVEST_YEAR_FIXES = {
    "2017 / 2018": "2018",
    "2016 / 2017": "2017",
    "2015/2016": "2016",
    "2014/2015": "2015",
    "2013/2014": "2014",
    "2011/2012": "2012",
}
cleaned_df["Harvest.Year"] = cleaned_df["Harvest.Year"].replace(HARVEST_YEAR_FIXES)

# Group countries of origin that have 5 or fewer coffees under "Others".
cleaned_df["Country.of.Origin"] = _group_rare_categories(
    cleaned_df["Country.of.Origin"], max_count=5
)

# Group varieties that occur only once under "Others".
cleaned_df["Variety"] = _group_rare_categories(cleaned_df["Variety"], max_count=1)

# Remove implausible altitude outliers: drop rows whose mean altitude is
# above 2000 or below 182.
altitude = cleaned_df["altitude_mean_meters"]
altitude_out_of_range = (altitude > 2000) | (altitude < 182)
cleaned_df = cleaned_df.loc[~altitude_out_of_range].copy()

# Fix the column dtypes. DataFrame.astype returns new columns, so the dtype
# really changes; assigning through `.loc[:, col]` can instead write into the
# existing column and keep its old dtype.
cleaned_df = cleaned_df.astype(
    {
        "Category.One.Defects": int,
        "Harvest.Year": int,
        "Total.Cup.Points": float,
        "Quakers": int,
    }
)

# Build the green-bean grade from the total defect count
# (category one + category two + quakers). Intervals are right-closed:
#   (-1, 3] Specialty, (3, 15] Premium, (15, 23] Exchange, (23, 100] Below Standard
# A total above 100 falls outside every bin and becomes NaN.
green_grade_labels = ["Specialty", "Premium", "Exchange", "Below Standard"]
green_grade_bins = [-1, 3, 15, 23, 100]
total_defects = cleaned_df[
    ["Category.One.Defects", "Category.Two.Defects", "Quakers"]
].sum(axis=1)
cleaned_df["Green.Beans.Grade"] = pd.cut(
    total_defects, bins=green_grade_bins, labels=green_grade_labels
)

# Turn Total.Cup.Points into the categorical target Cupping.Grade:
#   (50, 80] UGQ (Usually Good Quality), (80, 84] Premium, (84, 90] Specialty
# NOTE(review): scores <= 50 or > 90 fall outside every bin and become NaN;
# confirm that no such rows reach this point.
cupping_grade_labels = ["UGQ", "Premium", "Specialty"]
cupping_grade_bins = [50, 80, 84, 90]
cleaned_df["Cupping.Grade"] = pd.cut(
    cleaned_df["Total.Cup.Points"], bins=cupping_grade_bins, labels=cupping_grade_labels
)

cleaned_df.info()

> The classes in this dataset are imbalanced. To address this, I am going to apply random oversampling.

In [None]:
# Pick the features that will be used for modelling.
model_columns = [
    "Country.of.Origin",
    "Harvest.Year",
    "Variety",
    "Processing.Method",
    "Green.Beans.Grade",
    "Cupping.Grade",
    "Category.One.Defects",
    "Category.Two.Defects",
    "Quakers",
]
model_df = cleaned_df[model_columns]

# Bar chart of how many rows fall in each cupping grade.
ax = sns.countplot(data=model_df, x="Cupping.Grade")
ax.tick_params(labelsize=15)

In [None]:
# Start again from the prepared frame so re-running this cell does not
# oversample an already oversampled table.
model_columns = [
    "Country.of.Origin",
    "Harvest.Year",
    "Variety",
    "Processing.Method",
    "Green.Beans.Grade",
    "Cupping.Grade",
    "Category.One.Defects",
    "Category.Two.Defects",
    "Quakers",
]
model_df = cleaned_df[model_columns]

# Oversample by duplication: every "UGQ" and every "Specialty" row is appended
# two extra times, so each of those rows ends up three times in the result.
# NOTE(review): the train/test split in a later cell is drawn from this
# duplicated frame, so copies of one row can land in both train and test.
ugq_rows = model_df.loc[model_df["Cupping.Grade"] == "UGQ"]
specialty_rows = model_df.loc[model_df["Cupping.Grade"] == "Specialty"]
model_df = pd.concat([model_df] + [ugq_rows] * 2 + [specialty_rows] * 2)

# Class counts after duplication.
ax = sns.countplot(data=model_df, x="Cupping.Grade")
ax.tick_params(labelsize=15)

In [None]:
# Label Encoding
from sklearn import preprocessing

encode_df = model_df.copy()
column_name = ["Country.of.Origin", "Harvest.Year", "Variety", "Processing.Method", "Green.Beans.Grade", 'Cupping.Grade']

# One fitted encoder per column, so every integer code can still be mapped
# back to its original label after the loop.
encoders = {}
# Decoded (original) values per encoded column, in `column_name` order.
label = list()
for name in column_name:
    encoder = preprocessing.LabelEncoder()
    # Assign the column directly: this replaces it with the integer codes,
    # whereas `.loc[:, name] = ...` may write into the existing column and
    # keep its previous (object / category) dtype.
    encode_df[name] = encoder.fit_transform(encode_df[name])
    encoders[name] = encoder

    decoded = encoder.inverse_transform(encode_df[name])
    label.append(decoded)

    # Print (label, count) pairs, then (code, count) pairs, so the mapping
    # between labels and codes can be read off the output.
    unique, counts = np.unique(decoded, return_counts=True)
    print(np.asarray((unique, counts)).T)
    unique, counts = np.unique(encode_df[name], return_counts=True)
    print(np.asarray((unique, counts)).T)

In [None]:
# Use scikit-learn to split the data into training and testing sets.
from sklearn.model_selection import train_test_split

# Features are every encoded column except the target; the target is Cupping.Grade.
X = encode_df.drop(columns="Cupping.Grade")
Y = encode_df["Cupping.Grade"]

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# Report the size of each part.
for caption, part in (
    ('Training Features Shape:', x_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', x_test),
    ('Testing Labels Shape:', y_test),
):
    print(caption, part.shape)



In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Re-split with a 30% test share and seed 9. This rebinds x_train / x_test /
# y_train / y_test, so the split made in the previous cell is not the one the
# model is trained and evaluated on.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=9)

# Random forest with 30 decision trees, fitted on the training part.
rf = RandomForestClassifier(n_estimators=30, random_state=42)
rf.fit(x_train, y_train)

# Predict the held-out rows and report plain accuracy.
y_pred = rf.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: %0.5f" % accuracy)

In [None]:
from sklearn.tree import export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

import os

# Optional local Graphviz install. Only add it to PATH when the directory
# exists on this machine and is not already listed, so the cell is harmless
# elsewhere and PATH does not grow on every re-run.
graphviz_bin = 'D:/Anaconda/Library/bin/graphviz'
current_path = os.environ.get("PATH", "")
if os.path.isdir(graphviz_bin) and graphviz_bin not in current_path.split(os.pathsep):
    os.environ["PATH"] = current_path + os.pathsep + graphviz_bin

# Class names shown in the tree nodes.
# NOTE(review): assumes the encoded target classes 0, 1, 2 correspond to
# these names in this order - confirm against the label-encoding output.
a = ["Premium", "Specialty", "UGQ"]
labels = X.columns

# Render the forest's tree at index 10, truncated to depth 2, as an inline SVG.
dot_source = tree.export_graphviz(rf[10], feature_names=labels, class_names=a, max_depth=2, filled=True)
graph = Source(dot_source)
display(SVG(graph.pipe(format='svg')))

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# Confusion matrix of the held-out predictions: rows are the true classes,
# columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
# Tick labels for the encoded classes.
# NOTE(review): assumes classes 0, 1, 2 map to these names in this order.
a = ["Premium", "Specialty", "UGQ"]

ax = sns.heatmap(cm, xticklabels=a, yticklabels=a, annot=True, fmt='g')
# Label the axes so the figure can be read on its own.
ax.set(xlabel="Predicted grade", ylabel="True grade", title="Confusion matrix (test set)")

# Per-class precision, recall and F1 (classes shown by their encoded value).
print(classification_report(y_test, y_pred))

In [None]:
import scikitplot as skplt

# Class-membership probabilities for the held-out rows.
y_probas = rf.predict_proba(x_test)
# plot_roc is scikit-plot's replacement for the deprecated plot_roc_curve;
# it is called with the same (y_true, y_probas) arguments.
skplt.metrics.plot_roc(y_test, y_probas)
plt.show()