In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns


from scipy import stats
from scipy.stats import norm, skew

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the cleaned arabica table from the coffee-quality-database input folder
# (the path is relative to the notebook's working directory).
arabica_data = pd.read_csv('../input/coffee-quality-database-from-cqi/arabica_data_cleaned.csv')
# Show the first rows as this cell's output.
arabica_data.head()

# Clustering
### Reference (my notebook): [Coffee_Clustering](https://www.kaggle.com/choihanbin/coffee-clustering)

In [None]:
# Keep only the washed / wet processed lots. `.copy()` makes this an independent
# frame, so the column assignments below never write into a slice of `arabica_data`
# (the resulting SettingWithCopyWarning would be hidden by the global warning filter).
arabica_data_wet = arabica_data.loc[arabica_data['Processing.Method'] == 'Washed / Wet'].copy()

# Lots without a recorded variety are grouped under 'Other'.
arabica_data_wet['Variety'] = arabica_data_wet['Variety'].fillna('Other')

# Coffee_Name is '<country>_<variety>', or just '<country>' when the variety is 'Other'.
# Built with vectorised string concatenation instead of a per-row `.iloc` loop; a missing
# country now yields a missing Coffee_Name instead of raising a TypeError.
origin = arabica_data_wet['Country.of.Origin']
variety = arabica_data_wet['Variety']
arabica_data_wet['Coffee_Name'] = (origin + '_' + variety).where(variety != 'Other', origin)

In [None]:
# Ordering: rank the six taste scores within every row (1 = highest score).
tastes = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance']
uniformity_sweetness = ['Uniformity', 'Sweetness']

# One vectorised row-wise rank replaces the per-row sort plus the chained
# `df[col].iloc[i] = value` writes: chained assignment is not guaranteed to reach the
# frame and never does under pandas Copy-on-Write, which would leave every rating at 0.
# method='first' breaks ties by the column order of `tastes`; na_option='bottom' ranks
# missing scores last so the integer cast below cannot fail.
taste_ranks = (
    arabica_data_wet[tastes]
    .rank(axis = 1, method = 'first', ascending = False, na_option = 'bottom')
    .astype('int64')
)

for taste in tastes:
    arabica_data_wet['{}_Rating'.format(taste)] = taste_ranks[taste]

In [None]:
### Ordering (encoding the full ranking of the six tastes would need 6 * 5 * 4 * 3 * 2 * 1 columns; the dataset is too small for that, so these features are not built for now.)
# NOTE: the two triple-quoted blocks below are bare string literals, i.e. disabled code
# kept for reference only; nothing inside them is executed.

"""arabica_data_wet['Ordering'] = 0
Ordering = set()

features = tastes + uniformity_sweetnees + ['Ordering']
for column in arabica_data_wet.columns:
    if column in features:
        coffee[column] = arabica_data_wet[column]
"""

"""
# categorical feature : 더미변수후 cluster_data와 합침
Order_dummies =  pd.concat((coffee['Coffee_Name'], pd.get_dummies(coffee['Ordering'])), axis = 1).groupby('Coffee_Name').max().reset_index()
cluster_data = pd.concat((cluster_data, Order_dummies.drop(['Coffee_Name'], axis = 1)), axis = 1)
cluster_data.shape
"""

In [None]:
# Numerical features: one row per coffee name holding the mean of every score column.
ratings = ['{}_Rating'.format(taste) for taste in tastes]
features = tastes + uniformity_sweetness + ratings

# A single grouped aggregation; reset_index() turns the group key back into the
# leading 'Coffee_Name' column, followed by the feature means in `features` order.
cluster_data = (
    arabica_data_wet
    .groupby('Coffee_Name')[features]
    .mean()
    .reset_index()
)
    

In [None]:
# Feature matrix: every column except the coffee name and, when this cell is re-run,
# the cluster label assigned at the bottom (otherwise the previous labels would leak
# into the features on the second execution).
cluster_features = cluster_data.drop(columns = ['Coffee_Name', 'Cluster'], errors = 'ignore')

# t-SNE is run on the raw (unscaled) features.
model = TSNE(n_components = 2, random_state = 0, perplexity = 50)
tsne = model.fit_transform(cluster_features.values)

# PCA is run on the standardised features. The scaled matrix `s` used to be computed
# and then ignored, so PCA was actually fitted on the unscaled data.
std = StandardScaler()
s = std.fit_transform(cluster_features)

pca = PCA(n_components = 7)
pca.fit(s)
pc = pca.transform(s)

# Fixed seed so the cluster labels are reproducible between runs.
kmeans = KMeans(n_clusters = 35, random_state = 0)
kmeans.fit(pc)

fr = pd.DataFrame({'tsne1' : tsne[:,0], 'tsne2' : tsne[:, 1], 'cluster' : kmeans.labels_})
#sns.lmplot(data = fr, x = 'tsne1', y = 'tsne2', hue = 'cluster', fit_reg = False)
# Share of the total variance kept by the 7 principal components.
print(np.sum(pca.explained_variance_ratio_))

cluster_data['Cluster'] = kmeans.labels_

# Blending Classification by cluster.

In [None]:
# Blending Classification by Cluster
# Country : [Country1, Country2 ...]
# Rate : [0.4, 0.2, ...] / {X1 + X2 + ... + Xn = 1}

def blending_clustering(Country, Rate, Variety = None):
    """Predict which cluster a blend of coffees falls into and print the result.

    The blend's feature vector is the Rate-weighted sum of the per-component mean
    of every column in `features`, taken from `arabica_data_wet`. A decision tree
    fitted on `cluster_data` (features -> 'Cluster') then classifies that vector.

    Parameters
    ----------
    Country : sequence of str
        Country of origin of each blend component.
    Rate : sequence of float
        Weight of each component, aligned with `Country`. The weights are meant
        to sum to 1; this is not enforced.
    Variety : sequence of (str or None), optional
        Variety of each component, aligned with `Country`. `None` (for the whole
        argument or for one entry) averages over every variety of that country.
        Otherwise the rows whose 'Coffee_Name' equals '<country>_<variety>' are used.

    Returns
    -------
    None
        The blend features, the predicted cluster and the coffee names sharing
        that cluster are printed.

    Raises
    ------
    ValueError
        If the argument lengths differ, or if a component matches no rows (its
        mean would be NaN and the prediction meaningless).
    """
    if len(Country) != len(Rate):
        raise ValueError("Country and Rate must have the same length.")
    if Variety is None:
        Variety = [None] * len(Country)
    elif len(Variety) != len(Country):
        raise ValueError("Variety must have the same length as Country.")

    # Weighted sum of the component means, one entry per feature.
    blend = pd.Series(0.0, index = features)
    for country, rate, variety in zip(Country, Rate, Variety):
        if variety is None:
            label = country
            selected = arabica_data_wet['Country.of.Origin'] == country
        else:
            label = "{}_{}".format(country, variety)
            selected = arabica_data_wet['Coffee_Name'] == label
        if not selected.any():
            raise ValueError("No samples found for blend component '{}'.".format(label))
        blend = blend + arabica_data_wet.loc[selected, features].mean() * rate
    blend = pd.DataFrame([blend])


    # Modeling : DecisionTreeClassifier()

    # Train on exactly the `features` columns so the training matrix and the blend row
    # share the same column order; the seed keeps the fitted tree reproducible.
    model = DecisionTreeClassifier(random_state = 0)
    model.fit(cluster_data[features], cluster_data['Cluster'])
    print(blend)
    # Predict once and reuse the scalar label.
    predicted = model.predict(blend)[0]

    print("\n cluster는 '{}'입니다. \n".format(int(predicted)))
    print("\n 같은 cluster 안에 '{}'이 있습니다.".format(list(cluster_data['Coffee_Name'].loc[cluster_data['Cluster'] == predicted])))

    return

# List the countries of origin available in this dataset.
def check_Country():
    """Return the set of 'Country.of.Origin' values found in `arabica_data_wet`."""
    origins = arabica_data_wet['Country.of.Origin']
    return set(origins)

# List the varieties recorded for one country.
def check_Variety(Country):
    """Return the set of 'Variety' values of the `arabica_data_wet` rows whose
    'Country.of.Origin' equals `Country`."""
    from_country = arabica_data_wet['Country.of.Origin'] == Country
    varieties = arabica_data_wet['Variety'].loc[from_country]
    return set(varieties.values)



In [None]:
# Countries of origin that can be used as blend components.
check_Country()

In [None]:
# Varieties recorded for Brazil.
check_Variety('Brazil')

In [None]:
# Example: an equal-weight blend of four origins. A `None` variety entry averages over
# all rows of that country; Ethiopia is restricted to the 'Ethiopian Yirgacheffe' variety.
Country = ['Costa Rica', 'Ethiopia', 'Colombia', 'Brazil']
Rate = [.25, .25, .25, .25]
Variety = [None, 'Ethiopian Yirgacheffe', None, None]
blending_clustering(Country, Rate, Variety)

# Summary

### blending
An arithmetic mean of the component scores cannot fully represent how a real blend tastes. Even so, when checking the clusters, the classification is not bad. However, if I develop further classification models inside the `blending_clustering` function, such as Logistic Regression, Random Forest or SVM, and ensemble them, it could be more accurate. I will upgrade them.