**Tutorial: Wine EDA, Clustering and PCA**

1. Perform an EDA of the dataset in order to explain the variables it contains and how they affect the different values.
2. Develop a clustering model to compare the results vs the variable "Customer_Segment".
3. Develop a predictive model using PCA and test the output.

**EDA (Exploratory Data Analysis)**

As a first step for the EDA, we load the data in order to identify all the variables and values inside our file.

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Read the wine dataset into a DataFrame and preview its first rows.
df = pd.read_csv("../input/wine-pca/Wine.csv")
df.head()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Check null values: count the missing entries in each column
df.isnull().sum()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

As a second step, we remove "Customer_Segment" from our data, because that column is the result that we want to obtain, so we don't need it for this first process.

In [None]:
# Remove the segment label from the working frame and preview what is left.
df = df.drop(columns=["Customer_Segment"])
df.head()

In [None]:
# Density plot (histogram + KDE curve) for each attribute.
# The 5x3 grid provides 15 axes; any axes left without a column are hidden.
fig, ax = plt.subplots(5, 3, figsize=(14, 12))
axes_ = ax.ravel()
for axis, c in zip(axes_, df.columns):
    # histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
    sns.histplot(df[c], kde=True, stat="density", ax=axis, color='red')
for axis in axes_[len(df.columns):]:
    axis.set_visible(False)
# Adjust the layout once, after every subplot has been drawn.
plt.tight_layout()

In [None]:
# Box plot for each attribute.
# The 5x3 grid provides 15 axes; any axes left without a column are hidden.
fig, ax = plt.subplots(5, 3, figsize=(14, 12))
axes_ = ax.ravel()
for axis, c in zip(axes_, df.columns):
    # Pass the values as x= explicitly: a bare positional vector is read as
    # `x` or as `data` depending on the seaborn version.
    sns.boxplot(x=df[c], ax=axis, color='orange')
for axis in axes_[len(df.columns):]:
    axis.set_visible(False)
# Adjust the layout once, after every subplot has been drawn.
plt.tight_layout()

In [None]:
# Spearman rank-correlation matrix drawn as an annotated heatmap.
spearman_matrix = df.corr(method='spearman')
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(spearman_matrix, ax=ax, annot=True, fmt=".1f", linewidths=1)
plt.show()

In [None]:
# Show only the Spearman coefficients above the threshold; every other cell
# of the matrix is displayed as NaN.
th = 0.6
corr = df.corr(method='spearman')
corr.where(corr > th)

After this first analysis we notice the following:
* There is a strong positive (Spearman) correlation between Total_Phenols, Flavanoids, Proanthocyanins and OD280.
* There is a strong positive (Spearman) correlation between Alcohol, Color_Intensity and Proline.

Now we can remove some of those variables in order to avoid redundant information which can affect the results of our analysis.

In [None]:
# Columns judged redundant from the correlation analysis are removed.
redundant_columns = ["Flavanoids","Proanthocyanins","Color_Intensity","OD280","Proline"]
df = df.drop(columns=redundant_columns)
df.head()

**Clustering**

Before selecting any clustering model we have to standardize our data, because clustering algorithms need all the data on the same scale.

In [None]:
# Data scaling: standardize every remaining feature with StandardScaler.
from sklearn import preprocessing
new_df = preprocessing.StandardScaler().fit_transform(df)
# fit_transform returns a bare array, so rebuild the frame with the labels of
# the source frame. Taking them from df keeps the scaled frame in sync with
# whatever columns were kept, instead of repeating the names by hand.
new_df = pd.DataFrame(new_df, columns=df.columns, index=df.index)
new_df.head()

In [None]:
# Visualization of scaled and unscaled data: one KDE curve per feature,
# unscaled values on the left axes and standardized values on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))

ax1.set_title('Before Scaling')
ax2.set_title('After Standard Scaler')
# Iterate over the scaled frame's own columns so the plot always covers
# exactly the features that were scaled.
for col in new_df.columns:
    sns.kdeplot(df[col], ax=ax1, label=col)
    sns.kdeplot(new_df[col], ax=ax2, label=col)
# Explicit labels plus a legend make each curve identifiable.
ax1.legend()
ax2.legend()
plt.show()

Now we are going to identify the best number of clusters for our data using the elbow method.

In [None]:
# Feature matrix for clustering: the scaled Alcohol and Malic_Acid columns
# as a NumPy array (read as a global by elbow_method and the K-Means cell).
X = new_df[["Alcohol","Malic_Acid"]].values  
from sklearn.cluster import KMeans
def elbow_method(epsilon, figure=False, data=None):
    """Grow the number of K-Means clusters until the WCSS stops improving.

    K-Means is fitted for k = 1, 2, 3, ... and the within-cluster sum of
    squares (``inertia_``) of each fit is recorded. The search stops at the
    first k whose relative WCSS reduction with respect to k - 1 is not
    greater than ``epsilon``, or as soon as a fit reaches a WCSS of zero.

    Args:
        epsilon: Stopping threshold. For k >= 2 it is compared with the
            relative reduction ``(wcss[-1] - inertia) / wcss[-1]``; for
            k = 1 it is compared with the raw WCSS of that single fit.
        figure: When true, plot the recorded WCSS against the cluster count.
        data: Samples to cluster. Defaults to the module-level ``X``.

    Returns:
        A ``(wcss, k)`` tuple. ``wcss`` is the list of recorded WCSS values
        (index 0 belongs to k = 1). ``k`` is the cluster counter minus one
        at exit: the last cluster count fitted on a normal stop, and one
        less than that when the loop ended on the zero-WCSS break.
    """
    if data is None:
        data = X

    wcss = []
    diff = np.inf
    i = 1

    while diff > epsilon:
        print("Iteration Nº Clusters: k: {k}".format(k=i))
        kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300,n_init = 10, random_state = 0)
        kmeans.fit(data)

        if diff == np.inf:
            # First fit: there is no previous WCSS to compare with yet.
            diff = kmeans.inertia_
        elif kmeans.inertia_ == 0:
            # A perfect fit cannot improve further; stopping here also keeps
            # a zero out of the denominator of the next relative reduction.
            wcss.append(kmeans.inertia_)
            break
        else:
            diff = (wcss[-1] - kmeans.inertia_)/wcss[-1]
        wcss.append(kmeans.inertia_)
        i += 1

    if figure:
        # wcss[0] belongs to k = 1, so the x axis starts at 1, not 0.
        plt.plot(range(1, len(wcss) + 1), wcss)
        plt.title('Elbow Method')
        plt.xlabel('Clusters Number')
        plt.ylabel('WCSS')
        plt.show()
    k = i-1
    return wcss, k

# Results Plot
# Run the elbow search with a small threshold (0.05) so that more cluster
# counts are evaluated, and draw the WCSS curve. Only the WCSS list is kept.
epsilon = 0.05 
wcss, _ = elbow_method(epsilon, figure=True)    


In [None]:
# Optimal K value
# Re-run the search without plotting, using a looser threshold (0.33), and
# keep only the cluster count returned as the second element.
epsilon = 0.33
_, k = elbow_method(epsilon, figure=False)

In [None]:
# K-means: fit the model with the selected number of clusters and get the
# cluster label of every sample.
kmeans = KMeans(n_clusters = k, init= 'k-means++', max_iter = 300, n_init =10, random_state = 0)
y_kmeans = kmeans.fit_predict(X)

### Plot clusters
# Draw one scatter per fitted cluster. Looping over range(k) keeps the plot
# and its legend correct for any k, instead of assuming exactly four clusters.
# Yellow is left out of the palette because the centroids use it.
cluster_colors = ['blue', 'red', 'green', 'cyan', 'magenta', 'orange', 'purple', 'brown']
for cluster_id in range(k):
    members = X[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 100,
                c = cluster_colors[cluster_id % len(cluster_colors)],
                label = 'C{}'.format(cluster_id + 1))

plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('WinesClusters')
plt.xlabel('X1: Alcohol')
plt.ylabel('X2: Malic_Acid')
plt.legend()
plt.show()


**PCA Model**

In [None]:
# Reload the CSV into a separate frame, since df had columns dropped earlier.
df2 = pd.read_csv("../input/wine-pca/Wine.csv")
df2.head()

In [None]:
# Remove the segment label, then show summary statistics of what remains.
df2 = df2.drop(columns=["Customer_Segment"])
df2.describe()


In [None]:
# Features are every column except the last one; the target is the last column.
# NOTE(review): Customer_Segment is already dropped from df2, so the target is
# whatever column is now last (presumably Proline) - confirm this is intended.
X = df2.iloc[:, :-1].values
y = df2.iloc[:, -1].values

# Train/test split: 25% of the rows are held out, with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,random_state = 0)

# Feature scaling: statistics are learned on the training rows only and then
# applied to both partitions.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### PCA keeping every component, to inspect how the variance is distributed.
from sklearn.decomposition import PCA
pca = PCA(n_components = None)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

explained_variance = pca.explained_variance_ratio_
print("Varianza Explicada por cada PC")
print(explained_variance)
# Share of the variance captured by the first five components.
var_exp = np.round(explained_variance[:5].sum(), 4)
print("Con 5 PC se explicaría el {var}% de la varianza".format(var=var_exp*100))

In [None]:
# Refit the PCA keeping only the first 5 principal components.
pca = PCA(n_components = 5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

explained_variance = pca.explained_variance_ratio_
print("Varianza Explicada por cada PC")
print(explained_variance)
print("Parámetros del Modelo")
print(pca.components_)

# Bar chart of the variance ratio explained by each principal component.
pc_labels = ['PC1','PC2','PC3','PC4', 'PC5']
variance_by_pc = pd.DataFrame({'var': explained_variance, 'PC': pc_labels})
sns.barplot(data=variance_by_pc, x='PC', y="var", color="c")

In [None]:
### Regression model
# Linear regression (OLS) of the target on the principal-component scores.
import statsmodels.api as sm
# sm.OLS does not include an intercept unless one is supplied. The PCA scores
# are centred but y_train is not, so a constant column is added; otherwise the
# fit is forced through the origin.
model = sm.OLS(y_train, sm.add_constant(X_train_pca)).fit()
model.summary()

In [None]:
# Random forest regressor trained on the principal-component scores; the
# importance of each component is printed afterwards.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=0,
).fit(X_train_pca, y_train)
print("Relevancia de los parámetros")
print(model.feature_importances_)

In [None]:
# Using 2 PCs: refit the PCA with two components and report what it learned.
pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train)

explained_variance = pca.explained_variance_ratio_
print("Varianza Explicada por cada PC")
print(explained_variance)
print("Parámetros del Modelo")
print(pca.components_)

# Project the held-out rows with the components fitted on the training rows.
X_test_pca = pca.transform(X_test)


In [None]:
# Random forest on the two principal components, evaluated on the test split.
# The metric functions are imported here because no visible cell imports them.
from sklearn.metrics import mean_squared_error, r2_score

model = RandomForestRegressor(max_depth=5, random_state=0,
 n_estimators=100)
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)
r2 = r2_score(y_test, y_pred)
# mean_squared_error yields the MSE, so it is named and printed as such
# (it was previously labelled "mae").
mse = mean_squared_error(y_test, y_pred)
print("r2: ", r2, "mse: ", mse)

In [None]:
# Scatter of the training samples on the first two principal components.
# Column 0 (PC1) is on the x axis and column 1 (PC2) on the y axis, so the
# axis labels follow that order.
plt.scatter(X_train_pca[:,0], X_train_pca[:,1])
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Representación Gráfica de las PC")
plt.show()