In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preprocessing

In [None]:
import operator
import matplotlib.pyplot as plt
import seaborn as sns

# Load the red-wine quality CSV from the read-only Kaggle input directory.
df = pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [None]:
# Preview the first rows of the data (head() shows five rows by default).
df.head()

In [None]:
# Column names, non-null counts and dtypes -- a quick check for missing values.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Pairwise scatter plots of the numeric columns, with each column's own
# distribution on the diagonal.
sns.pairplot(df)

In [None]:
# Distribution of the regression target. `sns.distplot` is deprecated in
# seaborn; a density-scaled histogram with a KDE overlay is its replacement.
sns.histplot(df['pH'], kde=True, stat='density')

# Linear Regression vs. Polynomial Regression

# Linear Regression


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

# Single-feature regression input: take the first 11 columns, drop the target
# 'pH', and keep only the first remaining column as a 2-D (n_samples, 1) array.
# Selecting the column directly avoids hard-coding the row count (1599), so the
# cell keeps working if the data set is filtered or replaced.
X = df.iloc[:, 0:11].drop('pH', axis=1).iloc[:, [0]].values
# Target as a column vector; -1 lets NumPy infer the number of rows.
y = df['pH'].values.reshape(-1, 1)

# X_train, X_test, y_train, y_test = train_test_split(X , y, random_state = 101, test_size = 0.2)
X.shape

In [None]:
degree = 1

# Expand X into polynomial terms up to `degree` (degree 1 = plain linear
# regression) and fit ordinary least squares on the full data set.
polf = PolynomialFeatures(degree=degree)
X_poly = polf.fit_transform(X)
lm = LinearRegression().fit(X_poly, y)
pred = lm.predict(X_poly)

# Order the samples by feature value so the fitted curve is drawn left to right.
order = np.argsort(X[:, 0], kind='stable')
X_, pred_ = X[order], pred[order]

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(X, y, s=10)
ax.plot(X_, pred_, color='m')
plt.show()

# Polynomial Regression

Polynomial regression starts at degree 2; the plot below shows the resulting fit.

In [None]:
degree = 2

# Quadratic feature expansion followed by an ordinary least-squares fit on the
# full data set.
polf = PolynomialFeatures(degree=degree)
X_poly = polf.fit_transform(X)
lm = LinearRegression().fit(X_poly, y)
pred = lm.predict(X_poly)

# Order the samples by feature value so the fitted curve is drawn left to right.
order = np.argsort(X[:, 0], kind='stable')
X_, pred_ = X[order], pred[order]

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(X, y, s=10)
ax.plot(X_, pred_, color='m')
plt.show()

What happens when the degree is higher than 2?

**You can input degree here** 

In [None]:
# Degree used when no value can be read from the user.
DEFAULT_DEGREE = 3

# input() raises when the notebook runs without stdin (e.g. a non-interactive
# "Save & Run All"); fall back to the default so the run does not abort.
try:
    degree = int(input("Enter the degree: "))
except (EOFError, ValueError, NotImplementedError):
    degree = DEFAULT_DEGREE
    print("No valid degree entered; using degree = %d" % degree)

# Polynomial feature expansion followed by an ordinary least-squares fit on the
# full data set.
polf = PolynomialFeatures(degree = degree)
X_poly = polf.fit_transform(X)
lm = LinearRegression()
lm.fit(X_poly, y)
pred = lm.predict(X_poly)

# Order the samples by feature value so the fitted curve is drawn left to right.
order = np.argsort(X[:, 0], kind='stable')
X_, pred_ = X[order], pred[order]

plt.figure(figsize=(10, 7))
plt.scatter(X, y, s=10)
plt.plot(X_, pred_, color='m')
plt.show()

# **Evaluation Model Regression parameters**

See the score from our Regression model

In [None]:
# Testing the parameters
# Imported here because this cell runs before the later cell that imports
# `metrics`; without it a fresh "Restart & Run All" fails with NameError.
from sklearn import metrics

# Scores of the most recently fitted model, computed on the same data it was
# trained on (no hold-out split is used in this notebook).
mse = metrics.mean_squared_error(y, pred)
print('MAE:', metrics.mean_absolute_error(y, pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print('R2:', metrics.r2_score(y, pred))

Parameter table: scores for polynomial degrees 1 to 4

In [None]:
from sklearn import metrics
from prettytable import PrettyTable
# Fit one model per polynomial degree (1..4) and print its scores as a table.
for i in range(1, 5):
    polf = PolynomialFeatures(degree=i)
    X_poly = polf.fit_transform(X)
    lm = LinearRegression().fit(X_poly, y)
    pred = lm.predict(X_poly)

    # MSE is reused for the RMSE row.
    mse = metrics.mean_squared_error(y, pred)
    scores = [
        ('MAE', metrics.mean_absolute_error(y, pred)),
        ('MSE', mse),
        ('RMSE', np.sqrt(mse)),
        ('R^2', metrics.r2_score(y, pred)),
    ]

    tabel_parameter = PrettyTable(['Parameter', 'Score'])
    tabel_parameter.add_row(['Polynomial Degree', i])
    for label, value in scores:
        tabel_parameter.add_row([label, '{:.10}'.format(value)])
    print(tabel_parameter)

# **Clustering KMeans**


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

Feature Scaling

In [None]:
# Clustering input: the first 11 columns of the data set.
X = df.iloc[:, 0:11]
y = df['pH']
cols = X.columns
# MinMaxScaler for feature scaling: rescales every column to the [0, 1] range
ms = MinMaxScaler()
X = ms.fit_transform(X)
# Pass the column Index itself. Wrapping it in a list (columns=[cols]) would
# build a one-level MultiIndex instead of plain column labels.
X = pd.DataFrame(X, columns=cols)
X.shape

In [None]:
# Preview the first rows of the scaled feature frame.
X.head()

**Start by initializing with k = 1**

In [None]:
# Baseline model with a single cluster; fit() returns the estimator itself.
kmeans = KMeans(n_clusters=1).fit(X)
# Cluster index assigned to every sample.
pred = kmeans.predict(X)

In [None]:
# Coordinates of the fitted cluster centres (one row per cluster).
kmeans.cluster_centers_

In [None]:
# Cluster index assigned to each sample during fitting.
kmeans.labels_

In [None]:
# Inertia: sum of squared distances from each sample to its closest centre.
print(kmeans.inertia_)

# Finding k with Elbow Method

The elbow method runs k-means clustering on the dataset for a range of values for k (say from 1-10) and then for each value of k computes an average score for all clusters.

In [None]:
# Inertia (within-cluster sum of squares) for k = 1..10.
k_values = range(1, 11)
cs = []
for i in k_values:
    # k-means++ initialisation is random; a fixed seed keeps the curve (and the
    # elbow read off it) the same on every run.
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    cs.append(kmeans.inertia_)
plt.plot(k_values, cs, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()

Judging from the elbow curve, the first point that forms an elbow is at k = 2, although the best k could also be k = 3.

# Finding Balance k with Silhouette Method

In [None]:
from yellowbrick.cluster import silhouette_visualizer
from sklearn.metrics import silhouette_score
n_clusters = 2
# Use one seed for both estimators so the plotted silhouettes and the printed
# average describe the same clustering (previously 42 for the plot and 10 for
# the score).
seed = 42
silhouette_visualizer(KMeans(n_clusters=n_clusters, random_state=seed), X, colors='yellowbrick')
clusterer = KMeans(n_clusters=n_clusters, random_state=seed)
cluster_labels = clusterer.fit_predict(X)
# Mean silhouette coefficient over all samples.
silhouette_avg = silhouette_score(X, cluster_labels)
print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

Based on the silhouette plot, the highest average silhouette score is obtained with 2 clusters, which means k = 2 is the optimal k.

# Comparing Inertia score from 2 different clusters

Enter a value of k below and compare its inertia with that of the chosen k = 2 to judge which is better.

In [None]:
# Input k
# input() raises when the notebook runs without stdin (e.g. a non-interactive
# "Save & Run All"); fall back to a default so the run does not abort.
DEFAULT_K = 3
try:
    k = int(input("Enter the k: "))
except (EOFError, ValueError, NotImplementedError):
    k = DEFAULT_K
    print("No valid k entered; using k = %d" % k)
# Fixed seed so the two inertia values are comparable from run to run.
kmeans = KMeans(n_clusters = k, random_state = 42)
kmeans.fit(X)
inertia = kmeans.inertia_
print("Result inertia from k = %d is: %f" % (k, inertia))

# Optimal k
k = 2
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)
pred = kmeans.predict(X)
inertia = kmeans.inertia_
print("Result inertia from k = %d is: %f" % (k, inertia))

# Evaluation model Clustering

Check whether the chosen k is a good one by seeing how close the accuracy is to 1.

Enter a value of k below and judge from the result whether it was a good choice.

In [None]:
# Input k
# input() raises when the notebook runs without stdin (e.g. a non-interactive
# "Save & Run All"); fall back to k = 2 so the run does not abort.
DEFAULT_K = 2
try:
    k = int(input("Enter the k: "))
except (EOFError, ValueError, NotImplementedError):
    k = DEFAULT_K
    print("No valid k entered; using k = %d" % k)
# Fixed seed so the labels inspected in the next cells are reproducible.
kmeans = KMeans(n_clusters = k, random_state = 42)
kmeans.fit(X)

In [None]:
# Cluster index assigned to each sample by the model fitted above.
kmeans.labels_

In [None]:
labels = kmeans.labels_
# Count the samples whose value in `y` equals the assigned cluster index.
# NOTE(review): `y` appears to be the continuous pH column while `labels` are
# arbitrary integer cluster ids, so few or no exact matches are expected --
# confirm this is the intended comparison.
matches = (y == labels)
correct_labels = matches.sum()
n_samples = y.size

print("Result: %d out of %d samples were correctly labeled." % (correct_labels, n_samples))
print('Accuracy score: {0:0.2f}'. format(correct_labels/float(n_samples)))

# Display a plot Cluster

In [None]:
# Select Alcohol and pH attribute
X = df.iloc[:, [10,8]]
y = df['pH']
# cols = X.columns
# MinMaxScaler untuk scaling data X
ms = MinMaxScaler()
X = ms.fit_transform(X)
X = pd.DataFrame(X)

In [None]:
kmeans = KMeans(n_clusters=1)
kmeans.fit(X)
pred = kmeans.predict(X)

fig, ax = plt.subplots(figsize=(10, 7))
# Samples coloured by their assigned cluster.
ax.scatter(X.iloc[:, 0], X.iloc[:, 1], c=pred, s=30, cmap="viridis")
centers = kmeans.cluster_centers_ # Coordinates of cluster centers.
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
# Title and axis labels so the figure can be read on its own. The names come
# from the two df columns (10 and 8) selected in the preceding cell.
ax.set_title('KMeans clusters (k = 1)')
ax.set_xlabel('%s (min-max scaled)' % df.columns[10])
ax.set_ylabel('%s (min-max scaled)' % df.columns[8])
plt.show()

In [None]:
# Fixed seed so cluster assignment (and therefore the colouring) is the same
# on every run.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X)
pred = kmeans.predict(X)

fig, ax = plt.subplots(figsize=(10, 7))
# Samples coloured by their assigned cluster.
ax.scatter(X.iloc[:, 0], X.iloc[:, 1], c=pred, s=30, cmap='viridis')
centers = kmeans.cluster_centers_ # Coordinates of cluster centers.
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
# Title and axis labels so the figure can be read on its own. The names come
# from the two df columns (10 and 8) selected in the earlier scaling cell.
ax.set_title('KMeans clusters (k = 2)')
ax.set_xlabel('%s (min-max scaled)' % df.columns[10])
ax.set_ylabel('%s (min-max scaled)' % df.columns[8])
plt.show()