# Clustering and Frequent Item Sets

#### In this section, we will learn how to use k-means, DBSCAN, Apriori and FP-Growth in Python.


The following libraries are used for these algorithms:
- pandas
- numpy
- matplotlib
- mlxtend
- sklearn
- scipy

In [None]:
# Import necessary modules

import numpy as np
import pandas as pd
import csv
from matplotlib import pyplot as plt

##  Read and explore  Weather.csv

### Read data from Weather.csv

We use Weather.csv file as a data set for applying k-means, DBSCAN and Apriori algorithms. 

In [None]:
# Read the target CSV file 'Weather.csv' into the pandas DataFrame 'weather'.

weather = pd.read_csv('Weather.csv')
# A bare `weather.size` expression is discarded unless it is the last line of
# a notebook cell, so print the element count explicitly.
print(weather.size)
print(weather)

In [None]:
# Display the first five rows of 'weather'
weather.head(n=5)

In [None]:
# Display the last five rows of 'weather'
weather.tail(n=5)

In [None]:
# Show the column labels (the attribute names) of the 'weather' DataFrame
weather.columns

### Draw a diagram to show the distribution of the data

In [None]:
# Extract the 'Temperature' and 'Humidity' columns of 'weather' as the arrays
# 'temperature' and 'humidity' (in that order).
temperature, humidity = (
    weather[column].values for column in ('Temperature', 'Humidity')
)

# Enlarge the default figure size, then draw a scatter diagram of humidity
# against temperature to show how the data items are distributed.
plt.rcParams['figure.figsize'] = (17, 10)
plt.scatter(x=temperature, y=humidity, s=10, c='green')

##  Experiments on k-means and DBSCAN by using Weather.csv
To apply k-means and DBSCAN, we use the sklearn library.

In [None]:
# Import k-means module and DBSCAN module from sklearn

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

###  K-means 

In [None]:
# Build 'weather_data': one (temperature, humidity) row for each row of 'weather'.

weather_data = np.array(list(zip(temperature, humidity)))

# 'cluster_num' is the number of clusters k-means is asked to find.

cluster_num = 3
kmeans = KMeans(n_clusters=cluster_num).fit(weather_data)
clusters = kmeans.labels_  # cluster index assigned to each row of weather_data
centroids = kmeans.cluster_centers_  # one centre per cluster

fig, plots = plt.subplots()
colors = ['r', 'g', 'b', 'y', 'm']
for cluster_index in range(cluster_num):
    # A boolean mask selects the rows assigned to this cluster.
    sub_set = weather_data[clusters == cluster_index]
    if len(sub_set) == 0:
        continue
    # Cycle through the palette so that asking for more clusters than there
    # are colours reuses colours instead of raising IndexError.
    plots.scatter(sub_set[:, 0], sub_set[:, 1], s=10,
                  c=colors[cluster_index % len(colors)])
# Mark the cluster centres with large black stars.
plots.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=300, c='k')

#### Question: Try each value of 'cluster_num' from 2 to 5. Which value clusters the data best?

### DBSCAN


In [None]:
# Build 'weather_data': one (temperature, humidity) row for each row of 'weather'.
weather_data = np.array(list(zip(temperature, humidity)))

# Run DBSCAN with eps = 4 and min_samples = 8.

dbscan = DBSCAN(eps=4, min_samples=8).fit(weather_data)
clusters = dbscan.labels_
cluster_indexs = np.unique(clusters).tolist()
print(cluster_indexs)

fig, plots = plt.subplots()
colors = ['r', 'g', 'b', 'y', 'm']
noise_color = 'k'  # DBSCAN labels noise points -1; draw them in black
for cluster_index in cluster_indexs:
    # A boolean mask selects the rows carrying this label.
    sub_set = weather_data[clusters == cluster_index]
    if len(sub_set) == 0:
        continue
    if cluster_index == -1:
        # Without this branch, -1 would index colors[-1] and noise would share
        # the colour of the last palette entry.
        color = noise_color
    else:
        # Cycle through the palette so more clusters than colours does not
        # raise IndexError.
        color = colors[cluster_index % len(colors)]
    plots.scatter(sub_set[:, 0], sub_set[:, 1], s=10, c=color)

#### Question: Try values of 'eps' from 4 to 7 and of 'min_samples' from 8 to 11. Which combination clusters the data best?

#### Question: Compare this result with the k-means result for 'cluster_num = 3'. Which result is better, and why?

### Dendrograms

In [None]:

# Read 'Weather2.csv' into 'weather', replacing the DataFrame loaded earlier.
weather = pd.read_csv('Weather2.csv')
# A bare `weather.size` expression is discarded unless it is the last line of
# a notebook cell, so print the element count explicitly.
print(weather.size)
print(weather)

In [None]:

# Imports needed by this cell: normalisation and hierarchical clustering.
from sklearn.preprocessing import normalize
import scipy.cluster.hierarchy as shc

# 'data' is another name for the 'weather' DataFrame.
data = weather
print(data.head())

# Normalise the data and wrap the result back into a DataFrame that carries
# the original column names.
data_scaled = pd.DataFrame(normalize(data), columns=data.columns)
print(data_scaled.head())

# Draw the dendrogram of the Ward linkage of the normalised data.
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(shc.linkage(data_scaled, method='ward'))

In [None]:
# Redraw the Ward-linkage dendrogram and add a dashed blue horizontal line
# at height y = 2.
ward_linkage = shc.linkage(data_scaled, method='ward')

plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(ward_linkage)
plt.axhline(y=2, linestyle='--', color='b')
plt.show()

### Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# The distance argument is deliberately omitted. The `affinity` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to `metric`), so
# passing it fails on current releases. The default distance is Euclidean,
# which is what Ward linkage requires, so leaving it out works on both old
# and new versions.
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
cluster_labels = cluster.fit_predict(data_scaled)

# Scatter plot of the normalised data, coloured by assigned cluster label.
plt.figure(figsize=(10, 7))
plt.scatter(data_scaled['Temperature'], data_scaled['Humidity'], c=cluster_labels)

plt.show()

###  Apriori algorithm 

In [None]:
# Import Apriori module and TransactionEncoder module from mlxtend

from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules as arule

In [None]:
# Read the rows of 'Repair.csv' into 'repair_data', a list holding one list
# of fields per CSV row.

# newline='' leaves newline handling to the csv module, as its documentation
# recommends for files passed to csv.reader.
with open("Repair.csv", newline='') as csvFile:
    repair_data = list(csv.reader(csvFile))

# Show the first 3 rows of repair_data

repair_data[:3]

In [None]:
# Use mlxtend's TransactionEncoder to turn 'repair_data' into the array
# 'te_ary', then build the DataFrame 'data' from it, labelled with the
# encoder's column names (te.columns_), as input for the Apriori algorithm.

te = TransactionEncoder()
te.fit(repair_data)
te_ary = te.transform(repair_data)
data = pd.DataFrame(te_ary, columns=te.columns_)

# Show the first ten rows of the encoded data
data.head(10)

In [None]:
# Run mlxtend's apriori on the encoded DataFrame 'data'.
# 'min_support' is the support threshold handed to apriori; use_colnames=True
# asks for the itemsets to be reported with the DataFrame's column names.

min_support = 0.4
frequent_itemsets = apriori(data, use_colnames=True, min_support=min_support)
frequent_itemsets

#### Question: Change the value of min_support. What effect does that have on the number of itemsets?