-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
252 additions
and
2 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,244 @@ | ||
\documentclass[compress]{beamer} | ||
\setbeamercolor{normal text}{fg=black} | ||
\beamertemplatesolidbackgroundcolor{white} | ||
\setbeamercovered{transparent, still covered={\opaqueness<1->{0}}, again covered={\opaqueness<1->{30}}} | ||
\usecolortheme[named=black]{structure} | ||
\definecolor{links}{HTML}{98AFC7} | ||
\hypersetup{colorlinks,linkcolor=,urlcolor=links} | ||
\usepackage{caption} | ||
\captionsetup{labelformat=empty} | ||
\setbeamertemplate{navigation symbols}{} | ||
%\usefonttheme{structurebold} | ||
|
||
\usepackage[scaled]{helvet} | ||
\renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif | ||
\usepackage[T1]{fontenc} | ||
\usepackage{setspace} | ||
%\usepackage{beamerthemesplit} | ||
\usepackage{graphics} | ||
\usepackage{Sweave} | ||
|
||
|
||
\usepackage{hyperref} | ||
\usepackage{graphicx} | ||
\usepackage{verbatim} | ||
\usepackage{amssymb} | ||
\usepackage{wrapfig} | ||
\def\labelitemi{\textemdash} | ||
\setbeamertemplate{frametitle}{ | ||
\begin{centering} | ||
\vskip15pt | ||
\insertframetitle | ||
\par | ||
\end{centering} | ||
} | ||
\title[DS]{\scalebox{.20}{\includegraphics{specialk.png}}\\ $K$-means Clustering} | ||
\author[Sood]{gaurav~sood\\\href{http://www.gsood.com}{http://gsood.com}\\ | ||
\href{https://twitter.com/soodoku}{twitter} \textbf{|} \href{http://github.com/soodoku}{github}} | ||
\large | ||
\date[2015]{\today} | ||
\subject{LearnDS} | ||
\begin{document} | ||
\newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}} | ||
\newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}} | ||
\newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}} | ||
|
||
\newenvironment{large_enum}{ | ||
\Large | ||
\begin{itemize} | ||
\setlength{\itemsep}{7pt} | ||
\setlength{\parskip}{0pt} | ||
\setlength{\parsep}{0pt} | ||
}{\end{itemize}} | ||
|
||
\begin{comment} | ||
|
||
setwd(paste0(githubdir, "data-science/ds6/")) | ||
tools::texi2dvi("kmeans.tex", pdf=TRUE,clean=TRUE) | ||
setwd(basedir) | ||
|
||
\end{comment} | ||
\frame | ||
{ | ||
\titlepage | ||
} | ||
|
||
\frame{ | ||
\frametitle{Unsupervised Learning} | ||
\begin{large_enum} | ||
\item[-]<2> Everything is dimension reduction | ||
\item[-]<3> In supervised learning, labels supervise dimension reduction\\ | ||
\item[-]<4> For instance, regression is about finding a low dimensional representation of $Y$ | ||
\item[-]<5> Supervised Learning $\sim$ Given Apples and Oranges, learn traits of Apples Vs. Oranges | ||
\item[-]<6> Given a bunch of spherical fruits, optimally describe types of fruits | ||
\end{large_enum} | ||
} | ||
|
||
\frame{ | ||
\frametitle{Ways to Think About Unsupervised Learning} | ||
\begin{large_enum} | ||
\item[-]<2>Learning the probability model of the data $p(x_n|x_1,...,x_{n-1})$ | ||
\item[-]<3>\textbf{Applications:} Outlier detection, Data compression | ||
\item[-]<4>Find rows similar to each other, groups of rows dissimilar to each other | ||
\item[-]<5>Find columns similar to each other, groups of columns dissimilar to each other | ||
\item[-]<6>\textbf{Applications:} Group movies by ratings, Segment shoppers | ||
\end{large_enum} | ||
} | ||
|
||
\frame{ | ||
\frametitle{Solutions} | ||
\begin{large_enum} | ||
\item[-]<1-3>Two kinds of methods: | ||
\begin{enumerate} | ||
\item[-]<2->Principal components analysis | ||
\item[-]<3->Clustering | ||
\end{enumerate} | ||
\item[-]<4>Clustering looks to partition data into similar subgroups | ||
\item[-]<5-7>Two popular methods: | ||
\begin{enumerate} | ||
\item[-]<6-> Hierarchical clustering (computationally expensive) | ||
\item[-]<7> $k$-means clustering (pre-specify $k$) | ||
\end{enumerate} | ||
\end{large_enum} | ||
} | ||
|
||
\frame{ | ||
\only<1>{\scriptsize{Source: \href{http://research.microsoft.com/en-us/um/people/cmbishop/prml/}{Pattern Recognition and Machine Learning}}} | ||
|
||
\only<1>{\center{\includegraphics{kmeans-ex.png}}} | ||
} | ||
|
||
\frame{ | ||
\frametitle{$k$-Means Clustering} | ||
\begin{large_enum} | ||
\item[-]<1-3>$k$-means: Assume that we must split data into $k$ clusters | ||
\begin{enumerate} | ||
\item[-]<2-5>Each observation belongs to one cluster | ||
\item[-]<3-5>No observation belongs to more than one cluster | ||
\end{enumerate} | ||
\item[-]<4>Find partitioning that minimizes within cluster variation summed over all $k$ | ||
clusters | ||
\item[-]<5>Euclidean distance between observations, sum it over all observations\\\normalsize | ||
\begin{equation} | ||
\text{min.}_{C_1,\ldots,C_K} \sum_{k=1}^{k} \frac{1}{|C_k|} \sum_{i, i^{'} \in C_k} \sum_{j=1}^{p} (x_{ij} - x_{i^{'}j})^2 | ||
\end{equation} | ||
\end{large_enum} | ||
} | ||
|
||
\frame{ | ||
\frametitle{$k$-Means (Lloyd's) Algorithm} | ||
\begin{large_enum} | ||
\item[-]<2>Randomly assign observations to 1 of $k$ clusters | ||
\item[-]<3-5>Iterate: | ||
\begin{enumerate} | ||
\item[-]<4-> For each of the $k$ clusters, compute the centroid | ||
\item[-]<5-> Assign each observation to cluster whose centroid is closest | ||
\end{enumerate} | ||
\end{large_enum} | ||
\vspace{5cm} | ||
} | ||
\frame{ | ||
\only<1-6>{\scriptsize{Source: James et al. 2015}} | ||
|
||
\only<1>{\center{\includegraphics{kmeanspic1.png}}}\pause | ||
\only<2>{\center{\includegraphics{kmeanspic2.png}}}\pause | ||
\only<3>{\center{\includegraphics{kmeanspic3.png}}}\pause | ||
\only<4>{\center{\includegraphics{kmeanspic4.png}}}\pause | ||
\only<5>{\center{\includegraphics{kmeanspic5.png}}}\pause | ||
\only<6>{\center{\includegraphics{kmeanspic.png}}} | ||
|
||
} | ||
|
||
\frame{ | ||
\frametitle{$k$-Means Algorithm} | ||
\begin{large_enum} | ||
\item[-]<0>Randomly assign observations to 1 of $k$ clusters | ||
\item[-]<0>Iterate: | ||
\begin{enumerate} | ||
\item[-]<1-> For each of the $k$ clusters, compute the centroid | ||
\item[-]<1-> Assign each observation to cluster whose centroid is closest | ||
\end{enumerate} | ||
\item[-]<2>Why does it work? | ||
\item[-]<3>It doesn't. Local minima possible. | ||
\item[-]<4->Initialization: | ||
\begin{enumerate} | ||
\item[-]<5->Forgy: Randomly choose $k$ observations and set them as centroids. | ||
\item[-]<6->Random Partition: Assign each observation randomly to one of the clusters. | ||
\item[-]<7->Run an alternate clustering algorithm on a small sample and use the clusters as initial centroids | ||
\item[-]<8> Pick dispersed points as centroids. For e.g. $k$-means++ and variations of it. | ||
\end{enumerate} | ||
\end{large_enum} | ||
} | ||
|
||
\frame{ | ||
\frametitle{Distance between clusters} | ||
\begin{large_enum} | ||
\item[-]<1>Complete Linkage\\\normalsize | ||
Farthest distance between points in clusters | ||
\item[-]<2>Single\\\normalsize | ||
Closest pair | ||
\item[-]<3>Average\\\normalsize | ||
All pairs, and then take the average | ||
\item[-]<4>Centroid\\\normalsize | ||
Has problems called inversions\\ | ||
Used in Genomics | ||
\item[-]<5>Complete and Average most commonly used | ||
\end{large_enum} | ||
} | ||
|
||
\frame{ | ||
\frametitle{Practical Issues} | ||
\begin{large_enum} | ||
\item[-]<1-4>Choice of Similarity Measure | ||
\begin{enumerate} | ||
\item[-]<2->Scaling Matters | ||
\item[-]<3->Jaccard --- can be gotten quickly by minhashing via LSH Distance | ||
\item[-]<4->Correlation based measures (+/- may matter) | ||
\end{enumerate} | ||
\item[-]<5>High dimensional data. Solutions e.g. DANN | ||
\item[-]<6-8>Choosing $k$: | ||
\begin{enumerate} | ||
\item[-]<7->Calculate average distance to centroid for multiple $k$ | ||
\item[-]<8>Plot them, look for the \emph{knee}\\ | ||
\visible<8>{\scalebox{0.5}{\includegraphics{knee.jpg}}} | ||
\end{enumerate} | ||
\end{large_enum} | ||
} | ||
\frame{ | ||
\frametitle{A(N)alyst choose $k$ contd.} | ||
\begin{large_enum} | ||
\item[-]<1-5>Calinski-Harabasz (CH) Index: | ||
\begin{enumerate} | ||
\item[-]<2->Between Cluster, $B = \sum_{1}^k n_k \lVert X_k - \bar{X} \rVert^2$ | ||
\item[-]<3->Within Cluster, $W = \sum_{1}^k \lVert X_i - \bar{X_k} \lVert^2$ | ||
\item[-]<4->Maximize Between Cluster Variation, Minimize Within Cluster Variation | ||
\item[-]<5->$\text{CH(K)} = \frac{B(K)}{(K-1)}\frac{n - K}{W(K)}$ | ||
\end{enumerate} | ||
|
||
\item[-]<6-8>Gap Statistic (Tibshirani): | ||
\begin{enumerate} | ||
\item[-]<6->Compare observed $W(K)$ to $W_{\text{unif}}(K)$ | ||
\item[-]<7->$\text{GAP}(K) = \text{log} W(K) - \text{log} W_{\text{unif}}(K)$ | ||
\item[-]<8->Calculate $W_{\text{unif}}(K)$ by simulation. | ||
\end{enumerate} | ||
\end{large_enum} | ||
} | ||
\frame{ | ||
\frametitle{Running Time} | ||
\begin{large_enum} | ||
\item[-]<1> $O(kn)$ for each iteration. | ||
\item[-]<2> But total iterations can be a lot, and not bounded. | ||
\item[-]<3> But in practice, polynomial running time. | ||
\item[-]<4-6> Big (Long) Data Solutions: | ||
\begin{enumerate} | ||
\item[-]<5->Bradley-Fayyad-Reina (BFR) | ||
\item[-]<6->CURE | ||
\end{enumerate} | ||
\item[-]<7-8>BFR | ||
\begin{enumerate} | ||
\item[-]<7->Assumes clusters are normally distributed around a centroid in Euclidean space. | ||
\item[-]<8->Exploit that to quantify likelihood point belongs to a cluster | ||
\end{enumerate} | ||
\end{large_enum} | ||
} | ||
\end{document} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters