k-means added
soodoku committed Sep 17, 2015
1 parent c9a6d92 commit 81a9724
Showing 3 changed files with 252 additions and 2 deletions.
Binary file added ds6/kmeans.pdf
244 changes: 244 additions & 0 deletions ds6/kmeans.tex
@@ -0,0 +1,244 @@
\documentclass[compress]{beamer}
\setbeamercolor{normal text}{fg=black}
\beamertemplatesolidbackgroundcolor{white}
\setbeamercovered{transparent, still covered={\opaqueness<1->{0}}, again covered={\opaqueness<1->{30}}}
\usecolortheme[named=black]{structure}
\definecolor{links}{HTML}{98AFC7}
\hypersetup{colorlinks,linkcolor=,urlcolor=links}
\usepackage{caption}
\captionsetup{labelformat=empty}
\setbeamertemplate{navigation symbols}{}
%\usefonttheme{structurebold}

\usepackage[scaled]{helvet}
\renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif
\usepackage[T1]{fontenc}
\usepackage{setspace}
%\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{Sweave}


\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{verbatim}
\usepackage{amssymb}
\usepackage{wrapfig}
\def\labelitemi{\textemdash}
\setbeamertemplate{frametitle}{
\begin{centering}
\vskip15pt
\insertframetitle
\par
\end{centering}
}
\title[DS]{\scalebox{.20}{\includegraphics{specialk.png}}\\ $K$-means Clustering}
\author[Sood]{gaurav~sood\\\href{http://www.gsood.com}{http://gsood.com}\\
\href{https://twitter.com/soodoku}{twitter} \textbf{|} \href{http://github.com/soodoku}{github}}
\large
\date[2015]{\today}
\subject{LearnDS}
\begin{document}
\newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}}
\newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}}
\newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}}

\newenvironment{large_enum}{
\Large
\begin{itemize}
\setlength{\itemsep}{7pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
}{\end{itemize}}

\begin{comment}

setwd(paste0(githubdir, "data-science/ds6/"))
tools::texi2dvi("kmeans.tex", pdf=TRUE,clean=TRUE)
setwd(basedir)

\end{comment}
\frame
{
\titlepage
}

\frame{
\frametitle{Unsupervised Learning}
\begin{large_enum}
\item[-]<2> Everything is dimension reduction
\item[-]<3> In supervised learning, labels supervise dimension reduction\\
\item[-]<4> For instance, regression is about finding a low dimensional representation of $Y$
\item[-]<5> Supervised Learning $\sim$ Given apples and oranges, learn the traits of apples vs. oranges
\item[-]<6> Unsupervised Learning $\sim$ Given a bunch of spherical fruits, optimally describe the types of fruit
\end{large_enum}
}

\frame{
\frametitle{Ways to Think About Unsupervised Learning}
\begin{large_enum}
\item[-]<2>Learning the probability model of the data $p(x_n|x_1,...,x_{n-1})$
\item[-]<3>\textbf{Applications:} Outlier detection, Data compression
\item[-]<4>Find rows similar to each other, groups of rows dissimilar to each other
\item[-]<5>Find columns similar to each other, groups of columns dissimilar to each other
\item[-]<6>\textbf{Applications:} Group movies by ratings, Segment shoppers
\end{large_enum}
}

\frame{
\frametitle{Solutions}
\begin{large_enum}
\item[-]<1-3>Two kinds of methods:
\begin{enumerate}
\item[-]<2->Principal components analysis
\item[-]<3->Clustering
\end{enumerate}
\item[-]<4>Clustering looks to partition data into similar subgroups
\item[-]<5-7>Two popular methods:
\begin{enumerate}
\item[-]<6-> Hierarchical clustering (computationally expensive)
\item[-]<7> $k$-means clustering (pre-specify $k$)
\end{enumerate}
\end{large_enum}
}

\frame{
\only<1>{\scriptsize{Source: \href{http://research.microsoft.com/en-us/um/people/cmbishop/prml/}{Pattern Recognition and Machine Learning}}}

\only<1>{\center{\includegraphics{kmeans-ex.png}}}
}

\frame{
\frametitle{$k$-Means Clustering}
\begin{large_enum}
\item[-]<1-3>$k$-means: Assume that we must split data into $k$ clusters
\begin{enumerate}
\item[-]<2-5>Each observation belongs to one cluster
\item[-]<3-5>No observation belongs to more than one cluster
\end{enumerate}
\item[-]<4>Find partitioning that minimizes within cluster variation summed over all $k$
clusters
\item[-]<5>Squared Euclidean distance between pairs of observations within a cluster, summed over all pairs (R sketch on the next slide)\\\normalsize
\begin{equation}
\min_{C_1,\ldots,C_K} \sum_{k=1}^{K} \frac{1}{|C_k|} \sum_{i, i' \in C_k} \sum_{j=1}^{p} (x_{ij} - x_{i'j})^2
\end{equation}
\end{large_enum}
}
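
% A minimal R sketch (ours, not part of the original deck): stats::kmeans()
% reports the within-cluster sums of squares it minimizes. The pairwise
% criterion above equals twice the centroid-based sum that kmeans() returns.
\begin{frame}[fragile]
\frametitle{The Objective in R: Sketch}
\begin{verbatim}
# Toy data: two spherical blobs
set.seed(31)
x <- rbind(matrix(rnorm(100, mean = 0), ncol = 2),
           matrix(rnorm(100, mean = 4), ncol = 2))

fit <- kmeans(x, centers = 2, nstart = 20)
fit$withinss      # within-cluster SS, one per cluster
fit$tot.withinss  # summed over clusters: what is minimized
\end{verbatim}
\end{frame}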

\frame{
\frametitle{$k$-Means (Lloyd's) Algorithm}
\begin{large_enum}
\item[-]<2>Randomly assign observations to 1 of $k$ clusters
\item[-]<3-5>Iterate:
\begin{enumerate}
\item[-]<4-> For each of the $k$ clusters, compute the centroid
\item[-]<5-> Assign each observation to cluster whose centroid is closest
\end{enumerate}
\end{large_enum}
\vspace{5cm}
}
\frame{
\only<1-6>{\scriptsize{Source: James et al. 2015}}

\only<1>{\center{\includegraphics{kmeanspic1.png}}}\pause
\only<2>{\center{\includegraphics{kmeanspic2.png}}}\pause
\only<3>{\center{\includegraphics{kmeanspic3.png}}}\pause
\only<4>{\center{\includegraphics{kmeanspic4.png}}}\pause
\only<5>{\center{\includegraphics{kmeanspic5.png}}}\pause
\only<6>{\center{\includegraphics{kmeanspic.png}}}

}

\frame{
\frametitle{$k$-Means Algorithm}
\begin{large_enum}
\item[-]<0>Randomly assign observations to 1 of $k$ clusters
\item[-]<0>Iterate:
\begin{enumerate}
\item[-]<1-> For each of the $k$ clusters, compute the centroid
\item[-]<1-> Assign each observation to cluster whose centroid is closest
\end{enumerate}
\item[-]<2>Why does it work?
\item[-]<3>It doesn't always: the algorithm can converge to a local minimum.
\item[-]<4->Initialization:
\begin{enumerate}
\item[-]<5->Forgy: Randomly choose $k$ observations and set them as centroids.
\item[-]<6->Random Partition: Assign each observation randomly to one of the clusters.
\item[-]<7->Run an alternate clustering algorithm on a small sample and use the clusters as initial centroids
\item[-]<8> Pick dispersed points as centroids, e.g., $k$-means++ and its variations (R sketch on the next slide).
\end{enumerate}
\end{large_enum}
}
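
% A hedged R sketch of k-means++ seeding (the helper name kpp_centers is
% ours): each new centroid is drawn with probability proportional to its
% squared distance from the nearest centroid chosen so far.
\begin{frame}[fragile]
\frametitle{$k$-means++ Seeding in R: Sketch}
\begin{verbatim}
kpp_centers <- function(x, k) {
  centers <- x[sample(nrow(x), 1), , drop = FALSE]  # uniform
  while (nrow(centers) < k) {
    # squared distance to nearest chosen centroid
    d2 <- apply(x, 1, function(p)
      min(colSums((t(centers) - p)^2)))
    centers <- rbind(centers,
                     x[sample(nrow(x), 1, prob = d2), , drop = FALSE])
  }
  centers
}
# Then: kmeans(x, centers = kpp_centers(x, 3))
\end{verbatim}
\end{frame}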

\frame{
\frametitle{Distance Between Clusters}
\begin{large_enum}
\item[-]<1>Complete Linkage\\\normalsize
Farthest distance between points in clusters
\item[-]<2>Single\\\normalsize
Closest pair
\item[-]<3>Average\\\normalsize
All pairs, and then take the average
\item[-]<4>Centroid\\\normalsize
Has problems called inversions\\
Used in Genomics
\item[-]<5>Complete and Average most commonly used
\end{large_enum}
}
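
% The linkages above belong to hierarchical clustering; a minimal R
% illustration with stats::hclust(), which supports all four methods.
\begin{frame}[fragile]
\frametitle{Linkage in R: Sketch}
\begin{verbatim}
set.seed(31)
x <- matrix(rnorm(100), ncol = 2)
d <- dist(x)   # Euclidean distances between rows

hc_complete <- hclust(d, method = "complete")
hc_single   <- hclust(d, method = "single")
hc_average  <- hclust(d, method = "average")
hc_centroid <- hclust(d^2, method = "centroid")  # squared distances

plot(hc_complete)           # dendrogram
cutree(hc_complete, k = 2)  # cut into two clusters
\end{verbatim}
\end{frame}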

\frame{
\frametitle{Practical Issues}
\begin{large_enum}
\item[-]<1-4>Choice of Similarity Measure
\begin{enumerate}
\item[-]<2->Scaling Matters
\item[-]<3->Jaccard distance --- can be approximated quickly via minhashing and LSH
\item[-]<4->Correlation based measures (+/- may matter)
\end{enumerate}
\item[-]<5>High-dimensional data; solutions include, e.g., DANN
\item[-]<6-8>Choosing $k$:
\begin{enumerate}
\item[-]<7->Calculate average distance to centroid for multiple $k$
\item[-]<8>Plot them and look for the \emph{knee} (R sketch on the next slide)\\
\visible<8>{\scalebox{0.5}{\includegraphics{knee.jpg}}}
\end{enumerate}
\end{large_enum}
}
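% A hedged R sketch of the knee heuristic above: total within-cluster SS
% across several k, plotted so the bend is visible. scale() first, since
% scaling matters for the distance measure.
\begin{frame}[fragile]
\frametitle{Picking $k$ by the Knee: R Sketch}
\begin{verbatim}
set.seed(31)
x <- rbind(matrix(rnorm(100, 0), ncol = 2),
           matrix(rnorm(100, 4), ncol = 2))

ks  <- 1:10
wss <- sapply(ks, function(k)
  kmeans(scale(x), centers = k, nstart = 20)$tot.withinss)
plot(ks, wss, type = "b",
     xlab = "k", ylab = "Total within-cluster SS")
\end{verbatim}
\end{frame}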
\frame{
\frametitle{Analyst Chooses $k$ (contd.)}
\begin{large_enum}
\item[-]<1-5>Calinski-Harabasz (CH) Index:
\begin{enumerate}
\item[-]<2->Between-cluster variation, $B(K) = \sum_{k=1}^{K} n_k \lVert \bar{X}_k - \bar{X} \rVert^2$
\item[-]<3->Within-cluster variation, $W(K) = \sum_{k=1}^{K} \sum_{i \in C_k} \lVert X_i - \bar{X}_k \rVert^2$
\item[-]<4->Maximize between-cluster variation, minimize within-cluster variation
\item[-]<5->$\text{CH}(K) = \frac{B(K)/(K-1)}{W(K)/(n-K)}$
\end{enumerate}

\item[-]<6-8>Gap Statistic (Tibshirani):
\begin{enumerate}
\item[-]<6->Compare observed $W(K)$ to $W_{\text{unif}}(K)$, its expectation under a uniform reference distribution
\item[-]<7->$\text{Gap}(K) = \log W_{\text{unif}}(K) - \log W(K)$
\item[-]<8->Estimate $W_{\text{unif}}(K)$ by simulation; pick $K$ where the gap is large (R sketch on the next slide)
\end{enumerate}
\end{large_enum}
}
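
% Hedged R sketches of both criteria. ch_index() is our helper, written
% directly from the formulas above; clusGap() is from the cluster package.
\begin{frame}[fragile]
\frametitle{CH Index and Gap Statistic in R: Sketch}
\begin{verbatim}
set.seed(31)
x <- rbind(matrix(rnorm(100, 0), ncol = 2),
           matrix(rnorm(100, 4), ncol = 2))

ch_index <- function(x, k) {
  fit <- kmeans(x, centers = k, nstart = 20)
  (fit$betweenss / (k - 1)) /
    (fit$tot.withinss / (nrow(x) - k))
}
sapply(2:6, function(k) ch_index(x, k))  # maximize over k

library(cluster)
gap <- clusGap(x, FUNcluster = kmeans, K.max = 6, B = 50)
plot(gap)  # pick K where the gap is large
\end{verbatim}
\end{frame}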
\frame{
\frametitle{Running Time}
\begin{large_enum}
\item[-]<1> $O(kn)$ distance computations per iteration (each $O(p)$ in $p$ dimensions).
\item[-]<2> But the number of iterations can be large and is not polynomially bounded in the worst case.
\item[-]<3> In practice, though, running time is roughly polynomial (R timing sketch on the last slide).
\item[-]<4-6> Big (Long) Data Solutions:
\begin{enumerate}
\item[-]<5->Bradley-Fayyad-Reina (BFR)
\item[-]<6->CURE
\end{enumerate}
\item[-]<7-8>BFR
\begin{enumerate}
\item[-]<7->Assumes clusters are normally distributed around a centroid in Euclidean space.
\item[-]<8->Exploit that to quantify likelihood point belongs to a cluster
\end{enumerate}
\end{large_enum}
}
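% A quick, hedged timing sketch: for fixed k, per-iteration cost grows
% roughly linearly in n, consistent with the O(kn) count above.
\begin{frame}[fragile]
\frametitle{Timing: R Sketch}
\begin{verbatim}
set.seed(31)
for (n in c(1e4, 1e5, 1e6)) {
  x <- matrix(rnorm(2 * n), ncol = 2)
  print(system.time(
    kmeans(x, centers = 5, iter.max = 10,
           algorithm = "Lloyd")))
}
\end{verbatim}
\end{frame}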
\end{document}
10 changes: 8 additions & 2 deletions readme.md
@@ -39,14 +39,20 @@ Data Science: Some Basics
- Assessing model fit
- Clarification about Big Data

- 5. Presenting Analyses
+ 5. Supervised Methods
+
+ 6. Unsupervised Methods
+    - PCA, CA
+    - k-means ([presentation](ds6/kmeans.pdf), [tex](ds6/kmeans.tex))
+
+ 7. Presenting Analyses
- [ggplot2 in brief](graphs/ggplot2.md)
- Examples of ggplot in action:
- NYT Civil Rights Coverage ([R code](https://github.com/soodoku/nyt-civil-rights/blob/master/plot.R), [Graph](https://github.com/soodoku/nyt-civil-rights/blob/master/nyt_aa.pdf))
- Military Experience of UK Prime Ministers ([R code](https://github.com/soodoku/military-experience/blob/master/mil_plots.R), [Graph](https://github.com/soodoku/military-experience/blob/master/ukmil.pdf))
- [Suggestions for writing](http://gbytes.gsood.com/on-writing/)

- 6. Some Applications
+ 8. Some Applications
- From paper to digital ([presentation](app/PaperToDigital.pdf), [tex](app/PaperToDigital.tex))
- Text as Data
- [Sentiment Analysis](https://gist.github.com/soodoku/22e4cff2eb6a05be3c0d)
