k-means added

soodoku · Sep 17, 2015 · 81a9724 · 81a9724
1 parent c9a6d92
commit 81a9724
Show file tree

Hide file tree

Showing 3 changed files with 252 additions and 2 deletions.
diff --git a/ds6/kmeans.pdf b/ds6/kmeans.pdf
diff --git a/ds6/kmeans.tex b/ds6/kmeans.tex
@@ -0,0 +1,244 @@
+\documentclass[compress]{beamer}
+\setbeamercolor{normal text}{fg=black}
+\beamertemplatesolidbackgroundcolor{white}
+\setbeamercovered{transparent, still covered={\opaqueness<1->{0}}, again covered={\opaqueness<1->{30}}}
+\usecolortheme[named=black]{structure}
+\definecolor{links}{HTML}{98AFC7}
+\hypersetup{colorlinks,linkcolor=,urlcolor=links}
+\usepackage{caption}
+\captionsetup{labelformat=empty}
+\setbeamertemplate{navigation symbols}{} 
+%\usefonttheme{structurebold}
+
+\usepackage[scaled]{helvet}
+\renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif
+\usepackage[T1]{fontenc}
+\usepackage{setspace}
+%\usepackage{beamerthemesplit}
+\usepackage{graphics}
+\usepackage{Sweave}
+
+
+\usepackage{hyperref}
+\usepackage{graphicx}
+\usepackage{verbatim}
+\usepackage{amssymb}
+\usepackage{wrapfig}
+\def\labelitemi{\textemdash}
+\setbeamertemplate{frametitle}{
+	\begin{centering}
+		\vskip15pt
+		\insertframetitle
+		\par
+	\end{centering}
+}
+\title[DS]{\scalebox{.20}{\includegraphics{specialk.png}}\\ $K$-means Clustering}
+\author[Sood]{gaurav~sood\\\href{http://www.gsood.com}{http://gsood.com}\\
+\href{https://twitter.com/soodoku}{twitter} \textbf{|} \href{http://github.com/soodoku}{github}}
+\large
+\date[2015]{\today}
+\subject{LearnDS}
+\begin{document}
+\newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}}
+\newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}}
+\newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}}
+
+\newenvironment{large_enum}{
+\Large
+\begin{itemize}
+  \setlength{\itemsep}{7pt}
+  \setlength{\parskip}{0pt}
+  \setlength{\parsep}{0pt}
+}{\end{itemize}}
+
+\begin{comment}
+
+setwd(paste0(githubdir, "data-science/ds6/"))
+tools::texi2dvi("kmeans.tex", pdf=TRUE,clean=TRUE)
+setwd(basedir)
+
+\end{comment}
+  \frame
+  {
+    \titlepage
+  }
+
+\frame{
+  \frametitle{Unsupervised Learning}
+   \begin{large_enum}
+	   \item[-]<2> Everything is dimension reduction
+	   \item[-]<3> In supervised learning, labels supervise dimension reduction\\
+	   \item[-]<4> For instance, regression is about finding a low dimensional representation of $Y$
+	   \item[-]<5> Supervised Learning $\sim$ Given Apples and Oranges, learn traits of Apples Vs. Oranges
+	   \item[-]<6> Given a bunch of spherical fruits, optimally describe types of fruits 
+   \end{large_enum}
+}
+
+\frame{
+	\frametitle{Ways to Think About Unsupervised Learning}
+	\begin{large_enum}
+		\item[-]<2>Learning the probability model of the data $p(x_n|x_1,...,x_{n-1})$
+		\item[-]<3>\textbf{Applications:} Outlier detection, Data compression
+		\item[-]<4>Find rows similar to each other, groups of rows dissimilar to each other
+		\item[-]<5>Find columns similar to each other, groups of columns dissimilar to each other
+		\item[-]<6>\textbf{Applications:} Group movies by ratings, Segment shoppers 
+ \end{large_enum}
+}
+
+\frame{
+\frametitle{Solutions}
+\begin{large_enum}
+	\item[-]<1-3>Two kinds of methods:
+	\begin{enumerate}
+		\item[-]<2->Principal components analysis
+		\item[-]<3->Clustering
+	\end{enumerate}
+	\item[-]<4>Clustering looks to partition data into similar subgroups
+	\item[-]<5-7>Two popular methods:
+		\begin{enumerate}
+			\item[-]<6-> Hierarchical clustering (computationally expensive) 
+			\item[-]<7> $k$-means clustering (pre-specify $k$)
+		\end{enumerate}
+	\end{large_enum}
+}
+
+\frame{
+\only<1>{\scriptsize{Source: \href{http://research.microsoft.com/en-us/um/people/cmbishop/prml/}{Pattern Recognition and Machine Learning}}}
+
+	\only<1>{\center{\includegraphics{kmeans-ex.png}}}
+}
+
+\frame{
+\frametitle{$k$-Means Clustering}
+	\begin{large_enum}
+		\item[-]<1-3>$k$-means: Assume that we must split data into $k$ clusters
+		  \begin{enumerate}
+			 \item[-]<2-5>Each observation belongs to one cluster
+			 \item[-]<3-5>No observation belongs to more than one cluster
+			\end{enumerate}
+		\item[-]<4>Find partitioning that minimizes within cluster variation summed over all $k$
+							clusters
+		\item[-]<5>Euclidean distance between observations, sum it over all observations\\\normalsize
+			\begin{equation}
+			\text{min.}_{C_1,\ldots,C_K} \sum_{k=1}^{k} \frac{1}{|C_k|} \sum_{i, i^{'} \in C_k} \sum_{j=1}^{p} (x_{ij} - x_{i^{'}j})^2
+			\end{equation}
+	\end{large_enum}
+}		
+
+\frame{
+\frametitle{$k$-Means (Lloyd's) Algorithm}
+	\begin{large_enum}
+		\item[-]<2>Randomly assign observations to 1 of $k$ clusters
+		\item[-]<3-5>Iterate:
+			\begin{enumerate}
+			\item[-]<4-> For each of the $k$ clusters, compute the centroid
+			\item[-]<5-> Assign each observation to cluster whose centroid is closest  
+			\end{enumerate}
+	\end{large_enum}
+	\vspace{5cm}
+}		
+\frame{
+\only<1-6>{\scriptsize{Source: James et al. 2015}}
+
+	\only<1>{\center{\includegraphics{kmeanspic1.png}}}\pause
+	\only<2>{\center{\includegraphics{kmeanspic2.png}}}\pause
+	\only<3>{\center{\includegraphics{kmeanspic3.png}}}\pause
+	\only<4>{\center{\includegraphics{kmeanspic4.png}}}\pause
+	\only<5>{\center{\includegraphics{kmeanspic5.png}}}\pause
+	\only<6>{\center{\includegraphics{kmeanspic.png}}}
+
+}
+
+\frame{
+ \frametitle{$k$-Means Algorithm}
+	\begin{large_enum}
+		\item[-]<0>Randomly assign observations to 1 of $k$ clusters
+		\item[-]<0>Iterate:
+			\begin{enumerate}
+			\item[-]<1-> For each of the $k$ clusters, compute the centroid
+			\item[-]<1-> Assign each observation to cluster whose centroid is closest  
+			\end{enumerate}
+		\item[-]<2>Why does it work?
+		\item[-]<3>It doesn't. Local minima possible. 
+		\item[-]<4->Initialization:
+			\begin{enumerate}
+				\item[-]<5->Forgy: Randomly choose $k$ observations and set them as centroids. 
+				\item[-]<6->Random Partition: Assign each observation randomly to one of the clusters.
+				\item[-]<7->Run an alternate clustering algorithm on a small sample and use the clusters as initial centroids
+				\item[-]<8> Pick dispersed points as centroids. For e.g. $k$-means++ and variations of it.
+			\end{enumerate}
+	\end{large_enum}
+}		
+
+\frame{
+\frametitle{Distance between clusters}
+	\begin{large_enum}
+		\item[-]<1>Complete Linkage\\\normalsize
+					Farthest distance between points in clusters
+		\item[-]<2>Single\\\normalsize
+					Closest pair
+		\item[-]<3>Average\\\normalsize
+					All pairs, and then take the average 
+		\item[-]<4>Centroid\\\normalsize
+					Has problems called inversions\\
+					Used in Genomics
+		\item[-]<5>Complete and Average most commonly used
+	\end{large_enum}
+}
+
+\frame{
+\frametitle{Practical Issues}
+	\begin{large_enum}
+		\item[-]<1-4>Choice of Similarity Measure
+			\begin{enumerate}
+				\item[-]<2->Scaling Matters 
+				\item[-]<3->Jaccard --- can be gotten quickly by minhashing via LSH  Distance
+				\item[-]<4->Correlation based measures (+/- may matter)
+			\end{enumerate}
+		\item[-]<5>High dimensional data. Solutions e.g. DANN
+		\item[-]<6-8>Choosing $k$:
+		\begin{enumerate}
+			\item[-]<7->Calculate average distance to centroid for multiple $k$
+			\item[-]<8>Plot them, look for the \emph{knee}\\
+			\visible<8>{\scalebox{0.5}{\includegraphics{knee.jpg}}}
+		\end{enumerate}
+	\end{large_enum}		
+}
+\frame{
+\frametitle{A(N)alyst choose $k$ contd.}
+	\begin{large_enum}
+		\item[-]<1-5>Calinski-Harabasz (CH) Index:
+		 \begin{enumerate}
+			\item[-]<2->Between Cluster, $B = \sum_{1}^k n_k \lVert X_k - \bar{X} \rVert^2$
+			\item[-]<3->Within Cluster,  $W = \sum_{1}^k \lVert X_i - \bar{X_k} \lVert^2$
+			\item[-]<4->Maximize Between Cluster Variation, Minimize Within Cluster Variation
+			\item[-]<5->$\text{CH(K)} = \frac{B(K)}{(K-1)}\frac{n - K}{W(K)}$
+		  \end{enumerate}
+
+		\item[-]<6-8>Gap Statistic (Tibshirani):
+		 \begin{enumerate}
+			\item[-]<6->Compare observed $W(K)$ to $W_{\text{unif}}(K)$
+			\item[-]<7->$\text{GAP}(K) = \text{log} W(K) - \text{log} W_{\text{unif}}(K)$
+			\item[-]<8->Calculate $W_{\text{unif}}(K)$ by simulation.
+		  \end{enumerate}
+	\end{large_enum}
+}
+\frame{
+\frametitle{Running Time}
+	\begin{large_enum}
+		\item[-]<1> $O(kn)$ for each iteration.
+		\item[-]<2> But total iterations can be a lot, and not bounded. 
+		\item[-]<3> But in practice, polynomial running time. 
+		\item[-]<4-6> Big (Long) Data Solutions: 
+		  \begin{enumerate}
+			  \item[-]<5->Bradley-Fayyad-Reina (BFR) 
+			  \item[-]<6->CURE
+			\end{enumerate}
+		\item[-]<7-8>BFR
+			\begin{enumerate}
+			   \item[-]<7->Assumes clusters are normally distributed around a centroid in Euclidean space.
+			   \item[-]<8->Exploit that to quantify likelihood point belongs to a cluster
+			\end{enumerate}
+	\end{large_enum}
+}
+\end{document}
diff --git a/readme.md b/readme.md
@@ -39,14 +39,20 @@ Data Science: Some Basics
    - Assessing model fit
    - Clarification about Big Data
 
- 5. Presenting Analyses
+ 5. Supervised Methods
+
+ 6. Unsupervised Methods
+    - PCA, CA
+    - k-means ([presentation](ds6/kmeans.pdf), [tex](ds6/kmeans.tex))
+
+ 7. Presenting Analyses
    - [ggplot2 in brief](graphs/ggplot2.md)
    - Examples of ggplot in action: 
       - NYT Civil Rights Coverage ([R code](https://github.com/soodoku/nyt-civil-rights/blob/master/plot.R), [Graph](https://github.com/soodoku/nyt-civil-rights/blob/master/nyt_aa.pdf))
       - Military Experience of UK Prime Ministers ([R code](https://github.com/soodoku/military-experience/blob/master/mil_plots.R), [Graph](https://github.com/soodoku/military-experience/blob/master/ukmil.pdf))
    - [Suggestions for writing](http://gbytes.gsood.com/on-writing/)
 
- 6. Some Applications
+ 8. Some Applications
    - From paper to digital ([presentation](app/PaperToDigital.pdf), [tex](app/PaperToDigital.tex))
    - Text as Data
       - [Sentiment Analysis](https://gist.github.com/soodoku/22e4cff2eb6a05be3c0d)