-
Notifications
You must be signed in to change notification settings - Fork 13
/
CosSimDef.sas
145 lines (126 loc) · 4.4 KB
/
CosSimDef.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* SAS program to accompany the article
"Cosine similarity of vectors"
by Rick Wicklin, published 03SEP2019 on The DO Loop blog:
https://blogs.sas.com/content/iml/2019/09/03/cosine-similarity.html
This program shows how to compute the cosine similarity in SAS.
You can
1. Use PROC DISTANCE to compute the cosine similarities of rows
2. Use PROC IML to compute the cosine similarity of rows or columns.
PROC IML also provides an easy way to create a heat map that visualizes
the cosine similarity matrix.
*/
/********************************************/
/* Very simple 2-D example */
/********************************************/
data Vectors;
   /* Name is a one-character label; x and y are the coordinates of the vector */
   input Name :$1. x y;
   datalines;
A 0.5 1
B 3 5
C 3 2.8
D 5 1
;
/* draw every row of Vectors as a vector in the (x, y) plane, labeled by Name */
title "Four Row Vectors";
ods graphics / height=400px width=400px;
proc sgplot aspect=1 data=Vectors;
   vector x=x y=y / datalabelattrs=(size=14) datalabel=Name;
   yaxis grid;
   xaxis grid;
run;
/*
proc distance data=Vectors out=Dist method=EUCLID shape=square;
var ratio(_NUMERIC_);
id Name;
run;
proc print data=Dist noobs; run;
*/
/* cosine similarity between the rows of Vectors, using all numeric
   variables; the result is written to the data set Cos and printed */
proc distance method=COSINE shape=square data=Vectors out=Cos;
   var ratio(_NUMERIC_);
   id Name;
run;

proc print data=Cos noobs;
run;
/* the cosine similarity can also be computed with SAS/IML */
proc iml;
/* Return the complete cases of X, i.e. the rows that contain no missing value.
   See https://blogs.sas.com/content/iml/2015/02/23/complete-cases.html
   If every row contains a missing value, the empty matrix is returned. */
start ExtractCompleteCases(X);
   /* nothing is missing: hand the input back unchanged */
   if ^any(X = .) then return( X );
   /* otherwise keep only the rows whose count of missing values is zero */
   completeRows = loc(countmiss(X, "row") = 0);
   if ncol(completeRows) = 0 then return( {} );
   return( X[completeRows, ] );
finish;
/* Cosine similarity of variables (the columns of X).
   Arguments:
     X               numeric matrix; each column is one vector.
     checkForMissing optional flag (default 1). When nonzero, rows that contain
                     a missing value are removed by ExtractCompleteCases before
                     the computation. Pass 0 only if X is known to be complete.
   Returns:
     the square matrix of pairwise cosines between the columns, or the empty
     matrix when X is empty or no complete row remains.
   NOTE(review): a column whose sum of squares is zero is not handled
   specially; its norm is 0 and the division below is by zero. */
start CosSimCols(X, checkForMissing=1);
   if nrow(X)=0 then return( {} );       /* nothing to compare */
   if checkForMissing then do;           /* by default, check for missing and exclude */
      Z = ExtractCompleteCases(X);
      if nrow(Z)=0 then return( {} );    /* every row had a missing value */
      Y = Z / sqrt(Z[##,]);              /* scale each column to unit length */
   end;
   else Y = X / sqrt(X[##,]);            /* skip check if you know all values are valid */
   cosY = Y` * Y;                        /* pairwise inner products */
   /* because of finite precision, elements could be 1+eps or -1-eps */
   idx = loc(cosY> 1); if ncol(idx)>0 then cosY[idx]= 1;
   idx = loc(cosY<-1); if ncol(idx)>0 then cosY[idx]=-1;
   return( cosY );
finish;
/* Cosine similarity of observations (the rows of X).
   Rows that contain a missing value are removed first. The remaining rows
   are transposed into columns and passed to CosSimCols with its own
   missing-value check turned off, because the data are already complete.
   Returns the square matrix of pairwise cosines between the complete rows,
   or the empty matrix when X is empty or has no complete row. */
start CosSimRows(X);
   if nrow(X)=0 then return( {} );       /* nothing to compare */
   Z = ExtractCompleteCases(X);          /* check for missing and exclude */
   if nrow(Z)=0 then return( {} );       /* every row had a missing value */
   return T(CosSimCols(Z`, 0));          /* transpose and call CosSimCols */
finish;
/* save the three modules so that a later PROC IML session can load them */
store module=(ExtractCompleteCases CosSimCols CosSimRows);

/* test the computations on the simple data:
   read the numeric variables into X and use Name for the row labels */
use Vectors;
read all var _NUM_ into X[rowname=Name];
close Vectors;

cosY = CosSimRows(X);
print cosY[rowname=Name colname=Name format=7.5];

/* If you want the actual angles, apply the inverse cosine function */
/*
deg = arcos(cosY)*180/constant('pi');
reset fuzz; print deg[f=5.2];
*/

/* plot the standardized vectors: divide each row by its Euclidean norm
   and write the result to the data set StdVectors */
rowNorm = sqrt(X[,##]);
Y = X / rowNorm;
create StdVectors from Y[rowname=Name colname={'x' 'y'}];
append from Y[rowname=Name];
close StdVectors;
QUIT;
/* plot the vectors stored in StdVectors on fixed [0, 1] axes */
title "Standardized Vectors";
proc sgplot aspect=1 data=StdVectors;
   vector x=x y=y / datalabelattrs=(size=14) datalabel=Name;
   yaxis grid max=1 min=0;
   xaxis grid max=1 min=0;
run;
/***************************************************/
/* second example: US vehicles whose horsepower is very small (< 140)
   or very large (>= 310) */
data Vehicles;
   set sashelp.cars;
   where Origin = 'USA' and (Horsepower < 140 or Horsepower >= 310);
run;

/* order the vehicles by Type */
proc sort data=Vehicles;
   by Type;
run;

/* descriptive statistics of the selected vehicles */
proc means data=Vehicles;
run;

ods graphics / reset;
proc iml;
load module=(CosSimCols CosSimRows);

/* numeric variables go into X; variable names and Model label its columns and rows */
use Vehicles;
read all var _NUM_ into X[colname=varNames rowname=Model];
read all var {Model Type};
close Vehicles;

/* label for each vehicle: Type, a colon, and the first 10 characters of Model,
   with blanks removed */
shortModel = substr(Model, 1, 10);
labl = compress(Type + ":" + shortModel);

/* cosine similarity between vehicles (rows); the heat map is disabled */
cosRY = CosSimRows(X);
/*
call heatmapcont(cosRY) xvalues=labl yvalues=labl
title="Cosine Similarity between Vehicles";
*/

/* cosine similarity between variables (columns) */
cosCY = CosSimCols(X);
call heatmapcont(cosCY) xvalues=varNames yvalues=varNames
     title="Cosine of Angle Between Variables";

/* If desired, you can compare the cosine similarity with the correlation.
   For these data, the cosine similarity is often in the range [0.8, 1].
   The real advantage of cosine similarity appears when the data have a lot
   of zeros, because then the inner product between two vectors
   only contains the product of the mutually nonzero elements.
*/
corrX = corr(X);
call heatmapcont(corrX) xvalues=varNames yvalues=varNames
     title="Correlation Between Variables";
QUIT;
/****************************/