forked from nltk/nltk_book
-
Notifications
You must be signed in to change notification settings - Fork 0
/
advanced.tex
216 lines (180 loc) · 5.51 KB
/
advanced.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
\documentclass{beamer} % for slides
% \documentclass[handout]{beamer} % for handout
\input{beamer}
\title{Advanced Programming in Python}
\author{Steven Bird \and Ewan Klein \and Edward Loper}
\institute{
University of Melbourne, AUSTRALIA
\and
University of Edinburgh, UK
\and
University of Pennsylvania, USA
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\section{Regular Expressions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Program Development}
\begin{frame}
\frametitle{Program Development}
\begin{itemize}
\item High-level Design
\item Algorithm Design
\item Structured Programming
\item Debugging
\item Performance Tuning
\end{itemize}
\end{frame}
\subsection{Defining Functions and Modules}
\begin{frame}[fragile]
\frametitle{Defining Functions}
\scriptsize
\begin{itemize}
\item part of a program needs to be used several times over
\item e.g. form plural of singular noun, to be done in several places
in a program
\item localize this work inside a \textit{function}
\item also helps readability, reusability
\begin{verbatim}
>>> def plural(word):
... if word[-1] == 'y':
... return word[:-1] + 'ies'
... elif word[-1] in 'sx':
... return word + 'es'
... elif word[-2:] in ['sh', 'ch']:
... return word + 'es'
... elif word[-2:] == 'an':
... return word[:-2] + 'en'
... return word + 's'
>>> plural('fairy')
'fairies'
>>> plural('woman')
'women'
\end{verbatim}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Defining Modules}
\begin{verbatim}
def parsed(files):
for file in files:
path = os.path.join(get_basedir(), "treebank", file)
s = open(path).read()
for t in tokenize.blankline(s):
yield tree.bracket_parse(t)
\end{verbatim}
\end{frame}
\subsection{Algorithm Design}
\begin{frame}
\frametitle{Algorithm Design}
\small
\begin{itemize}
\item \textit{algorithm}: a "recipe" for solving a problem
\item e.g. to multiply 16 by 12 we might use any of the following methods:
\begin{enumerate}
\item Add 16 to itself 12 times over
\item Perform "long multiplication", starting with the least-significant
digits of both numbers
\item Look up a multiplication table
\item Repeatedly halve the first number and double the second,
16*12 = 8*24 = 4*48 = 2*96 = 192
\item Do 10*12 to get 120, then add 6*12
\end{enumerate}
\item computation time, intermediate storage
\item brute-force, divide-and-conquer, dynamic programming, greedy search
\item Textbook: Levitin, Anany (2003) \textit{Introduction to the Design and
Analysis of Algorithms}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Sorting Algorithms}
\begin{itemize}
\item Many algorithms for sorting
\item Illustrates the difference in algorithm complexity
\begin{verbatim}
>>> from random import shuffle
>>> from nltk.misc import sort
>>> a = range(1000)
>>> shuffle(a); sort.bubble(a)
250918
>>> shuffle(a); sort.merge(a)
6175
>>> shuffle(a); sort.quick(a)
2378
\end{verbatim}
\end{itemize}
\end{frame}
\subsection{Top-Down Design}
\begin{frame}
\frametitle{Top-Down Design}
\begin{itemize}
\item Break down high-level task into manageable components
\item Build and test each component
\item Assemble them into a complex system
\item Example: identify adjective-noun collocations in text
\begin{enumerate}
\item read in the corpus
\item count events (each adjective, noun, adj-noun combination)
\item compute $\chi^2$ statistics
\item sort adj-noun pairs in decreasing order
\item output $n$-most significant collocations
\end{enumerate}
\item what are the inputs and outputs of each step (i.e. \textit{interfaces})
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Top-Down Design (cont)}
\begin{enumerate}
\item Define top-level function:
\begin{verbatim}
def colloc(corpus_name, num):
corpus = load_corpus(corpus_name)
a_counts, n_counts, a_n_counts = count(corpus, 'JJ', 'NN')
a_n_scores = chisq(a_counts, n_counts, a_n_counts)
a_n_ranked = sort_by_score(a_n_scores)
return a_n_ranked[:num]
\end{verbatim}
\item Iteratively define lower-level functions, e.g. \texttt{chisq()}
\item Bottom-up testing
\begin{itemize}
\item standard test cases for components, known output
\item only assemble once each component is known to work correctly
on the test cases
\end{itemize}
\end{enumerate}
\end{frame}
\subsection{Debugging}
\begin{frame}
\frametitle{Debugging}
\begin{itemize}
\item What's in the name:
\begin{itemize}
\item problems are small relative to their impact
\item hard-to-find
\item seem to take on a life of their own as the programmer tries to
hunt them down
\end{itemize}
\item steps:
\begin{itemize}
\item isolate the problem
\item syntax error: see error messages, linked to line numbers
\item run-time error: \textit{stack-trace}
\item add diagnostic print statements
\item display values of variables just before problem
\item Python debugger: \texttt{pdb}
\end{itemize}
\end{itemize}
\end{frame}
\subsection{Next Steps...}
\begin{frame}
\frametitle{Next Steps: Learning to Program}
\begin{itemize}
\item experimental
\item Python's \texttt{help} and \texttt{dir} commands
\item tutorial exercises
\end{itemize}
\end{frame}