### INTRODUCTION TO PYTHON - PART V
### This tutorial was developed as part of the CS Outreach Professional workshop series. If you have any comments on this file, please feel free to email me at rxj133030@utdallas.edu

### INTRODUCTION TO NUMPY
### Numpy is the core library for scientific computing in Python

In [5]:
import numpy as np

# print(...) with a single argument behaves identically on Python 2 and 3;
# several values are combined with %-formatting so the output stays "1 2 3".
a = np.array([1, 2, 3])  # Create a 1-D (rank 1) array from a list
print(type(a))           # Prints "<type 'numpy.ndarray'>" (Python 3: "<class 'numpy.ndarray'>")
print(a.shape)           # Prints "(3,)" (shown as "(3L,)" by some Python 2 builds)
print("%s %s %s" % (a[0], a[1], a[2]))   # Prints "1 2 3"
a[0] = 5                 # Arrays are mutable: change an element in place
print(a)                 # Prints "[5 2 3]" (NumPy shows arrays without commas)

b = np.array([[1, 2, 3], [4, 5, 6]])   # Create a 2-D (rank 2) array from nested lists
print(b.shape)                         # Prints "(2, 3)": 2 rows, 3 columns
print("%s %s %s" % (b[0, 0], b[0, 1], b[1, 0]))   # Prints "1 2 4"; index as b[row, column]
print(b)

<type 'numpy.ndarray'>
(3L,)
1 2 3
[5 2 3]
(2L, 3L)
1 2 4
[[1 2 3]
 [4 5 6]]


In [12]:
# ndarray.ndim is the number of dimensions (axes) of an array;
# it is the same number as len(arr.shape).
print(b.ndim)   # 2 -> two-dimensional
print(a.ndim)   # 1 -> one-dimensional

2
1


In [21]:
a = np.zeros((2, 2))   # 2x2 array of all zeros (floats, as the output shows)
print(a)

# np.full(shape, fill_value, dtype) sets every element to fill_value.
# dtype picks the element type, e.g. int, np.int64 or np.float64.
a = np.full((2, 2), 10, dtype=int)          # 2x2 array of integer 10s
print(a)

a = np.full((2, 2), 10, dtype=np.float64)   # 2x2 array of floating-point 10.0s
print(a)

[[ 0.  0.]
 [ 0.  0.]]
[[10 10]
 [10 10]]
[[ 10.  10.]
 [ 10.  10.]]


### Slicing and Indexing Numpy

In [18]:
# Keep it simple: plain slicing only, no complex subsetting.
a = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]])
# Rows 0-1 and columns 1-2: the end point of a slice is excluded.
# Note: a basic slice like this is a view onto a's data, not a copy.
b = a[:2, 1:3]
print(a)
print(b)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[[1 2]
 [5 6]]


In [19]:
a = np.array([[1, 2], [3, 4], [5, 6]])
print(a)
# Integer-array indexing: the row list and the column list are paired up
# element by element, so this picks a[0, 0], a[1, 1] and a[2, 0].
print(a[[0, 1, 2], [0, 1, 0]])  # Prints "[1 4 5]"

[[1 2]
 [3 4]
 [5 6]]
[1 4 5]


In [22]:
# np.arange(n) returns the integers 0 .. n-1, handy as indices for walking through a matrix
print(np.arange(5))

[0 1 2 3 4]


In [27]:
# Element-wise (vectorised) operations on two arrays of the same shape
x = np.array([[1, 2], [3, 4]], dtype=np.float64)
y = np.array([[5, 6], [7, 8]], dtype=np.float64)
print(x)
print(y)

print(x + y)   # element-wise sum
print(x * y)   # element-wise product, NOT a matrix product

# Transpose: rows and columns swapped
print(x.T)

[[ 1.  2.]
 [ 3.  4.]]
[[ 5.  6.]
 [ 7.  8.]]
[[  6.   8.]
 [ 10.  12.]]
[[  5.  12.]
 [ 21.  32.]]
[[ 1.  3.]
 [ 2.  4.]]


### Introduction to Pandas 

In [28]:
import pandas as pd
import matplotlib.pyplot as plt

# A Series is a labelled 1-D array. Mixing a string with numbers and NaN
# leaves pandas no common numeric type, so the dtype is the generic "object".
s = pd.Series(["a", 2, 7, np.nan, 6, 8, 69.5])
print(s)


0       a
1       2
2       7
3     NaN
4       6
5       8
6    69.5
dtype: object


In [37]:
# Six consecutive calendar days starting 2013-01-01. The DataFrame cells
# further down use `dates` as their row index, so it is bound once here and
# the other examples get their own names instead of overwriting it.
dates = pd.date_range(start='20130101', periods=6)
print(dates)

# A range can also be given by its two end points (both are included).
daily_range = pd.date_range(start='20130101', end='20140201'[:4].replace('2014', '2013') + '0201')
print(daily_range)

# freq selects the step between entries; BMonthEnd is the last business day
# of each month. An offset object is used rather than the 'BM' string alias,
# which newer pandas releases deprecate in favour of 'BME'.
business_month_ends = pd.date_range(start='20130101', end='20140201',
                                    freq=pd.offsets.BMonthEnd())
print(business_month_ends)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10', '2013-01-11', '2013-01-12',
               '2013-01-13', '2013-01-14', '2013-01-15', '2013-01-16',
               '2013-01-17', '2013-01-18', '2013-01-19', '2013-01-20',
               '2013-01-21', '2013-01-22', '2013-01-23', '2013-01-24',
               '2013-01-25', '2013-01-26', '2013-01-27', '2013-01-28',
               '2013-01-29', '2013-01-30', '2013-01-31', '2013-02-01'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2013-01-31', '2013-02-28', '2013-03-29', '2013-04-30',
               '2013-05-31', '2013-06-28', '2013-07-31', '2013-08-30',
               '2013-09-30', '2013-10-31', '2013-11-29', '2013-12-31',
               '2014-01-31'],
              dtype='datetime64[ns]', freq='BM')

### Creating a dataframe

In [42]:
# Build a 6x4 frame of zeros: rows labelled by `dates`, columns named A-D.
df = pd.DataFrame(np.zeros((6, 4)), index=dates, columns=list('ABCD'))
print(df)

# Rebind df to a frame of the same shape filled with random values.
# No seed is set in this cell, so the numbers differ from run to run.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

            A  B  C  D
2013-01-01  0  0  0  0
2013-01-02  0  0  0  0
2013-01-03  0  0  0  0
2013-01-04  0  0  0  0
2013-01-05  0  0  0  0
2013-01-06  0  0  0  0
                   A         B         C         D
2013-01-01 -0.000820  0.479773  0.877851 -0.334130
2013-01-02  1.708779 -0.087880  1.480129 -1.524267
2013-01-03  1.228967  0.339984 -1.333193 -0.801494
2013-01-04  2.440231  1.988558  0.047182  0.462218
2013-01-05  0.279414 -0.836278  1.489490  0.522864
2013-01-06 -0.065179 -0.326279  1.378917  0.688227


In [44]:
# Inspect the data type of each column (one entry per column label)
df.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [46]:
# Tip: in the notebook, type "df." and press Tab to list the DataFrame's attributes and methods

# Viewing data: head() with no argument shows the first five rows
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.00082,0.479773,0.877851,-0.33413
2013-01-02,1.708779,-0.08788,1.480129,-1.524267
2013-01-03,1.228967,0.339984,-1.333193,-0.801494
2013-01-04,2.440231,1.988558,0.047182,0.462218
2013-01-05,0.279414,-0.836278,1.48949,0.522864


In [47]:
# tail(3) shows the last three rows
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,2.440231,1.988558,0.047182,0.462218
2013-01-05,0.279414,-0.836278,1.48949,0.522864
2013-01-06,-0.065179,-0.326279,1.378917,0.688227


In [48]:
# The column labels of the frame
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [49]:
# The row labels of the frame (here the DatetimeIndex passed as index=dates)
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [50]:
# Summary statistics per column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.931899,0.259646,0.656729,-0.16443
std,1.025351,0.970482,1.120113,0.880442
min,-0.065179,-0.836278,-1.333193,-1.524267
25%,0.069238,-0.266679,0.254849,-0.684653
50%,0.75419,0.126052,1.128384,0.064044
75%,1.588826,0.444826,1.454826,0.507703
max,2.440231,1.988558,1.48949,0.688227


In [51]:
# Transpose: the row labels become the columns and the columns become the rows
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.00082,1.708779,1.228967,2.440231,0.279414,-0.065179
B,0.479773,-0.08788,0.339984,1.988558,-0.836278,-0.326279
C,0.877851,1.480129,-1.333193,0.047182,1.48949,1.378917
D,-0.33413,-1.524267,-0.801494,0.462218,0.522864,0.688227


In [53]:
# Sorting by labels: axis=1 sorts the column labels (not the row index),
# and ascending=False puts them in the order D, C, B, A
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.33413,0.877851,0.479773,-0.00082
2013-01-02,-1.524267,1.480129,-0.08788,1.708779
2013-01-03,-0.801494,-1.333193,0.339984,1.228967
2013-01-04,0.462218,0.047182,1.988558,2.440231
2013-01-05,0.522864,1.48949,-0.836278,0.279414
2013-01-06,0.688227,1.378917,-0.326279,-0.065179


In [54]:
# Sorting by values: reorder the rows by the values in column B, smallest first
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-05,0.279414,-0.836278,1.48949,0.522864
2013-01-06,-0.065179,-0.326279,1.378917,0.688227
2013-01-02,1.708779,-0.08788,1.480129,-1.524267
2013-01-03,1.228967,0.339984,-1.333193,-0.801494
2013-01-01,-0.00082,0.479773,0.877851,-0.33413
2013-01-04,2.440231,1.988558,0.047182,0.462218


In [None]:
### Selection Operator

In [56]:
# Column selection: indexing with a single column label returns that column as a Series
df['A']

2013-01-01   -0.000820
2013-01-02    1.708779
2013-01-03    1.228967
2013-01-04    2.440231
2013-01-05    0.279414
2013-01-06   -0.065179
Freq: D, Name: A, dtype: float64

In [57]:
# Row-wise selection: an integer slice inside [] selects rows by position (here the first two)
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,-0.00082,0.479773,0.877851,-0.33413
2013-01-02,1.708779,-0.08788,1.480129,-1.524267


In [58]:
# Slicing with date strings selects rows by label, and both end points are included.
# The two strings are written in different formats; both are accepted.
df['20130101':'2013-01-02']

Unnamed: 0,A,B,C,D
2013-01-01,-0.00082,0.479773,0.877851,-0.33413
2013-01-02,1.708779,-0.08788,1.480129,-1.524267


In [59]:
# The row index is a DatetimeIndex, which is what allows slicing rows by date strings
print(type(df.index))

<class 'pandas.tseries.index.DatetimeIndex'>


### Selection explained in more detail

In [61]:
# Label-based selection with .loc[rows, columns]: ":" keeps every row,
# and the list picks columns A and B by name
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.00082,0.479773
2013-01-02,1.708779,-0.08788
2013-01-03,1.228967,0.339984
2013-01-04,2.440231,1.988558
2013-01-05,0.279414,-0.836278
2013-01-06,-0.065179,-0.326279


In [65]:
# The most widely used access method: position-based selection with
# .iloc[rows, columns]. Integer slices exclude their end point, so this
# returns rows 1-2 and columns 1-2 (B and C).
print(df)
df.iloc[1:3, 1:3]

                   A         B         C         D
2013-01-01 -0.000820  0.479773  0.877851 -0.334130
2013-01-02  1.708779 -0.087880  1.480129 -1.524267
2013-01-03  1.228967  0.339984 -1.333193 -0.801494
2013-01-04  2.440231  1.988558  0.047182  0.462218
2013-01-05  0.279414 -0.836278  1.489490  0.522864
2013-01-06 -0.065179 -0.326279  1.378917  0.688227


Unnamed: 0,B,C
2013-01-02,-0.08788,1.480129
2013-01-03,0.339984,-1.333193


In [66]:
# Subsetting by value: df.A>0 is a boolean mask with one entry per row;
# indexing with it keeps only the rows where column A is positive
df[df.A>0]

Unnamed: 0,A,B,C,D
2013-01-02,1.708779,-0.08788,1.480129,-1.524267
2013-01-03,1.228967,0.339984,-1.333193,-0.801494
2013-01-04,2.440231,1.988558,0.047182,0.462218
2013-01-05,0.279414,-0.836278,1.48949,0.522864


### Operations

In [68]:
# Statistics
print(df.mean())   # default axis=0: one mean per column
df.mean(axis=1)    # axis=1: one mean per row

A    0.931899
B    0.259646
C    0.656729
D   -0.164430
dtype: float64


2013-01-01    0.255668
2013-01-02    0.394190
2013-01-03   -0.141434
2013-01-04    1.234547
2013-01-05    0.363872
2013-01-06    0.418921
Freq: D, dtype: float64

In [69]:
# apply() calls the function once per column (axis=0 is the default);
# here it returns the spread, max minus min, of each column
df.apply(lambda column: column.max() - column.min(), axis=0)

A    2.505410
B    2.824836
C    2.822683
D    2.212493
dtype: float64

In [71]:
# Fix the seed so the ten random draws are the same on every run
np.random.seed(1)
# Ten integers drawn from 0..6 (the upper bound 7 is excluded)
s = pd.Series(np.random.randint(low=0, high=7, size=10))
# How many times each distinct value occurs, most frequent first
s.value_counts()

0    3
5    2
3    2
1    2
4    1
dtype: int64

###  Merging DataFrames

In [72]:
# Ten rows by four columns of random values; with no labels supplied,
# pandas numbers both the rows and the columns from 0
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.528172,-1.072969,0.865408,-2.301539
1,1.744812,-0.761207,0.319039,-0.24937
2,1.462108,-2.060141,-0.322417,-0.384054
3,1.133769,-1.099891,-0.172428,-0.877858
4,0.042214,0.582815,-1.100619,1.144724
5,0.901591,0.502494,0.900856,-0.683728
6,-0.12289,-0.935769,-0.267888,0.530355
7,-0.691661,-0.396754,-0.687173,-0.845206
8,-0.671246,-0.012665,-1.11731,0.234416
9,1.659802,0.742044,-0.191836,-0.887629


In [74]:
# Break the frame into three row-wise chunks: rows 0-2, rows 3-6 and rows 7-9.
# .iloc makes it explicit that the slices are positional.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces  # a plain Python list of DataFrame subsets

[          0         1         2         3
 0 -0.528172 -1.072969  0.865408 -2.301539
 1  1.744812 -0.761207  0.319039 -0.249370
 2  1.462108 -2.060141 -0.322417 -0.384054,
           0         1         2         3
 3  1.133769 -1.099891 -0.172428 -0.877858
 4  0.042214  0.582815 -1.100619  1.144724
 5  0.901591  0.502494  0.900856 -0.683728
 6 -0.122890 -0.935769 -0.267888  0.530355,
           0         1         2         3
 7 -0.691661 -0.396754 -0.687173 -0.845206
 8 -0.671246 -0.012665 -1.117310  0.234416
 9  1.659802  0.742044 -0.191836 -0.887629]

In [75]:
# pieces is an ordinary list: "<type 'list'>" on Python 2, "<class 'list'>" on Python 3
print(type(pieces))

<type 'list'>


In [76]:
# Concatenate the list of pieces back into a single DataFrame, stacked row-wise
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.528172,-1.072969,0.865408,-2.301539
1,1.744812,-0.761207,0.319039,-0.24937
2,1.462108,-2.060141,-0.322417,-0.384054
3,1.133769,-1.099891,-0.172428,-0.877858
4,0.042214,0.582815,-1.100619,1.144724
5,0.901591,0.502494,0.900856,-0.683728
6,-0.12289,-0.935769,-0.267888,0.530355
7,-0.691661,-0.396754,-0.687173,-0.845206
8,-0.671246,-0.012665,-1.11731,0.234416
9,1.659802,0.742044,-0.191836,-0.887629


#### Join Operation

In [78]:
# Two small frames that share a 'key' column
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
print(right)
# Join on 'key'. Every left row is paired with every right row that has the
# same key, so 2 x 2 matching rows give 4 rows in the result.
pd.merge(left, right, on='key')

   key  lval
0  foo     1
1  foo     2
   key  rval
0  foo     4
1  foo     5


Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [79]:
### Append operation: add one row to the end of a DataFrame
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# result is built with pd.concat, which works on old and new versions alike.
s = df.iloc[1]   # the second row, as a Series labelled by the column names
# to_frame().T turns the Series into a one-row frame; ignore_index renumbers
# the rows 0..n. A new frame is returned and df itself is left unchanged.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,0,1,2,3
0,-0.528172,-1.072969,0.865408,-2.301539
1,1.744812,-0.761207,0.319039,-0.24937
2,1.462108,-2.060141,-0.322417,-0.384054
3,1.133769,-1.099891,-0.172428,-0.877858
4,0.042214,0.582815,-1.100619,1.144724
5,0.901591,0.502494,0.900856,-0.683728
6,-0.12289,-0.935769,-0.267888,0.530355
7,-0.691661,-0.396754,-0.687173,-0.845206
8,-0.671246,-0.012665,-1.11731,0.234416
9,1.659802,0.742044,-0.191836,-0.887629


### Group by

In [82]:
# Example frame for grouping: A and B are string keys, C and D hold random numbers
df = pd.DataFrame(dict(
    A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    C=np.random.randn(8),
    D=np.random.randn(8),
))

In [83]:
# Display the frame that the group-by examples operate on
df

Unnamed: 0,A,B,C,D
0,foo,one,0.285587,1.131629
1,bar,one,0.885141,1.519817
2,foo,two,-0.754398,2.185575
3,bar,three,1.252868,-1.396496
4,foo,two,0.51293,-1.444114
5,bar,two,-0.298093,-0.504466
6,foo,one,0.488518,0.160037
7,foo,three,-0.075572,0.876169


In [84]:
# Total of the numeric columns within each group of A.
# C and D are selected explicitly: B holds strings, and pandas >= 2.0 no
# longer drops non-numeric columns from group-by reductions automatically.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.839916,-0.381145
foo,0.457066,2.909297


In [85]:
# Average of the numeric columns within each group of A.
# C and D are selected explicitly: B holds strings, and on pandas >= 2.0
# mean() raises a TypeError rather than silently skipping such a column.
df.groupby('A')[['C', 'D']].mean()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.613305,-0.127048
foo,0.091413,0.581859


In [86]:
# Converting a text column to a categorical variable
df = pd.DataFrame({
    "id": [1, 2, 3, 4, 5, 6],
    "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e'],
})
# astype("category") keeps the values but records the set of distinct
# labels (a, b, e) as the column's categories
df["grade"] = df.raw_grade.astype("category")
df.grade

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

###   Advanced Topics : Reshape
###   Pivot Tables