In [1]:
# Imports: 
import numpy as np
import pandas as pd
from numpy.random import randn

In [2]:
# Fix the global NumPy seed so every run draws the same "random" numbers
# (101 is the seed the tutorial uses, which keeps the outputs comparable).
SEED = 101
np.random.seed(SEED)

In [3]:
"""
    Dummy DataFrame
    A DataFrame is a 2-D table with labelled rows (index) and
    labelled columns. Here it holds 5 x 4 random normal values,
    with rows A-E and columns W-Z.
"""
row_labels = list('ABCDE')
col_labels = 'W X Y Z'.split()
df = pd.DataFrame(data=randn(5, 4), index=row_labels, columns=col_labels)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [4]:
# A column can be selected with brackets or as an attribute.
# Only the last expression of a cell is displayed, so just one Series shows below.
# Prefer brackets: attribute access breaks for names that are not valid
# identifiers or that clash with DataFrame attributes/methods.
df['W']
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [5]:
"""
    Another way to think of a DataFrame is as a collection of Series
    that all share the same index. That is why checking the type of a
    single column returns pandas.core.series.Series (each Series is
    built on top of a NumPy array).
"""
type(df['W'])

pandas.core.series.Series

In [6]:
# Passing a list of column names retrieves several columns (as a DataFrame)
df[['W', 'Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [7]:
# Assigning to df['new'] adds a column; here it is the element-wise sum of two columns
df['new'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [8]:
"""
    DataFrame.drop removes rows or columns by label and, by default,
    returns a NEW DataFrame, leaving the original untouched.
      - axis=0 targets rows, axis=1 targets columns
        (columns='new' is the more readable spelling of axis=1)
      - to keep the change, assign the result back to the name;
        inplace=True also works but makes the cell fail on a re-run
      - errors='ignore' makes the cell safe to run again once
        'new' is already gone (otherwise drop raises KeyError)
"""
df = df.drop(columns='new', errors='ignore')
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [9]:
# drop works on rows too (axis=0). Without inplace=True or re-assignment
# it only returns a copy without row 'E'; df itself keeps all five rows.
df.drop('E', axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [10]:
"""
    Why rows are axis 0 and columns are axis 1:
    df.shape returns a tuple just like a NumPy array does,
    (rows, columns). Rows sit at position 0 of that tuple and
    columns at position 1.
"""
df.shape

(5, 4)

In [11]:
# There are two ways to select a row; either way the row comes back as a Series.
# Only the last expression in a cell is displayed, so just the iloc result is shown.
# .loc selects by index label
df.loc['A']
# .iloc selects by integer position (0 is the first row, i.e. 'A')
df.iloc[0]


W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [14]:
 # Just like NumPy, a single value is accessed with a (row, column) pair: row label first
df.loc['B', 'Y']

-0.8480769834036315

In [16]:
# Passing lists of row and column labels to .loc returns that sub-table
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


In [17]:
# Comparing a DataFrame with a scalar returns a same-shaped DataFrame of booleans,
# which is the building block for conditional selection
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [18]:
# Store the boolean mask (True where a value is greater than 0) in a variable
booldf = df > 0

In [19]:
df[booldf]
# Indexing with a boolean DataFrame keeps the values where the mask is True
# and replaces the others with NaN

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [21]:
# Comparing a single column gives a boolean Series: one True/False per row
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [22]:
df[df['W'] > 0]  # Keeps only the rows where W > 0, so row C (W < 0) is dropped

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [23]:
# Grab all rows where column Z is less than 0
df[df['Z'] < 0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [24]:
# Combine conditions with & (element-wise AND); each condition needs its own
# parentheses, and Python's `and` does not work on Series
df[(df['W'] > 0) &  (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [25]:
# Combine conditions with | (element-wise OR); each condition needs its own
# parentheses, and Python's `or` does not work on Series
df[(df['W'] > 0) |  (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [27]:
# reset_index moves the current index into a column named 'index' and
# replaces it with the default 0..n-1 integer index
df.reset_index()  # Returns a new DataFrame; pass inplace=True (or re-assign) to make it permanent

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [28]:
# List of state abbreviations to use as new labels
newind = ['CA', 'NY', 'WY', 'OR', 'CO']
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [29]:
# Add the labels as a regular column (the list has one entry per row)
df['States'] = newind
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [30]:
# To make States the index, use set_index. It returns a new DataFrame,
# so assign the result (e.g. df = df.set_index('States')) to keep it.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509
