# Configuring pandas

In [2]:
# import numpy and pandas
import numpy as np
import pandas as pd

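# used for dates (note: the second import below rebinds the name
# `datetime` to the class, shadowing the module imported first)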
import datetime
from datetime import datetime, date

# Configure how pandas renders output: turn off the HTML repr in the
# notebook and cap the display at 8 columns, 10 rows and 80 characters
pd.set_option(
    'display.notebook_repr_html', False,
    'display.max_columns', 8,
    'display.max_rows', 10,
    'display.width', 80,
)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

# The pandas Series

In [3]:
# create a four item Series
s = pd.Series([1, 2, 3, 4])
s

0    1
1    2
2    3
3    4
dtype: int64

In [4]:
# get value at label 1
s[1]

2

In [5]:
# return a Series with the row with labels 1 and 3
s[[1, 3]]

1    2
3    4
dtype: int64

In [6]:
# build the Series again, this time labelling the four values
# with an explicit index of strings
s = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s

a    1
b    2
c    3
d    4
dtype: int64

In [7]:
# look up the items in the series having index labels 'a' and 'd'
s[['a', 'd']]

a    1
d    4
dtype: int64

In [8]:
# a Series that has non-integer index labels can still be
# queried by 0-based position like an array; use .iloc for this,
# because passing integers to plain [] on such a Series (the
# positional fallback) is deprecated since pandas 2.1
s.iloc[[1, 2]]

b    2
c    3
dtype: int64

In [None]:
# get only the index of the Series
s.index

In [None]:
# create the index for a Series: a range of dates
# between the two specified dates (inclusive)
dates = pd.date_range(start='2016-04-01', end='2016-04-06')
dates

In [None]:
# temperature readings, one value per date in the index
temps1 = pd.Series(data=[80, 82, 85, 90, 83, 87], index=dates)
temps1

In [None]:
# what's the temperature for 2016-04-04?
temps1['2016-04-04']

In [None]:
# a second set of readings that shares the same date index
temps2 = pd.Series(data=[70, 75, 69, 83, 79, 77], index=dates)
# subtracting aligns the two Series on their index labels
# and computes the difference at each matching label
temp_diffs = temps1 - temps2
temp_diffs

In [None]:
# a value can also be fetched by integer position, as if the
# series was an array; .iloc states the positional intent
# explicitly (an integer passed to plain [] on a Series with a
# non-integer index is deprecated since pandas 2.1)
temp_diffs.iloc[2]

In [None]:
# calculate the mean of the values in the Series
temp_diffs.mean()

# The pandas DataFrame

In [None]:
# combine the two Series objects temps1 and temps2 into a
# DataFrame, using each keyword as the column name
temps_df = pd.DataFrame(dict(Missoula=temps1, Philadelphia=temps2))
temps_df

In [None]:
# get the column with the name Missoula
temps_df['Missoula']

In [None]:
# likewise we can get just the Philadelphia column
temps_df['Philadelphia']

In [None]:
# return both columns in a different order
temps_df[['Philadelphia', 'Missoula']]

In [None]:
# retrieve the Missoula column through property syntax
temps_df.Missoula

In [None]:
# calculate the temperature difference between the two cities
temps_df.Missoula - temps_df.Philadelphia

In [None]:
# add a column to temps_df which contains the difference in temps
temps_df['Difference'] = temp_diffs
temps_df

In [None]:
# get the columns, which is also an Index object
temps_df.columns

In [None]:
# slice the Difference column for the rows at positions 1 up to,
# but not including, 4 (as though it is an array)
temps_df.Difference[1:4]

In [None]:
# get the row at array position 1
temps_df.iloc[1]

In [None]:
# the names of the columns have become the index
# they have been 'pivoted'
temps_df.iloc[1].index

In [None]:
# retrieve row by index label using .loc
temps_df.loc['2016-04-05']

In [None]:
# get the values in the Difference column in rows 1, 3 and 5
# using 0-based location
temps_df.iloc[[1, 3, 5]].Difference

In [None]:
# which values in the Missoula column are > 82?
temps_df.Missoula > 82

In [None]:
# return the rows where the temps for Missoula > 82
temps_df[temps_df.Missoula > 82]

# Loading data from a CSV file into a DataFrame

In [None]:
# display the contents of goog.csv
# which command to use depends on your OS
!head data/goog.csv # on non-windows systems
#!type data/goog.csv # on windows systems, all lines

In [None]:
# read the contents of the file into a DataFrame
df = pd.read_csv('data/goog.csv')
df

In [None]:
# the contents of the date column
df.Date

In [None]:
# we can get the first value in the date column
df.Date[0]

In [None]:
# it is a string
type(df.Date[0])

In [None]:
# read the data and tell pandas the date column should be 
# a date in the resulting DataFrame
df = pd.read_csv('data/goog.csv', parse_dates=['Date'])
df

In [None]:
# verify the type now is date
# in pandas, this is actually a Timestamp
type(df.Date[0])

In [None]:
# unfortunately the index is numeric which makes
# accessing data by date more complicated
df.index

In [None]:
# read in again, now specify the Date column as being the
# index of the resulting DataFrame
df = pd.read_csv('data/goog.csv', 
                 parse_dates=['Date'], 
                 index_col='Date')
df

In [None]:
# and the index is now a DatetimeIndex
df.index

# Visualization

In [None]:
# plots the values in the Close column
df.Close.plot();