## A Series is basically a numpy array but with an extra index column attached to it. By default the index is a zero based sequential number

In [None]:
from pandas import Series

# No index is supplied, so pandas labels the four values 0, 1, 2, 3.
data = Series([1, 2, 3, 4])

# One item per line: the Series, its raw values, its index, and the element labelled 1.
print(data, data.values, data.index, data[1], sep='\n')


## The index parameter allows you to pass a specific list of indices instead, making the Series sort of like a dictionary object 

In [None]:
# The explicit labels 'a'..'d' replace the default 0..3 index.
data = Series([1, 2, 3, 4], index=list('abcd'))

# One item per line: the Series, its raw values, its index, and the element labelled 'c'.
print(data, data.values, data.index, data['c'], sep='\n')

## A Series can be created by passing a dictionary; the dictionary keys are used as the Series index

In [None]:
# Keys become the index labels, values become the data.
cities = dict(Dublin=200000, Athlone=15000, Galway=700000)
series1 = Series(data=cities)
print(series1)

## If the index parameter is passed then only those elements from the dictionary that have matching keys are included, and missing ones have NaN values in the Series 

In [None]:
cities = dict(Dublin=200000, Athlone=15000, Galway=700000)

# 'Waterford' is not a key of cities (it comes out as NaN) and 'Galway' is not
# requested, so it is left out of the result.
indexes = ['Dublin', 'Athlone', 'Waterford']
series2 = Series(data=cities, index=indexes)
print(series2)

## isnull() and notnull() return another Series of True or False values depending on whether each value is null or not. Passing that boolean Series to the [] indexer gets back just the null or not-null values, or even lets you replace those values with another value

In [None]:
# cities and indexes come from the previous cell.
series2 = Series(data=cities, index=indexes)

# Every view is followed by one blank line, except the last.
print(series2, end='\n\n')
print(series2.isnull(), end='\n\n')                 # True where the value is missing
print(series2.notnull(), end='\n\n')                # True where a value is present
print(series2[series2.isnull()], end='\n\n')        # keep only the missing entries
print(series2[series2.notnull()])                   # keep only the present entries

# Assigning through the boolean mask overwrites just the missing entries.
series2[series2.isnull()] = 0
print(series2)

## LAB 1: ## 

### The aim of this exercise is to gain some experience of working with the Pandas Series data structure.

#### 1.	Define a Series object holding the values 1 to 10.
#### 2.	Display the data values of the Series object defined in Step 1.
#### 3.	Display the index values of the Series object defined in Step 1.
#### 4.	Define a new Series object holding the values 1 to 10, with the corresponding index values set ‘a’ through to ‘j’.
#### 5.	Display the data values and index of the Series object of Step 4.
#### 6.	Access the third and fifth elements of the Series objects using their index.
#### 7.	Define the following dictionary: {'Dublin': 200000, 'Athlone': 15000, 'Galway': 700000}.
#### 8.	Define the following array: ['Dublin', 'Athlone', 'Waterford'].
#### 9.	Now, construct a Series object using the dictionary in Step 7 and the index in Step 8.
#### 10.	Display the Series object defined in Step 9.
#### 11.	Use the Series notnull() and isnull() methods to display which elements are not null and null, respectively, for the Series object defined in Step 9.

<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use a native Python function instead of enumerating the values 1 - 10 by hand; a numpy function might be even better
<br>
Remember a Python trick that a string can be turned into a list of letters
<br>
Remember numeric indexes are zero based
<br>
<br>
</p>
</details>


<details><summary>Click for <b>code</b></summary>
<p>

```python
import numpy as np
import pandas as pd

# Steps 1-3: the values 1..10 with the default zero-based index.
s1 = pd.Series(range(1,11))
print(s1)
print(s1.values)
print(s1.index)

# Steps 4-5: the same values labelled 'a'..'j' (list() splits the string into letters).
s2 = pd.Series(np.arange(1,11), index=list('abcdefghij'))
print(s2)
print(s2.values)
print(s2.index)

# Step 6: third and fifth elements, i.e. labels 2 and 4 on the default index
# and labels 'c' and 'e' on the lettered one.
print(s1[2], s1[4], s2['c'], s2['e'])

# Steps 7-8. 'Belfast' is an extra key that is not part of the exercise; it is
# not listed in a1, so it does not appear in s3.
d1 = {'Dublin': 200000, 'Athlone': 15000, 'Galway': 700000, 'Belfast':None}
a1 = ['Dublin', 'Athlone', 'Waterford']

# Steps 9-10: only the keys named in a1 are kept; 'Waterford' has no value, so it is NaN.
s3 = pd.Series(d1, index = a1)
print(s3)

# Step 11: not-null flags first, then null flags, matching the order in the exercise.
print(s3.notnull(), s3.isnull())
```
</p>
</details>

## You can pass in a dictionary of lists to define a DataFrame in column strips

In [None]:
from pandas import DataFrame

# Each key is a column name; each list holds that column's values from top to bottom.
data = dict(
    team=['Leicester', 'Manchester City', 'Arsenal'],
    player=['Vardy', 'Aguero', 'Sanchez'],
    goals=[24, 22, 19],
)
football = DataFrame(data=data)
print(football)

## Or pass in a list of dictionaries to define the DataFrame by rows

In [None]:
# Each dictionary is one row; its keys name the columns.
data = [
    dict(team='Leicester', player='Vardy', goals=24),
    dict(team='Manchester City', player='Aguero', goals=22),
    dict(team='Arsenal', player='Sanchez', goals=19),
]
football = DataFrame(data=data)
print(football)

## Just like a Series you can supply the index parameter to override the default zero based numeric values

In [None]:
from pandas import DataFrame

data = dict(
    team=['Leicester', 'Manchester City', 'Arsenal'],
    player=['Vardy', 'Aguero', 'Sanchez'],
    goals=[24, 22, 19],
)
# The row labels 'one', 'two', 'three' replace the default 0, 1, 2.
football = DataFrame(data=data, index='one two three'.split())
print(football)

## Just like you can see if a key exists in a dictionary, you can check to see if either a row or column index exists in a DataFrame

In [None]:
print(football)
# `in` tests label membership: first against the column labels, then the row labels.
print('player' in football.columns, 'three' in football.index, sep='\n')

## LAB 2: ## 

### The aim of this exercise is to gain some experience of working with Pandas DataFrame data structure.

#### 1.	Try to encode the following data into a DataFrame object:

| id | name | age |
| -- | ---- | --- |
| 1 | Jack | 30 |
| 2 | Mary| 40 |
| 3 | Mike | 35 |
| 4 | Susan | 25 |


<br>
<details><summary>Click for <b>hint</b></summary>
<p>
First create either a list of dictionaries or dictionary of lists to encode the data
<br>
You want id to be the index column so exclude that from your dictionary and supply it separately<br>
<br>
<br>
</p>
</details>


<details><summary>Click for <b>code</b></summary>
<p>

```python
import pandas as pd
from pandas import DataFrame

# Dictionary of lists: one key per column; the ids are supplied separately as the index
d = {'name':['Jack', 'Mary', 'Mike', 'Susan'],
     'age' : [30, 40, 35, 25]}

df = DataFrame(d, index = [1, 2, 3, 4])
print(df)

# List of dictionaries: one dictionary per row
d = [{'name' : 'Jack', 'age' : 30}
    ,{'name' : 'Mary', 'age' : 40}
    ,{'name' : 'Mike', 'age' : 35}
    ,{'name' : 'Susan', 'age' : 25}
    ]

df = DataFrame(d, index = [1, 2, 3, 4])
print(df)

# Or, if every row dictionary also carries its id, a list comprehension can pull the ids
# out for the index, while columns= keeps 'id' from also showing up as a regular column

d = [{'id' : 1, 'name' : 'Jack', 'age' : 30}
    ,{'id' : 2, 'name' : 'Mary', 'age' : 40}
    ,{'id' : 3, 'name' : 'Mike', 'age' : 35}
    ,{'id' : 4, 'name' : 'Susan', 'age' : 25}
    ]

df = DataFrame(d, index = [x['id'] for x in d], columns = ['name', 'age'])
print(df)

```
</p>
</details>

## Data from a Series can be retrieved by position (integers) or by label (index values)

In [None]:
import numpy as np
data = Series(np.arange(4.0), index=['a','b','c','d'])
print(data)
# Positional access goes through .iloc: on a string-labelled Series a bare data[2]
# relies on a positional fallback that pandas has deprecated.
print('item 2', data.iloc[2], sep = '\n')
print('item b', data['b'], sep = '\n')             # label lookup
print('items 0:2', data.iloc[0:2], sep = '\n')     # positions 0 and 1 (end excluded)
print('items b & d', data[['b','d']], sep = '\n')  # a list of labels
print('True/False', data[[True, False, True, False]], sep = '\n')  # keeps entries flagged True
print('Less than 2', data < 2, sep = '\n')         # the comparison yields a boolean Series
print('Data Less than 2', data[data<2], sep = '\n')  # which can then be used as a mask


## The same is true for DataFrames

In [None]:
# A 3x3 grid holding 0..8, with labelled rows and columns.
data = DataFrame(np.arange(9).reshape(3, 3), index=list('abc'), columns=['one', 'two', 'three'])
print(data)
print('three', data['three'], sep='\n')  # a single label fetches that column for all rows
print('one & three', data[['one', 'three']], sep='\n')  # several columns need a list, hence the double brackets

print('But if you give it a range it interprets it as a range of rows not columns')
print(data[0:2])

# A boolean list selects rows; the comparison builds such a mask from a column;
# the mask then filters the rows.
print(data[[True, False, True]], data['two'] > 1, data[data['two'] > 1], sep='\n')

## All of these fetch the first row and all columns


In [None]:
# .ix was removed in pandas 1.0; .loc (by label) and .iloc (by position) cover
# both of its former uses.
display(data.loc['a'], data.iloc[0])


## All of these retrieve the second column for all rows


In [None]:
# .ix was removed in pandas 1.0; use .loc for the column label and .iloc for
# the column position.
display(data.loc[:,'two'], data.iloc[:,1])

## All these would retrieve the first two rows and the second column

In [None]:
# .ix was removed in pandas 1.0, so only .loc and .iloc are used here.
# A .loc label slice includes its end label, so 'a':'b' (not 'a':'c') gives two rows;
# an .iloc slice excludes its end position, so 0:2 also gives two rows.
display(data.loc[['a','b'],'two'], data.loc['a':'b','two'], data.iloc[0:2,1])


## Like numpy arrays, Series can be operated on but instead of the array sizes needing to match, it uses the index of the Series to decide what matches with what for the math operation

In [None]:
data1 = Series({'a': 1.0, 'd': 2.0, 'e': 3.0})
data2 = Series({'a': 2.0, 'b': 3.0, 'c': 4.0, 'e': 5.0})
# Values are paired by label, so only 'a' and 'e' have a value on both sides.
display(data1.add(data2))

## DataFrames are just a bunch of Series columns with the same index, so you can also operate on a DataFrame the same way

In [None]:
data1 = DataFrame(np.arange(9.0).reshape(3, 3), index=['one', 'two', 'three'], columns=['a', 'b', 'c'])
data2 = DataFrame(np.arange(12.0).reshape(4, 3), index=['one', 'two', 'three', 'four'], columns=['a', 'c', 'e'])
# Cells are paired by row label and column label; the two frames only share
# columns 'a' and 'c' and rows 'one'..'three'.
display(data1.add(data2))

In [None]:
# Both inputs, then their sum with fill_value=0 passed to add().
display(data1, data2, data1.add(data2, fill_value=0))

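## You can apply a function with the apply function, similar to the Python map function. On a DataFrame the function receives one column at a time, but an arithmetic function such as doubling still ends up transforming every element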

In [None]:
# apply hands the lambda one column at a time; multiplying a column by 2 doubles each value in it.
display(data1.apply(lambda column: column * 2))


## Using the mean function it would apply it to each column instead of each element

In [None]:
# No axis is given, so apply hands np.mean one column at a time and the result has one mean per column.
display(data1.apply(np.mean))

## That would be the same as supplying the axis = 0 parameter

In [None]:
# axis = 0 is apply's default: np.mean receives each column, giving one mean per column.
display(data1.apply(np.mean, axis = 0))

## Axis 1 would yield the row mean instead

In [None]:
# axis = 1 hands np.mean each row instead, giving one mean per row label.
display(data1.apply(np.mean, axis = 1))

## For built in functions like mean, sum, etc. we could just directly call them, but if you needed to make a custom function, then you'd need apply

In [None]:
# Built-in reductions can be called directly: per column first, then per row.
display(data1.mean(), data1.mean(axis=1))

# There is no built-in for "max squared minus min squared", so apply runs the lambda on each column.
display(data1.apply(lambda col: col.max() ** 2 - col.min() ** 2))


## You can sort by the row index in ascending or descending order

In [None]:
# Row labels in ascending order first, then in descending order.
print(data1.sort_index(ascending=True), data1.sort_index(ascending=False), sep='\n')

## You can sort by a column value or multiple columns

In [None]:
data1 = DataFrame(np.arange(9.0).reshape(3, 3), index=['one', 'two', 'three'], columns=['a', 'b', 'c'])
# Set row 'three' to 1 in columns b and c: column b now has a tie (rows 'one' and
# 'three'), so column c decides their order in the multi-column sorts.
data1.loc['three', ['b', 'c']] = 1

display(
    data1.sort_values(by='b', ascending=True),
    data1.sort_values(by=['b', 'c'], ascending=True),
    data1.sort_values(by=['b', 'c'], ascending=[True, False]),  # b ascending, c descending
)


In [None]:
data2 = DataFrame(dict(b=[1, 4, 3, 2], a=[6, 9, 20, 3], c=[7, 2, 8, 15]))
# rank() ranks the values within each column; axis=1 ranks them within each row.
display(data2.rank(), data2.rank(axis=1))


## You can read from a file and there are a ton of different parameters to experiment with to get it to read just right

In [None]:
# No earlier cell binds the name `pd` (only Series and DataFrame are imported), so import it here.
import pandas as pd

display(pd.read_csv('sample.csv'))                   # defaults: the first line supplies the column names
display(pd.read_csv('sample.csv', header = 0))       # the same, stated explicitly
display(pd.read_csv('sample.csv', header = None))    # no header line: every line is data, columns are numbered
display(pd.read_csv('sample.csv', header = None, index_col = 0))  # the first column becomes the row index
# names= supplies the column labels (assumes sample.csv has four columns); the first, 'one', becomes the index.
display(pd.read_csv('sample.csv', header = None, index_col = 0, names = ['one', 'two', 'three', 'four']))


In [None]:
import json

# The with-block closes the file even if parsing fails; the original left the handle open.
with open('example.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
print(data)

# The list stored under the 'customers' key becomes the DataFrame's data.
customers = DataFrame(data['customers'])
display(customers)

## You can also read from SQL tables, you just need the correct library for the version of SQL you want to connect to

In [None]:
import sqlite3
from contextlib import closing

# No earlier cell binds the name `pd`, so import it here.
import pandas as pd

# closing() guarantees cn.close() runs even if one of the statements raises.
with closing(sqlite3.connect('test.sqlite')) as cn:
    curs = cn.cursor()
    # test.sqlite persists between runs; drop any table left by a previous run so
    # that "create table" does not fail and the rows are not inserted twice.
    curs.execute("drop table if exists names")
    curs.execute("create table names (id int, name varchar(20))")
    curs.execute("insert into names values(1, 'Alice'), (2, 'Bob')")
    cn.commit()

    # Plain DB-API route: rows come back as a list of tuples.
    curs.execute("select * from names")
    names = curs.fetchall()
    print(names)

    # pandas route: the same query returned as a DataFrame.
    names2 = pd.read_sql_query("select * from names", cn)
    display(names2)


In [None]:
# Column 'b' is missing a value in the first and last rows.
data = DataFrame(
    [[1, np.nan],
     [3, 4],
     [5, np.nan]],
    columns=list('ab'),
)
# Column totals first, then row totals.
print(data.sum(), data.sum(axis=1), sep='\n')
display(data.describe())

## Pandas_datareader is a downloadable package that can fetch stock information

In [None]:
# ! pip install pandas_datareader
import datetime

import pandas_datareader.data as web

# Start of the window: 90 days before the moment this cell runs.
tod = datetime.datetime.now()
d = datetime.timedelta(days=90)
a = tod - d

display(web.get_data_yahoo('AAPL', start=a))

# One entry per ticker, keyed by ticker symbol, so the results can be turned into a DataFrame.
# NOTE(review): presumably each get_data_yahoo result is a DataFrame of prices - confirm.
all_data = dict(
    (ticker, web.get_data_yahoo(ticker, start=a))
    for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']
)


In [None]:
# One column per ticker, each holding that ticker's 'Adj Close' entry from all_data.
stock_close = DataFrame({symbol: prices['Adj Close'] for symbol, prices in all_data.items()})

# First five rows of the closing values, then of their row-over-row percentage change.
display(stock_close.head(5), stock_close.pct_change().head(5))

In [None]:
# Pairwise correlation between the ticker columns, then pairwise covariance.
display(stock_close.corr(), stock_close.cov())

## Handling missing data

In [None]:
from numpy import nan as NA

data = Series([1, NA, 2, 3, 4, NA])
display(data)

# dropna() returns a new Series without the NaNs; data itself still has them.
data1 = data.dropna()
display(data1, data)

# inplace=True removes the NaNs from data itself.
data.dropna(inplace=True)
display(data)

## Works the same for DataFrames, but with some additional options

In [None]:
data = DataFrame([
    [1, 2, 3],
    [NA, 5, NA],
    [NA, NA, NA],
    [10, 11, 12],
])
display(data)

display(
    data.dropna(how='all'),  # row-wise: a row is dropped only when every column is NaN
    data.dropna(how='any'),  # row-wise: a row is dropped when any column is NaN
)

data1 = DataFrame([
    [1, 2, NA],
    [NA, 5, NA],
    [NA, 12, NA],
    [10, 11, NA],
])
display(data1)
display(
    data1.dropna(how='all', axis=1),  # column-wise: a column is dropped only when every row is NaN
    data1.dropna(how='any', axis=1),  # column-wise: a column is dropped when any row is NaN
)


## Instead of removing missing data you may wish to replace it with other values

In [None]:
display(data)

# A scalar fills every NaN with the same value.
filled = data.fillna(value=0)
display(filled)

# A dictionary gives a fill value per column label: 10 for column 0, 20 for column 1.
filled = data.fillna(value={0: 10, 1: 20})
display(filled)

# data.mean() has one entry per column, so each NaN gets its own column's mean.
filled = data.fillna(value=data.mean())
display(filled)



## Replacing the nulls with the row mean is a little trickier, but transposing flips the rows and columns so you can calculate the row means as if they were column means

In [None]:
data = DataFrame([
    [1, 2, 3],
    [4, 5, NA],
    [6, NA, NA],
    [10, NA, 12],
])
display(data)

# fillna matches a Series of fill values against column labels, so passing the
# row means to data.fillna directly would not fill row by row. Transposing makes
# the rows into columns; the row means then line up, and the result is flipped back.
filled = data.T.fillna(data.mean(axis=1)).T
display(filled)


## Homework: ## 
#### 1.	Read the file categories.csv. 
#### 2.	Use the first column as the index column for the DataFrame
#### 3.	Print the DataFrame, and the first two elements only of the DataFrame
#### 4. Load the products.json file
#### 5.	Display only the products in category 1
#### 6. Display the products from highest to lowest price
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Check the parameters for read_csv to make sure you read the column names right and use the proper column for the index column
<br>
Remember there are many functions to pull a slice from a DataFrame so experiment until you find the right one
<br>
The syntax to filter on a condition is odd, but it's in the slides
<br>
There are several functions to sort, so choose the right one to sort on price
<br>
<br>
</p>
</details>

