## A Series is basically a numpy array but with an extra index column attached to it. By default the index is a zero based sequential number

In [32]:
import numpy as np
from pandas import Series
# Default index: a zero-based RangeIndex, so the label 1 is the second element
data = Series([10,20,30,40])
print(data)
print(type(data.values))   # the values are stored in a numpy array
print(data.values)
print(data.index)
print(data[1])

# Custom integer index: [] now looks values up by label, not by position
data = Series([10,20,30,40], index=[11,44,55,2])
print(data)
try:
    print(data[1])
except KeyError as err:
    # 1 is not one of the labels (11, 44, 55, 2), so the lookup fails.
    # Catching the error shows the message without aborting the cell.
    print('KeyError:', err)



0    10
1    20
2    30
3    40
dtype: int64
<class 'numpy.ndarray'>
[10 20 30 40]
RangeIndex(start=0, stop=4, step=1)
20
11    10
44    20
55    30
2     40
dtype: int64


KeyError: 1

## The index parameter allows you to pass a specific list of indices instead, making the Series sort of like a dictionary object 

In [12]:
# print(list('abcd'))
# #['ab', 'cd', 'de']
# print('ab,cd,ef'.split(','))

#help(Series)
# The same four labelled values, once as a Series and once as a plain dictionary
data = Series([1,2,3,4], index=list('abcd'))
d = {key: value for key, value in zip('abcd', [1,2,3,4])}
print(data)
# print(data.values)
# print(data.index)
# Both containers are looked up by key in exactly the same way
print(data['c'])
print(d['c'])

['a', 'b', 'c', 'd']
['ab', 'cd', 'ef']
a    1
b    2
c    3
d    4
dtype: int64
3
3


## A Series can be created by passing a dictionary; the dictionary keys are used as the Series index

In [14]:
# Keys become the index labels, values become the data
cities = dict(Dublin=200000, Athlone=15000, Galway=700000)
series1 = Series(cities)
print(series1)

# ...and a Series converts back into a dictionary just as easily
print(dict(series1))

Dublin     200000
Athlone     15000
Galway     700000
dtype: int64
{'Dublin': 200000, 'Athlone': 15000, 'Galway': 700000}


## If the index parameter is passed then only those elements from the dictionary that have matching keys are included, and missing ones have NaN values in the Series 

In [53]:
cities = {'Dublin' : 200000, 'Athlone' : 15000, 'Galway' : 700000}
indexes = ['Dublin', 'Athlone', 'Waterford']
# 'Galway' is dropped because it is not in indexes; 'Waterford' has no
# dictionary entry, so its value becomes NaN
series2 = Series(cities, index=indexes)
print (series2)
print(series2['Waterford'])   # lookup by label
print(series2.iloc[2])        # lookup by position: use .iloc, since plain [2] on a string index is a deprecated fallback
print(series2.isnull())
print(series2.notnull())

# A list (or Series) of booleans acts as a mask: only the True positions are kept
print(series2[[True, False, True]])
print(series2[series2.notnull()])

x = series2.notnull()
print(series2[x])

print('***')
print(series2[['Dublin']])    # list of labels -> Series
print(series2['Dublin'])      # single label -> scalar
print(series2.iloc[[0, 2]])   # list of positions -> Series

print(np.array(series2.iloc[[0, 2]]))


Dublin       200000.0
Athlone       15000.0
Waterford         NaN
dtype: float64
nan
nan
Dublin       False
Athlone      False
Waterford     True
dtype: bool
Dublin        True
Athlone       True
Waterford    False
dtype: bool
Dublin       200000.0
Waterford         NaN
dtype: float64
Dublin     200000.0
Athlone     15000.0
dtype: float64
Dublin     200000.0
Athlone     15000.0
dtype: float64
***
Dublin    200000.0
dtype: float64
200000.0
Dublin       200000.0
Waterford         NaN
dtype: float64
[200000.     nan]


## isnull() and notnull() return another Series of True or False values depending on whether each value is null or not. Passing that Series to the [] indexer gives back just the null or not null values, and assigning to that selection replaces those values with another

In [None]:
# Rebuild the Series so the cell starts from the version that still contains NaN
series2 = Series(cities, index=indexes)
print(series2, end='\n\n')
print(series2.isnull(), end='\n\n')
print(series2.notnull(), end='\n\n')
# The boolean Series works as a mask inside []
print(series2[series2.isnull()], end='\n\n')
print(series2[series2.notnull()])

# Assigning through the mask overwrites only the null entries
series2.loc[series2.isnull()] = 0
print(series2)

## LAB 1: ## 

### The aim of this exercise is to gain some experience of working with the Pandas Series data structure.

#### 1.	Define a Series object holding the values 1 to 10.
#### 2.	Display the data values of the Series object defined in Step 1.
#### 3.	Display the index values of the Series object defined in Step 1.
#### 4.	Define a new Series object holding the values 1 to 10, with the corresponding index values set ‘a’ through to ‘j’.
#### 5.	Display the data values and index of the Series object of Step 4.
#### 6.	Access the third and fifth elements of the Series objects using their index.
#### 7.	Define the following dictionary: {'Dublin': 200000, 'Athlone': 15000, 'Galway': 700000}.
#### 8.	Define the following array: ['Dublin', 'Athlone', 'Waterford'].
#### 9.	Now, construct a Series object using the dictionary in Step 7 and the index in Step 8.
#### 10.	Display the Series object defined in Step 9.
#### 11.	Use the Series notnull() and isnull() methods to display which elements are not null and null, respectively, for the Series object defined in Step 9.

<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Use a native Python function instead of enumerating the values 1 - 10 by hand; a numpy function might be even better
<br>
Remember a Python trick that a string can be turned into a list of letters
<br>
Remember numeric indexes are zero based
<br>
<br>
</p>
</details>


<details><summary>Click for <b>code</b></summary>
<p>

```python
import numpy as np
import pandas as pd

# Steps 1-3: the values 1 to 10 with the default zero-based index
s1 = pd.Series(range(1, 11))
print(s1)
print(s1.index)

# Steps 4-5: the same values labelled 'a' to 'j' (list() splits a string into letters)
s2 = pd.Series(np.arange(1, 11), index=list('abcdefghij'))
print(s2)
print(s2.index)

# Step 6: third and fifth elements - labels 2 and 4 of the default index, 'c' and 'e' of the lettered one
print(s1[2], s1[4], s2['c'], s2['e'])

# Steps 7-10: only the keys listed in a1 are kept; 'Waterford' has no entry and becomes NaN
d1 = dict(Dublin=200000, Athlone=15000, Galway=700000, Belfast=None)
a1 = ['Dublin', 'Athlone', 'Waterford']

s3 = pd.Series(d1, index=a1)
print(s3)

# Step 11: boolean Series flagging the null / not null entries
print(s3.isnull(), s3.notnull())
```
</p>
</details>

## You can pass in a dictionary of lists to define a DataFrame in column strips

In [54]:
from pandas import DataFrame
# One list per column; every list must have the same length
data = dict(
    team=['Leicester', 'Manchester City', 'Arsenal'],
    player=['Vardy', 'Aguero', 'Sanchez'],
    goals=[24, 22, 19],
)
football = DataFrame(data)
print(football)

              team   player  goals
0        Leicester    Vardy     24
1  Manchester City   Aguero     22
2          Arsenal  Sanchez     19


## Or pass in a list of dictionaries to define the DataFrame by rows

In [55]:
# One dictionary per row; the keys supply the column names
data = [
    dict(team='Leicester', player='Vardy', goals=24),
    dict(team='Manchester City', player='Aguero', goals=22),
    dict(team='Arsenal', player='Sanchez', goals=19),
]
football = DataFrame(data)
print(football)

   goals   player             team
0     24    Vardy        Leicester
1     22   Aguero  Manchester City
2     19  Sanchez          Arsenal


## Just like a Series you can supply the index parameter to override the default zero based numeric values

In [68]:
from pandas import DataFrame
data = dict(
    team=['Leicester', 'Manchester City', 'Arsenal'],
    player=['Vardy', 'Aguero', 'Sanchez'],
    goals=[24, 22, 19],
)
# The index argument replaces the default 0, 1, 2 row labels
football = DataFrame(data, index=['one', 'two', 'three'])
# print(football)

# A single column label gives back a Series...
# print(type(football['goals']))
# display(football['goals'])

# ...while a list of labels gives back a DataFrame
# print(type(football[['goals']]))
#display(football[['goals']])

#print(football[['goals', 'player']])
#print(football[1])   # KeyError: there is no column labelled 1

KeyError: 1

## Just like you can see if a key exists in a dictionary, you can check to see if either a row or column index exists in a DataFrame

In [71]:
# A 3x3 block of the numbers 0-8 with lettered row and column labels
print(DataFrame(np.arange(9).reshape(3, 3), index=['a', 'b', 'c'], columns=['x', 'y', 'z']))

   x  y  z
a  0  1  2
b  3  4  5
c  6  7  8


In [None]:
print(football)
# "in" on the DataFrame itself tests the column labels, just like "in" on a dictionary tests its keys
print('player' in football)
# Row labels have to be tested against the index explicitly
print('three' in football.index)

## LAB 2: ## 

### The aim of this exercise is to gain some experience of working with Pandas DataFrame data structure.

#### 1.	Try to encode the following data into a DataFrame object:

| id | name | age |
| -- | ---- | --- |
| 1 | Jack | 30 |
| 2 | Mary| 40 |
| 3 | Mike | 35 |
| 4 | Susan | 25 |


<br>
<details><summary>Click for <b>hint</b></summary>
<p>
First create either a list of dictionaries or dictionary of lists to encode the data
<br>
You want id to be the index column so exclude that from your dictionary and supply it separately<br>
<br>
<br>
</p>
</details>


<details><summary>Click for <b>code</b></summary>
<p>

```python
import pandas as pd
from pandas import DataFrame

# Dictionary of lists: one list per column, with the ids supplied separately as the index
d = dict(name=['Jack', 'Mary', 'Mike', 'Susan'],
         age=[30, 40, 35, 25])

df = DataFrame(d, index=[1, 2, 3, 4])
print(df)

# List of dictionaries: one dictionary per row
d = [{'name': name, 'age': age}
     for name, age in zip(['Jack', 'Mary', 'Mike', 'Susan'], [30, 40, 35, 25])]

df = DataFrame(d, index=[1, 2, 3, 4])
print(df)

# Or if each row dictionary also carries the id, a list comprehension can pull the
# id values out for the index, and columns= keeps id out of the data columns

d = [dict(id=1, name='Jack', age=30),
     dict(id=2, name='Mary', age=40),
     dict(id=3, name='Mike', age=35),
     dict(id=4, name='Susan', age=25)]

df = DataFrame(d, index=[row['id'] for row in d], columns=['name', 'age'])
print(df)

```
</p>
</details>

In [126]:
# d1 = DataFrame(
#     [{'id':1, 'name':'Jack', 'age':30}
#     ,{'id':2, 'name':'Mary', 'age':40}]
# )

# d2 = DataFrame( {'id' : [1, 2, 3, 4]
#                   ,'name' : ['Jack', 'Mary', 'Mike', 'Susan']
#                   , 'age' : [30,40,35,25]}
# )

# display(d1)
# display(d2)


d = [{'id' : 1, 'name' : 'Jack', 'age' : 30, 'amount':10}
    ,{'id' : 2, 'name' : 'Mary', 'age' : 40, 'amount':150}
    ,{'id' : 3, 'name' : 'Adam', 'age' : 35, 'amount':140}
    ,{'id' : 4, 'name' : 'Susan', 'age' : 25, 'amount':120}
    ]

# Use each record's 'id' as the row label and keep the other fields as columns.
# Only the numeric columns are cast to float: dtype=float on the constructor
# applies to every column, including the string column 'name', which cannot be
# converted and is rejected by newer pandas versions.
df = DataFrame(d, index=[x['id'] for x in d], columns=['name', 'age', 'amount']).astype({'age': float, 'amount': float})
# Alternative: label the rows by name instead of id
#df = DataFrame(d, index=[x['name'] for x in d], columns=['id', 'age', 'amount']).astype(float)

# Other selections worth trying (uncomment one at a time):
# display(df[['name', 'age']])              # list of column labels
# display(df[1:3])                          # a slice selects rows by position
# display(df[[True, False, True, False]])   # boolean mask over the rows
# display(df[df['age'] < 40])               # mask built from a condition
# print(df['age'])                          # one column as a Series
# print(df.iloc[1])                         # one row by position
# print(df.loc[1])                          # one row by label
# display(df.iloc[1:3][['name', 'age']])    # rows by position, then columns by label
# display(df.iloc[1:3, [0, 1]])             # rows and columns by position
# display(df.loc[2:3, ['name', 'age']])     # rows and columns by label
# display(df.iloc[:, [0, 2]])               # every row, columns by position
display(df)

# Two equivalent ways of keeping every row but only the name and amount columns
display(df.loc[:, ['name', 'amount']], df[['name', 'amount']])


Unnamed: 0,name,age,amount
1,Jack,30.0,10.0
2,Mary,40.0,150.0
3,Adam,35.0,140.0
4,Susan,25.0,120.0


Unnamed: 0,name,amount
1,Jack,10.0
2,Mary,150.0
3,Adam,140.0
4,Susan,120.0


Unnamed: 0,name,amount
1,Jack,10.0
2,Mary,150.0
3,Adam,140.0
4,Susan,120.0


## Data from a Series can be retrieved by integer position or by index label

In [None]:
import numpy as np
data = Series(np.arange(4.0), index=['a','b','c','d'])
print(data)
# Positional access goes through .iloc: a bare data[2] on a string-labelled
# index relies on a deprecated fallback from label lookup to position
print('item 2', data.iloc[2], sep = '\n')
print('item b', data['b'], sep = '\n')                  # single label
print('items 0:2', data[0:2], sep = '\n')               # an integer slice selects by position
print('items b & d', data[['b','d']], sep = '\n')       # list of labels
print('True/False', data[[True, False, True, False]], sep = '\n')   # boolean mask
print('Less than 2', data < 2, sep = '\n')              # a comparison builds a boolean Series...
print('Data Less than 2', data[data<2], sep = '\n')     # ...which can then be used as the mask


## The same is true for DataFrames

In [None]:
data = DataFrame(np.arange(9).reshape(3, 3), index=list('abc'), columns=['one', 'two', 'three'])
print(data)
print('three', data['three'], sep='\n')  # a single label fetches that column for every row
print('one & three', data[['one', 'three']], sep='\n')  # a list of labels (note the double brackets) fetches several columns

print('But if you give it a range it interprets it as a range of rows not columns')
print(data[0:2])

# A boolean list or Series also filters rows
print(data[[True, False, True]])
print(data['two'].gt(1))
print(data[data['two'].gt(1)])

## All of these fetch the first row and all columns


In [None]:
# The old .ix indexer has been removed from pandas:
# use .loc for a row label and .iloc for a row position
display(data.loc['a'], data.iloc[0])


## All of these retrieve the second column for all rows


In [None]:
# The old .ix indexer has been removed from pandas:
# ":" keeps every row; the column is chosen by label (.loc) or by position (.iloc)
display(data.loc[:,'two'], data.iloc[:,1])

## All these would retrieve the first two rows and the second column

In [None]:
# The old .ix indexer has been removed from pandas, so only .loc and .iloc are used.
# Label slices include their end point, so 'a':'b' is two rows ('a':'c' would be three),
# whereas the positional slice 0:2 stops before position 2.
display(data.loc[['a','b'],'two'], data.loc['a':'b','two'], data.iloc[0:2,1])


## Like numpy arrays, Series can be operated on but instead of the array sizes needing to match, it uses the index of the Series to decide what matches with what for the math operation

In [None]:
# Only the labels 'a' and 'e' exist in both Series; every other label in the sum is NaN
data1 = Series({'a': 1.0, 'd': 2.0, 'e': 3.0})
data2 = Series({'a': 2.0, 'b': 3.0, 'c': 4.0, 'e': 5.0})
display(data1.add(data2))

## DataFrames are just a bunch of Series columns with the same index, so you can also operate on a DataFrame the same way

In [127]:
# The frames share columns 'a' and 'c' and rows 'one', 'two', 'three';
# cells that exist in only one frame come out as NaN
data1 = DataFrame(np.arange(9.0).reshape(3, 3), index=['one', 'two', 'three'], columns=['a', 'b', 'c'])
data2 = DataFrame(np.arange(12.0).reshape(4, 3), index=['one', 'two', 'three', 'four'], columns=['a', 'c', 'e'])
display(data1.add(data2))

Unnamed: 0,a,b,c,e
four,,,,
one,0.0,,3.0,
three,12.0,,15.0,
two,6.0,,9.0,


In [128]:
# fill_value=0 stands in for a cell missing from one frame, so only cells
# missing from both frames remain NaN
display(data1, data2, data1.add(data2, fill_value=0))

Unnamed: 0,a,b,c
one,0.0,1.0,2.0
two,3.0,4.0,5.0
three,6.0,7.0,8.0


Unnamed: 0,a,c,e
one,0.0,1.0,2.0
two,3.0,4.0,5.0
three,6.0,7.0,8.0
four,9.0,10.0,11.0


Unnamed: 0,a,b,c,e
four,9.0,,10.0,11.0
one,0.0,1.0,3.0,2.0
three,12.0,7.0,15.0,8.0
two,6.0,4.0,9.0,5.0


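## You can apply a function with the apply function, similar to the Python map function. apply hands the function one whole column at a time; the multiplication below is vectorised, so every element ends up doubled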

In [130]:
# apply returns a new DataFrame; showing data1 before and after confirms it is left unchanged
doubled = data1.apply(lambda col: col * 2)
display(data1, doubled, data1)


Unnamed: 0,a,b,c
one,0.0,1.0,2.0
two,3.0,4.0,5.0
three,6.0,7.0,8.0


Unnamed: 0,a,b,c
one,0.0,2.0,4.0
two,6.0,8.0,10.0
three,12.0,14.0,16.0


Unnamed: 0,a,b,c
one,0.0,1.0,2.0
two,3.0,4.0,5.0
three,6.0,7.0,8.0


## Using an aggregate function such as sum, apply works on each column as a whole instead of on each element

In [131]:
# np.sum collapses each column to a single number, giving one total per column
column_totals = data1.apply(np.sum)
display(column_totals)

a     9.0
b    12.0
c    15.0
dtype: float64

## That would be the same as supplying the axis = 0 parameter

In [147]:
# axis=1 hands each row to the function, so the result is max - min per row
display(data1, data1.apply(lambda row: row.max() - row.min(), axis=1))

Unnamed: 0,a,b,c
one,0.0,1.0,2.0
two,3.0,4.0,5.0
three,6.0,7.0,8.0


one      2.0
two      2.0
three    2.0
dtype: float64

## Axis 1 would yield the row sums instead

In [144]:
# Pick a subset of columns first, then aggregate each selected column
print(data1.loc[:, ['a', 'b']].sum())
print(data1.loc[:, ['a', 'c']].mean())

a     9.0
b    12.0
dtype: float64
a    3.0
c    5.0
dtype: float64


In [133]:
# axis='columns' is the named form of axis=1: the function receives one row at a time
display(data1.apply(np.sum, axis='columns'))

one       3.0
two      12.0
three    21.0
dtype: float64

## For built in functions like mean, sum, etc. we could just directly call them, but if you needed to make a custom function, then you'd need apply

In [148]:
# Built-in aggregates can be called directly, per column (axis=0) or per row (axis=1)
display(data1.mean(axis=0), data1.mean(axis=1))

# A custom calculation still needs apply: here max squared minus min squared for each column
display(data1.apply(lambda col: col.max() ** 2 - col.min() ** 2))


a    3.0
b    4.0
c    5.0
dtype: float64

one      1.0
two      4.0
three    7.0
dtype: float64

a    36.0
b    48.0
c    60.0
dtype: float64

## You can sort by the row index in ascending or descending order

In [149]:
# The row labels are strings, so they sort alphabetically: one, three, two
display(data1.sort_index(), data1.sort_index(ascending=False))

Unnamed: 0,a,b,c
one,0.0,1.0,2.0
three,6.0,7.0,8.0
two,3.0,4.0,5.0


Unnamed: 0,a,b,c
two,3.0,4.0,5.0
three,6.0,7.0,8.0
one,0.0,1.0,2.0


## You can sort by a column value or multiple columns

In [152]:
data1 = DataFrame(np.arange(9.0).reshape(3, 3), index=['one', 'two', 'three'], columns=list('abc'))
# Give rows 'one' and 'three' the same value in 'b' so that the second sort key matters
data1.loc['three', ['b', 'c']] = 1

#display(data1.sort_values(by='b', ascending=True))
#display(data1.sort_values(by=['b', 'c'], ascending=True))
# Sort by 'b' ascending, breaking ties with 'c' descending
display(data1.sort_values(by=['b', 'c'], ascending=[True, False]))


Unnamed: 0,a,b,c
one,0.0,1.0,2.0
three,6.0,1.0,1.0
two,3.0,4.0,5.0


In [153]:
data2 = DataFrame(dict(b=[1, 4, 3, 2], a=[6, 9, 20, 3], c=[7, 2, 8, 15]))
# rank() ranks the values within each column; axis=1 ranks them within each row
display(data2.rank(), data2.rank(axis=1))
help(data2.rank)

Unnamed: 0,b,a,c
0,1.0,2.0,2.0
1,4.0,3.0,1.0
2,3.0,4.0,3.0
3,2.0,1.0,4.0


Unnamed: 0,b,a,c
0,1.0,2.0,3.0
1,2.0,3.0,1.0
2,1.0,3.0,2.0
3,1.0,2.0,3.0


Help on method rank in module pandas.core.generic:

rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False) method of pandas.core.frame.DataFrame instance
    Compute numerical data ranks (1 through n) along axis. Equal values are
    assigned a rank that is the average of the ranks of those values.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        index to direct ranking
    method : {'average', 'min', 'max', 'first', 'dense'}
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups
    numeric_only : boolean, default None
        Include only float, int, boolean data. Valid only for DataFrame or
        Panel objects
    na_option : {'keep', 'top', 'bottom'}
        * keep: leave NA values where they are
 

## You can read from a file and there are a ton of different parameters to experiment with to get it to read just right

In [166]:
import pandas as pd
#help(pd.read_csv)
# Variations to experiment with on sample.csv:
#display(pd.read_csv('sample.csv'))
#display(pd.read_csv('sample.csv', header = 'infer'))
#display(pd.read_csv('sample.csv', header = None))
#display(pd.read_csv('sample.csv', header = None, index_col = 0))
#display(pd.read_csv('sample.csv', header = None, index_col = 0, names = ['one', 'two', 'three', 'four']))

# index_col=0 uses the first column of the file as the row labels
sample = pd.read_csv('sample2.csv', index_col=0)
display(sample)


Unnamed: 0_level_0,amount,number,something,word
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,goodbye


In [168]:
import json

# Read inside a with-block so the file handle is closed even if parsing fails
with open('example.json') as json_file:
    data = json.load(json_file)
print(data)
# The 'customers' entry is a list of dictionaries, so each one becomes a DataFrame row
customers = DataFrame(data['customers'])
display(customers)

{'name': 'jayne', 'role': 'sales', 'customers': [{'name': 'Andersons', 'product': 'Bosch', 'quantity': 100}, {'name': 'ElectricalDirect', 'product': 'Miele', 'quantity': 200}]}


Unnamed: 0,name,product,quantity
0,Andersons,Bosch,100
1,ElectricalDirect,Miele,200


## You can also read from SQL tables, you just need the correct library for the version of SQL you want to connect to

In [170]:
import sqlite3
cn = sqlite3.connect('test.sqlite')
try:
    # One-off setup: uncomment to create and populate the table the first time
    # curs = cn.cursor()
    # curs.execute("create table names (id int, name varchar(20))")
    # curs.execute("insert into names values(1, 'Alice'), (2, 'Bob')")
    # cn.commit()
    # curs.execute("select * from names")
    # names = curs.fetchall()
    # print(names)

    # read_sql_query runs the statement and returns the result set as a DataFrame
    names2 = pd.read_sql_query("select * from names", cn)
    display(names2)
finally:
    # Close the connection even when the query fails (for example if the table is missing)
    cn.close()


Unnamed: 0,id,name
0,1,Alice
1,2,Bob


In [172]:
data = DataFrame({'a': [1, 3, 5], 'b': [np.nan, 4, np.nan]})
display(data)
# NaN values are skipped by the aggregations rather than turning the totals into NaN
print(data.sum(axis=0))
print(data.sum(axis=1))
display(data.describe())

Unnamed: 0,a,b
0,1,
1,3,4.0
2,5,


a    9.0
b    4.0
dtype: float64
0    1.0
1    7.0
2    5.0
dtype: float64


Unnamed: 0,a,b
count,3.0,1.0
mean,3.0,4.0
std,2.0,
min,1.0,4.0
25%,2.0,4.0
50%,3.0,4.0
75%,4.0,4.0
max,5.0,4.0


## Pandas_datareader is a downloadable package that can fetch stock information

In [174]:
# ! pip install pandas_datareader
import datetime

import pandas_datareader.data as web

# Work out the date 90 days before now; it is used as the start date of every download
tod = datetime.datetime.now()
d = datetime.timedelta(days=90)
a = tod - d
#print(a)

display(web.get_data_yahoo('AAPL', start=a))

# Build a dictionary with one price DataFrame per ticker, so it can be turned into a single DataFrame
all_data = dict(
    (ticker, web.get_data_yahoo(ticker, start=a))
    for ticker in ('AAPL', 'IBM', 'MSFT', 'GOOG')
)

print(all_data)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-16,259.079987,240.000000,241.949997,242.210007,80605900.0,241.556122
2020-03-17,257.609985,238.399994,247.509995,252.860001,81014000.0,252.177368
2020-03-18,250.000000,237.119995,239.770004,246.669998,75058400.0,246.004074
2020-03-19,252.839996,242.610001,247.389999,244.779999,67964300.0,244.119171
2020-03-20,251.830002,228.000000,247.179993,229.240005,100423300.0,228.621140
2020-03-23,228.500000,212.610001,228.080002,224.369995,84188200.0,223.764267
2020-03-24,247.690002,234.300003,236.360001,246.880005,71882800.0,246.213516
2020-03-25,258.250000,244.300003,250.750000,245.520004,75900500.0,244.857178
2020-03-26,258.679993,246.360001,246.520004,258.440002,63021800.0,257.742310
2020-03-27,255.869995,247.050003,252.750000,247.740005,51054200.0,247.071182


{'AAPL':                   High         Low        Open       Close       Volume  \
Date                                                                      
2020-03-16  259.079987  240.000000  241.949997  242.210007   80605900.0   
2020-03-17  257.609985  238.399994  247.509995  252.860001   81014000.0   
2020-03-18  250.000000  237.119995  239.770004  246.669998   75058400.0   
2020-03-19  252.839996  242.610001  247.389999  244.779999   67964300.0   
2020-03-20  251.830002  228.000000  247.179993  229.240005  100423300.0   
2020-03-23  228.500000  212.610001  228.080002  224.369995   84188200.0   
2020-03-24  247.690002  234.300003  236.360001  246.880005   71882800.0   
2020-03-25  258.250000  244.300003  250.750000  245.520004   75900500.0   
2020-03-26  258.679993  246.360001  246.520004  258.440002   63021800.0   
2020-03-27  255.869995  247.050003  252.750000  247.740005   51054200.0   
2020-03-30  255.520004  249.399994  250.740005  254.809998   41994100.0   
2020-03-31  262.

In [176]:
# One column per ticker, holding just that ticker's 'Adj Close' series
stock_close = DataFrame({ticker: prices['Adj Close'] for ticker, prices in all_data.items()})
display(stock_close.head(5))
# pct_change compares each row with the previous one, so the first row has no value
display(stock_close.pct_change().head(5))

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-16,241.556122,97.768806,135.043884,1084.329956
2020-03-17,252.177368,105.238625,146.162933,1119.800049
2020-03-18,246.004074,102.179649,140.010056,1096.800049
2020-03-19,244.119171,99.012123,142.31366,1115.290039
2020-03-20,228.62114,94.127632,136.968536,1072.319946


Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-16,,,,
2020-03-17,0.04397,0.076403,0.082337,0.032712
2020-03-18,-0.02448,-0.029067,-0.042096,-0.020539
2020-03-19,-0.007662,-0.031,0.016453,0.016858
2020-03-20,-0.063486,-0.049332,-0.037559,-0.038528


In [None]:
# Pairwise correlation and covariance between the ticker columns
display(stock_close.corr(), stock_close.cov())

## Handling missing data

In [178]:
from numpy import nan as NA
data = Series([1, NA, 2, 3, 4, NA])
display(data)
# dropna returns a new Series and leaves the original as it was...
data1 = data.dropna()
display(data1, data)
# ...unless inplace=True is passed, which changes the original itself
data.dropna(inplace=True)
display(data)

0    1.0
1    NaN
2    2.0
3    3.0
4    4.0
5    NaN
dtype: float64

0    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    3.0
4    4.0
5    NaN
dtype: float64

0    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

## Works the same for DataFrames, but with some additional options

In [181]:
# data = DataFrame ([[1,2,3],[NA,5,NA],[NA,NA,NA],[10,11,12]])
# display(data)

# display(data.dropna(how = 'all')) # works by row and only drops the row if all columns are NaN
# display(data.dropna(how = 'any')) # works by row and only drops the row if any columns are NaN

# Column 2 is entirely NaN, column 0 is partly NaN and column 1 is complete
data1 = DataFrame([[1, 2, NA], [NA, 5, NA], [NA, 12, NA], [10, 11, NA]])
display(data1)
display(data1.dropna(how='all', axis='columns')) # works by col and only drops the col if all rows are NaN
display(data1.dropna(how='any', axis='columns')) # works by col and only drops the col if any rows are NaN


Unnamed: 0,0,1,2
0,1.0,2,
1,,5,
2,,12,
3,10.0,11,


Unnamed: 0,0,1
0,1.0,2
1,,5
2,,12
3,10.0,11


Unnamed: 0,1
0,2
1,5
2,12
3,11


## Instead of removing missing data you may wish to replace it with another value

In [None]:
# Build the frame in this cell: the only earlier DataFrame assignment to `data` with
# missing values is commented out, so on a top-to-bottom run `data` would still be the
# Series that has already had its NaN values dropped
data = DataFrame([[1,2,3],[NA,5,NA],[NA,NA,NA],[10,11,12]])
display(data)
filled = data.fillna(0)               # one replacement value everywhere
display(filled)
filled = data.fillna({0:10, 1:20})    # a value per column label; column 2 keeps its NaN
display(filled)
filled = data.fillna(data.mean())     # each column's own mean
display(filled)



## Replacing the nulls with the row mean is a little trickier, but transposing flips the rows and columns so you can calculate the row means as if they were column means

In [None]:
data = DataFrame([[1, 2, 3], [4, 5, NA], [6, NA, NA], [10, NA, 12]])
display(data)
# Flip rows into columns, fill each (now column) gap with its column mean, then flip back
transposed = data.T
filled = transposed.fillna(transposed.mean()).T
#filled = data.fillna(data.mean(axis = 1))
display(filled)


## Homework: ## 
#### 1.	Read the file categories.csv. 
#### 2.	Use the first column as the index column for the DataFrame
#### 3.	Print the DataFrame, and the first two elements only of the DataFrame
#### 4. Load the products.json file
#### 5.	Display only the products in category 1
#### 6. Display the products from highest to lowest price
<br>
<details><summary>Click for <b>hint</b></summary>
<p>
Check the parameters for read_csv to make sure you read the column names right and use the proper column for the index column
<br>
Remember there are many functions to pull a slice from a DataFrame so experiment until you find the right one
<br>
The syntax to filter on a condition is odd, but it's in the slides
<br>
There are several functions to sort, so choose the right one to sort on price
<br>
<br>
</p>
</details>

