In [1]:
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import fix_yahoo_finance as yf

# Presumably patches pandas_datareader so that web.get_data_yahoo is served by
# fix_yahoo_finance (the progress-bar output of the download cell is consistent with that).
# NOTE(review): fix_yahoo_finance appears to have been renamed `yfinance` upstream — confirm before upgrading.
yf.pdr_override()

# create DataFrame

In [2]:
# Column-oriented sample data shared by most examples: one list of six values per column.
#
# The keys are listed in alphabetical order on purpose. Older pandas sorted dict keys
# when building a DataFrame, while pandas >= 0.23 on Python >= 3.6 keeps insertion
# order. With this ordering both produce the columns float, int, string, which is what
# the recorded outputs, the positional `.iloc[2, [1, 2]]` example and the
# `"float":"string"` label slice all assume.
data = {
    'float': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
    'int': [0, 1, 2, 1, 2, 3],
    'string': ['a', 'b', 'c', 'd', 'e', 'f'],
}

In [3]:
# Build a DataFrame from the dict of lists. No index is passed, so the rows get
# the default integer labels 0..5 (see output).
df = pd.DataFrame(data)
df

Unnamed: 0,float,int,string
0,1.5,0,a
1,1.7,1,b
2,3.6,2,c
3,2.4,1,d
4,2.9,2,e
5,3.2,3,f


In [4]:
# Same sample data, but with explicit string row labels instead of the default integers.
dfWithIndex = pd.DataFrame(
    data=data,
    index=["one", "two", "three", "four", "five", "six"],
)
dfWithIndex

Unnamed: 0,float,int,string
one,1.5,0,a
two,1.7,1,b
three,3.6,2,c
four,2.4,1,d
five,2.9,2,e
six,3.2,3,f


In [5]:
# Dict of dicts: outer keys become columns, inner keys become row labels.
# A row label that is missing from a column shows up as NaN there (see output).
dfWithDict = pd.DataFrame({
    "float": {"one": 1.1, "two": 2.2, "three": 3.3},
    "int": {"two": 2, "three": 3, "four": 4},
    "string": {"three": "aaa", "four": "bbb", "five": "ccc"},
})
dfWithDict

Unnamed: 0,float,int,string
five,,,ccc
four,,4.0,bbb
one,1.1,,
three,3.3,3.0,aaa
two,2.2,2.0,


# check index is unique

In [6]:
# True when no row label occurs more than once in the index.
dfWithIndex.index.is_unique

True

# sort by index

In [7]:
# Rows reordered by their label (alphabetical here); the result is a new frame.
dfWithIndex.sort_index()

Unnamed: 0,float,int,string
five,2.9,2,e
four,2.4,1,d
one,1.5,0,a
six,3.2,3,f
three,3.6,2,c
two,1.7,1,b


In [8]:
# axis=1 sorts the column labels instead of the row labels; ascending=False
# gives them in reverse order (string, int, float in the output).
dfWithIndex.sort_index(axis=1, ascending=False)

Unnamed: 0,string,int,float
one,a,0,1.5
two,b,1,1.7
three,c,2,3.6
four,d,1,2.4
five,e,2,2.9
six,f,3,3.2


# sort by value

In [9]:
# Order rows by "int" first and break ties using "string".
dfWithIndex.sort_values(by=["int", "string"])

Unnamed: 0,float,int,string
one,1.5,0,a
two,1.7,1,b
four,2.4,1,d
three,3.6,2,c
five,2.9,2,e
six,3.2,3,f


# head

In [10]:
# Only the first rows (five of the six here) for a quick look at the frame.
df.head()

Unnamed: 0,float,int,string
0,1.5,0,a
1,1.7,1,b
2,3.6,2,c
3,2.4,1,d
4,2.9,2,e


# set columns order

In [11]:
# columns= fixes the column order explicitly (string, int, float in the output)
# instead of leaving it to pandas.
columnsOrderDf = pd.DataFrame(data, columns=['string', 'int', 'float'])
columnsOrderDf

Unnamed: 0,string,int,float
0,a,0,1.5
1,b,1,1.7
2,c,2,3.6
3,d,1,2.4
4,e,2,2.9
5,f,3,3.2


# get specific column

In [12]:
# Bracket access with a column name returns that column as a Series.
df["string"]

0    a
1    b
2    c
3    d
4    e
5    f
Name: string, dtype: object

In [13]:
# Attribute access returns the same Series as df["string"] (compare the two outputs).
df.string

0    a
1    b
2    c
3    d
4    e
5    f
Name: string, dtype: object

# get specific row

In [14]:
# .loc selects by label: row "three", columns "int" and "string".
dfWithIndex.loc["three", ["int", "string"]]

int       2
string    c
Name: three, dtype: object

In [15]:
# .iloc selects by position: third row, columns at positions 1 and 2.
# NOTE: those positions are "int" and "string" only for the float, int, string
# column order shown in the recorded outputs.
dfWithIndex.iloc[2, [1, 2]]

int       2
string    c
Name: three, dtype: object

# update value

In [16]:
# Separate frame built from the sample data, used by the assignment examples that follow.
updateValueDf = pd.DataFrame(data)

In [17]:
# Assigning a scalar sets every row of the column to that value (see output).
updateValueDf["string"] = "x"
updateValueDf

Unnamed: 0,float,int,string
0,1.5,0,x
1,1.7,1,x
2,3.6,2,x
3,2.4,1,x
4,2.9,2,x
5,3.2,3,x


In [18]:
# Assigning an array replaces the column row by row; np.arange(6.) supplies
# the floats 0.0 through 5.0.
updateValueDf["float"] = np.arange(6.)
updateValueDf

Unnamed: 0,float,int,string
0,0.0,0,x
1,1.0,1,x
2,2.0,2,x
3,3.0,1,x
4,4.0,2,x
5,5.0,3,x


In [19]:
# Assigning a Series matches on index labels rather than position: rows 1, 3 and 5
# receive values and every other row becomes NaN (see output).
series = pd.Series([100, 300, 500], index=[1, 3, 5])
updateValueDf["float"] = series
updateValueDf

Unnamed: 0,float,int,string
0,,0,x
1,100.0,1,x
2,,2,x
3,300.0,1,x
4,,2,x
5,500.0,3,x


# add new column

In [20]:
# Assigning to a column name that does not exist yet appends a new column.
# Here it is a boolean flag marking the rows whose "int" value exceeds 1.
addNewColumnDf = pd.DataFrame(data)
addNewColumnDf['new_column'] = addNewColumnDf['int'] > 1
addNewColumnDf

Unnamed: 0,float,int,string,new_column
0,1.5,0,a,False
1,1.7,1,b,False
2,3.6,2,c,True
3,2.4,1,d,False
4,2.9,2,e,True
5,3.2,3,f,True


# delete column

In [21]:
# `del` removes the column from the frame itself; the display below no longer has "int".
deleteColumnDf_0 = pd.DataFrame(data)
del deleteColumnDf_0["int"]
deleteColumnDf_0

Unnamed: 0,float,string
0,1.5,a
1,1.7,b
2,3.6,c
3,2.4,d
4,2.9,e
5,3.2,f


In [22]:
# drop(..., axis='columns') yields the frame without the "string" column.
# The result is only displayed here, not assigned back to deleteColumnDf_1.
deleteColumnDf_1 = pd.DataFrame(data)
deleteColumnDf_1.drop(["string"], axis='columns')

Unnamed: 0,float,int
0,1.5,0
1,1.7,1
2,3.6,2
3,2.4,1
4,2.9,2
5,3.2,3


In [23]:
# With inplace=True the frame itself is modified, so displaying it afterwards
# shows the "string" column gone.
deleteColumnDf_inplace = pd.DataFrame(data)
deleteColumnDf_inplace.drop(["string"], axis='columns', inplace=True)
deleteColumnDf_inplace

Unnamed: 0,float,int
0,1.5,0
1,1.7,1
2,3.6,2
3,2.4,1
4,2.9,2
5,3.2,3


# delete record

In [24]:
# Without an axis argument drop works on rows: the rows labelled 2 and 4 are
# left out of the result (see output).
deleteRecordDf = pd.DataFrame(data)
deleteRecordDf.drop([2,4])

Unnamed: 0,float,int,string
0,1.5,0,a
1,1.7,1,b
3,2.4,1,d
5,3.2,3,f


In [25]:
# Same row removal, but inplace=True modifies the frame itself, so the
# display afterwards no longer contains rows 2 and 4.
deleteRecordDf_inplace = pd.DataFrame(data)
deleteRecordDf_inplace.drop([2,4], inplace=True)
deleteRecordDf_inplace

Unnamed: 0,float,int,string
0,1.5,0,a
1,1.7,1,b
3,2.4,1,d
5,3.2,3,f


# transpose

In [26]:
# .T swaps rows and columns: the column names become row labels and the
# original row labels 0..5 become the columns.
transposedDf = pd.DataFrame(data)
transposedDf.T

Unnamed: 0,0,1,2,3,4,5
float,1.5,1.7,3.6,2.4,2.9,3.2
int,0,1,2,1,2,3
string,a,b,c,d,e,f


# select a sub-DataFrame

In [27]:
# Two column slices that cover different rows. The DataFrame constructor lines
# them up on the index and fills the rows missing from a slice with NaN (see output).
subDfByDict = {
    "subInt": df["int"][1:5],
    "subFloat": df["float"][:-3],
}
pd.DataFrame(subDfByDict)

Unnamed: 0,subFloat,subInt
0,1.5,
1,1.7,1.0
2,3.6,2.0
3,,1.0
4,,2.0


In [28]:
# Label slices with .loc include both endpoints: rows "two" through "five",
# columns "float" through "string".
# NOTE: the column slice relies on the float, int, string column order shown in the outputs.
dfWithIndex.loc["two":"five", "float":"string"]

Unnamed: 0,float,int,string
two,1.7,1,b
three,3.6,2,c
four,2.4,1,d
five,2.9,2,e


In [29]:
# Rows with an even index label, first three columns. The boolean row mask and the
# positional column slice go into one .iloc call instead of chaining two
# indexing operations; the selected rows and columns are unchanged.
df.iloc[df.index % 2 == 0, :3]

Unnamed: 0,float,int,string
0,1.5,0,a
2,3.6,2,c
4,2.9,2,e


# get value array

In [30]:
# The frame's contents as a NumPy array; with mixed column types the array
# comes back with dtype=object (see output).
df.values

array([[1.5, 0, 'a'],
       [1.7, 1, 'b'],
       [3.6, 2, 'c'],
       [2.4, 1, 'd'],
       [2.9, 2, 'e'],
       [3.2, 3, 'f']], dtype=object)

# get index

In [31]:
# The row labels as an Index object.
dfWithIndex.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

# get the index label of the max value (idxmax)

In [32]:
# idxmax reports, for each column, the row label that holds the largest value.
maxIndexDf = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
maxIndexDf.idxmax()

0    2
1    2
2    2
dtype: int64

# reindex

In [33]:
# A 3x3 frame holding 0..8, printed next to the same frame reindexed on columns:
# 'Ohio' is left out, and 'Utah', which the original lacks, comes back as NaN.
column_reindex = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
print(
    column_reindex,
    "-------------------",
    column_reindex.reindex(columns=['Texas', 'Utah', 'California']),
    sep="\n",
)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
-------------------
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8


In [34]:
# Reindexing on both axes at once: keep rows 1, 3, 5 and the listed columns;
# "not_exist" is not in df, so it is returned empty (NaN) in the output.
df.reindex(index=[1,3,5], columns=["float", "string", "not_exist"])

Unnamed: 0,float,string,not_exist
1,1.7,b,
3,2.4,d,
5,3.2,f,


# add

In [35]:
# `+` lines the operands up on their row labels. Labels found in only one
# frame ("a" and "d") have no partner and come out as NaN (see output).
addDf_0_0 = pd.DataFrame(data=[1, 2, 3], index=["a", "b", "c"])
addDf_0_1 = pd.DataFrame(data=[20, 30, 40], index=["b", "c", "d"])
addDf_0_0 + addDf_0_1

Unnamed: 0,0
a,
b,22.0
c,33.0
d,


In [36]:
# .add with fill_value=0 treats a label missing from one frame as 0, so "a"
# and "d" keep their own value instead of turning into NaN (see output).
addDf_1_0 = pd.DataFrame(data=[1, 2, 3], index=["a", "b", "c"])
addDf_1_1 = pd.DataFrame(data=[20, 30, 40], index=["b", "c", "d"])
addDf_1_0.add(addDf_1_1, fill_value=0)

Unnamed: 0,0
a,1.0
b,22.0
c,33.0
d,40.0


# sum

In [37]:
# 3x3 frame with one missing value (the None in the middle row) for the sum examples.
sumDf = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, None, 6],
        [7, 8, 9],
    ]
)
sumDf

Unnamed: 0,0,1,2
0,1,2.0,3
1,4,,6
2,7,8.0,9


In [38]:
# Column totals; the missing value in column 1 is skipped (2 + 8 = 10).
sumDf.sum()

0    12.0
1    10.0
2    18.0
dtype: float64

In [39]:
# With skipna=False a column containing a missing value sums to NaN (column 1 in the output).
sumDf.sum(skipna=False)

0    12.0
1     NaN
2    18.0
dtype: float64

# cumulative sum

In [40]:
# cumsum keeps a running total down each column: every row holds the sum of
# that column up to and including the row.
accumulationDf = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
accumulationDf.cumsum()

Unnamed: 0,0,1,2
0,1,2,3
1,5,7,9
2,12,15,18


# describe

In [41]:
# describe summarises each column: count, mean, std, min, quartiles and max.
describeDf = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
describeDf.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


# apply function

In [42]:
# Small 3x3 integer frame used by the apply examples that follow.
applyFunctionDf = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
applyFunctionDf

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [43]:
# By default apply passes each column to the function, so this yields the
# maximum of every column (7, 8, 9).
applyFunctionDf.apply(lambda x : x.max())

0    7
1    8
2    9
dtype: int64

In [44]:
# axis='columns' passes each row to the function instead, giving the
# maximum of every row (3, 6, 9).
applyFunctionDf.apply(lambda x : x.max(), axis='columns')

0    3
1    6
2    9
dtype: int64

In [45]:
# When the function returns a Series, the result is a DataFrame with one row
# per Series label ('min' and 'max') and one column per input column.
applyFunctionDf.apply(lambda x : pd.Series([x.min(), x.max()], index=['min', 'max']))

Unnamed: 0,0,1,2
min,1,2,3
max,7,8,9


# apply map

In [46]:
# Element-wise transformation: the function is called on every single cell.
applyMapDf = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Use the new name when it exists and fall back to applymap on older pandas,
# so the cell runs without a deprecation warning on either.
elementwiseMap = applyMapDf.map if hasattr(applyMapDf, "map") else applyMapDf.applymap
elementwiseMap(lambda x : x * 10)

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90


# rank

In [47]:
# Small 3x3 integer frame used by the rank examples that follow.
rankDf = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)

In [48]:
# Rank of each value within its column (1.0 = smallest).
rankDf.rank()

Unnamed: 0,0,1,2
0,1.0,1.0,1.0
1,2.0,2.0,2.0
2,3.0,3.0,3.0


In [49]:
# axis='columns' ranks each value within its row instead of within its column.
rankDf.rank(axis='columns')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,1.0,2.0,3.0
2,1.0,2.0,3.0


# download Yahoo Finance data

In [51]:
# Fetch the Yahoo history of each ticker, keyed by symbol.
# NOTE(review): no start/end date is passed, so the date range is whatever the
# downloader defaults to and a re-run will not reproduce the recorded numbers.
all_data = dict(
    (ticker, web.get_data_yahoo(ticker))
    for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']
)

[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded


In [52]:
# One 'Adj Close' column per ticker, combined into a single frame.
# The loop variable is called `frame` so it cannot be mistaken for the sample
# `data` dict defined near the top of the notebook.
adj_Close = pd.DataFrame(
    {ticker: frame['Adj Close'] for ticker, frame in all_data.items()}
)
adj_Close.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-05-21,187.630005,1079.579956,145.490005,97.599998
2018-05-22,187.160004,1069.72998,145.089996,97.5
2018-05-23,188.360001,1079.689941,144.669998,98.660004
2018-05-24,188.149994,1079.23999,144.070007,98.309998
2018-05-25,188.580002,1075.660034,143.639999,98.360001


In [53]:
# One 'Volume' column per ticker, combined into a single frame.
# The loop variable is called `frame` so it cannot be mistaken for the sample
# `data` dict defined near the top of the notebook.
volume = pd.DataFrame(
    {ticker: frame['Volume'] for ticker, frame in all_data.items()}
)
volume.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-05-21,18400800.0,1023200.0,2894700,19422500.0
2018-05-22,15240700.0,1090000.0,2284000,15441200.0
2018-05-23,19467900.0,1030000.0,3382000,21114800.0
2018-05-24,23234000.0,766800.0,3398700,26649300.0
2018-05-25,17156600.0,876500.0,4885800,17891700.0


In [55]:
# Row-over-row fractional change of each adjusted close, i.e. the return from
# one date in the index to the next.
returns = adj_Close.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-05-21,0.007085,0.012397,0.009786,0.012868
2018-05-22,-0.002505,-0.009124,-0.002749,-0.001025
2018-05-23,0.006412,0.009311,-0.002895,0.011897
2018-05-24,-0.001115,-0.000417,-0.004147,-0.003548
2018-05-25,0.002285,-0.003317,-0.002985,0.000509


# correlation

In [56]:
# Pairwise correlation between every pair of return columns (symmetric, 1.0 on the diagonal).
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.454516,0.357428,0.387439
GOOG,0.454516,1.0,0.391355,0.464935
IBM,0.357428,0.391355,1.0,0.436187
MSFT,0.387439,0.464935,0.436187,1.0


In [57]:
# Correlation of every column with one Series; the values match the IBM
# row of the full correlation matrix above.
returns.corrwith(returns['IBM'])

AAPL    0.357428
GOOG    0.391355
IBM     1.000000
MSFT    0.436187
dtype: float64

In [58]:
# Correlation between two individual Series; same number as the MSFT/IBM
# entry of the correlation matrix.
returns['MSFT'].corr(returns['IBM'])

0.43618655357153235

In [59]:
# Passing a DataFrame gives one value per ticker: each returns column is
# correlated with the volume column of the same name.
returns.corrwith(volume)

AAPL   -0.002304
GOOG    0.047857
IBM    -0.000812
MSFT   -0.003730
dtype: float64

# covariance

In [60]:
# Pairwise covariance between every pair of return columns.
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000865,0.000182,0.000177,0.00024
GOOG,0.000182,0.000369,0.0001,0.000146
IBM,0.000177,0.0001,0.000251,0.000163
MSFT,0.00024,0.000146,0.000163,0.000466


In [61]:
# Covariance between two individual Series; same number as the MSFT/IBM
# entry of the covariance matrix.
returns['MSFT'].cov(returns['IBM'])

0.00016314677673304027