# Chapter 16: DataFrames

In [6]:
df = {
    'index':[0,1,2],
    'cols':[
        {'name':'growth',
         'data':[0.5,0.7,0.8]},
        {'name':'value',
         'data':[0.1,0.2,0.3]},
        {'name':'premium',
         'data':[2,3,4]}
    ]
}

In [9]:
df

{'index': [0, 1, 2],
 'cols': [{'name': 'growth', 'data': [0.5, 0.7, 0.8]},
  {'name': 'value', 'data': [0.1, 0.2, 0.3]},
  {'name': 'premium', 'data': [2, 3, 4]}]}

In [8]:
print(df)

{'index': [0, 1, 2], 'cols': [{'name': 'growth', 'data': [0.5, 0.7, 0.8]}, {'name': 'value', 'data': [0.1, 0.2, 0.3]}, {'name': 'premium', 'data': [2, 3, 4]}]}


In [4]:
import pandas as pd

In [7]:
df2 = pd.DataFrame(df)
print(df2)

   index                                         cols
0      0  {'name': 'growth', 'data': [0.5, 0.7, 0.8]}
1      1   {'name': 'value', 'data': [0.1, 0.2, 0.3]}
2      2       {'name': 'premium', 'data': [2, 3, 4]}


In [None]:
df = {
    'index':[0,1,2],
    'cols':[
        {'name':'growth',
        'data':[0.5, 0.7, 1.2]},
        {'name': 'Name',
        'data':['Paul', 'George', 'Ringo']}
    ]
}

In [12]:
df

{'index': [0, 1, 2],
 'cols': [{'name': 'growth', 'data': [0.5, 0.7, 0.8]},
  {'name': 'value', 'data': [0.1, 0.2, 0.3]},
  {'name': 'premium', 'data': [2, 3, 4]}]}

In [10]:
def get_row(df, idx):
    results = []
    value_idx = df['index'].index(idx)
    for col in df['cols']:
        results.append(col['data'][value_idx])
    return results

In [11]:
get_row(df,1)

[0.7, 0.2, 3]

## 16.3 DataFrames

In [13]:
import pandas as pd

In [60]:
df = pd.DataFrame({'growth':[0.5,0.7,0.8], 'value':[0.1,0.2,0.3], 'premium':[2,3,4]})

In [15]:
df

Unnamed: 0,growth,value,premium
0,0.5,0.1,2
1,0.7,0.2,3
2,0.8,0.3,4


In [16]:
df.iloc[1]

Unnamed: 0,1
growth,0.7
value,0.2
premium,3.0


In [17]:
df.iloc[0:2]

Unnamed: 0,growth,value,premium
0,0.5,0.1,2
1,0.7,0.2,3


In [20]:
df.loc[:2]

Unnamed: 0,growth,value,premium
0,0.5,0.1,2
1,0.7,0.2,3
2,0.8,0.3,4


In [21]:
df

Unnamed: 0,growth,value,premium
0,0.5,0.1,2
1,0.7,0.2,3
2,0.8,0.3,4


In [22]:
df.growth


Unnamed: 0,growth
0,0.5
1,0.7
2,0.8


In [23]:
df['growth']

Unnamed: 0,growth
0,0.5
1,0.7
2,0.8


In [24]:
type(df.growth)

In [61]:
df['name'] = ['Jiawen', 'Jiahua', 'Jiamei']

In [26]:
df

Unnamed: 0,growth,value,premium,name
0,0.5,0.1,2,Jiawen
1,0.7,0.2,3,Jiahua
2,0.8,0.3,4,Jiamei


In [27]:
df['name'].str.lower()

Unnamed: 0,name
0,jiawen
1,jiahua
2,jiamei


In [30]:
df.Name()

AttributeError: 'DataFrame' object has no attribute 'Name'

In [29]:
df.name

Unnamed: 0,name
0,Jiawen
1,Jiahua
2,Jiamei


In [None]:
df = pd.DataFrame({
    'growth': [0.5, 0.7, 1.2],
    'Name': ['Paul', 'George', 'Ringo']
})

In [None]:
df

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George
2,1.2,Ringo


In [None]:
df.iloc[2]

growth      1.2
Name      Ringo
Name: 2, dtype: object

In [None]:
df['Name']

0      Paul
1    George
2     Ringo
Name: Name, dtype: object

- Any operation that can be done to a series can be applied to a column

In [None]:
df['Name'].str.lower()

0      paul
1    george
2     ringo
Name: Name, dtype: object

In [31]:
from io import StringIO

In [34]:
csv_file = StringIO(""" growth, Name
.5, Paul
.7, George"""
)

In [35]:
pd.read_csv(csv_file)

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George


In [38]:
csv_file2 = StringIO(""" .5, Paul
.7, George"""
)

In [39]:
pd.read_csv(csv_file2)

Unnamed: 0,.5,Paul
0,0.7,George


## 16.4 Construction

Dataframes can be created from many types of inputs
- columns (dicts of lists)
- rows(list of dicts)
- CSV files (pd.read_csv)
- NumPy ndarrays
- other: SQL, HDF5, arrow etc

In [None]:
# creating a dataframe from rows
pd.DataFrame([
    {'growth':0.5, 'Name':'Paul'},
    {'growth':0.7, 'Name':'Geroge'},
    {'growth':1.2, 'Name':'Ringo'}
])

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,Geroge
2,1.2,Ringo


In [40]:
import numpy as np
np.random.seed(42)

In [42]:
pd.DataFrame(np.random.randn(30,2), columns=['a','b'])

Unnamed: 0,a,b
0,1.579213,0.767435
1,-0.469474,0.54256
2,-0.463418,-0.46573
3,0.241962,-1.91328
4,-1.724918,-0.562288
5,-1.012831,0.314247
6,-0.908024,-1.412304
7,1.465649,-0.225776
8,0.067528,-1.424748
9,-0.544383,0.110923


In [43]:
df.axes

[RangeIndex(start=0, stop=3, step=1),
 Index(['growth', 'value', 'premium', 'name'], dtype='object')]

In [44]:
df

Unnamed: 0,growth,value,premium,name
0,0.5,0.1,2,Jiawen
1,0.7,0.2,3,Jiahua
2,0.8,0.3,4,Jiamei


In [45]:
df.sum(axis=0)

Unnamed: 0,0
growth,2.0
value,0.6
premium,9
name,JiawenJiahuaJiamei


In [46]:
df.sum(axis=1)

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [47]:
df.sum(axis='index')

Unnamed: 0,0
growth,2.0
value,0.6
premium,9
name,JiawenJiahuaJiamei


In [50]:
df.axes[1]

Index(['growth', 'value', 'premium', 'name'], dtype='object')

In [72]:
df2 = pd.DataFrame ({'Score1': [None , None],
'Score2': [85, 90]})

In [73]:
df2.columns

Index(['Score1', 'Score2'], dtype='object')

In [74]:
df2

Unnamed: 0,Score1,Score2
0,,85
1,,90


In [67]:
df

Unnamed: 0,growth,value,premium,name
0,0.5,0.1,2,Jiawen
1,0.7,0.2,3,Jiahua
2,0.8,0.3,4,Jiamei


In [68]:
df['name']

Unnamed: 0,name
0,Jiawen
1,Jiahua
2,Jiamei


In [75]:
df2.Score1

Unnamed: 0,Score1
0,
1,


In [76]:
df2.sum(axis=0)

Unnamed: 0,0
Score1,0
Score2,175


In [77]:
df.sum(axis=0)

Unnamed: 0,0
growth,2.0
value,0.6
premium,9
name,JiawenJiahuaJiamei


In [78]:
df.apply(np.sum, axis=0)

Unnamed: 0,0
growth,2.0
value,0.6
premium,9
name,JiawenJiahuaJiamei


In [88]:
df

Unnamed: 0,growth,value,premium,name
0,0.5,0.1,2,Jiawen
1,0.7,0.2,3,Jiahua
2,0.8,0.3,4,Jiamei


In [86]:
df.drop('name', axis = 1).mean(axis=0)

Unnamed: 0,0
growth,0.666667
value,0.2
premium,3.0


In [87]:
df.drop('name', axis = 1).mean(axis=1)

Unnamed: 0,0
0,0.866667
1,1.3
2,1.7


In [90]:
df.drop(columns='name')

Unnamed: 0,growth,value,premium
0,0.5,0.1,2
1,0.7,0.2,3
2,0.8,0.3,4


- After parsing the CSV file, pandas makes a best-effort to give a type to each column
- Best effort means it will convert numerics to int64 if the columns is whole numbers and not missing values
- other numeric columns are converted to float64 if they have decimals or are missing values
- If there are non-numeric values, pandas will use the object type

## 16.5 DataFrame Axis

A Dataframe has two axis unlike series
- 0: index/row axis
- 1: columns axis

In [None]:
df

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George
2,1.2,Ringo


In [None]:
df.sum(axis=0)

growth                2.4
Name      PaulGeorgeRingo
dtype: object

In [None]:
df.sum(axis='index')

growth                2.4
Name      PaulGeorgeRingo
dtype: object

In [None]:
df.sum(axis='columns')

  df.sum(axis='columns')


0    0.5
1    0.7
2    1.2
dtype: float64