In [1]:
"""
Tabular data storage format (csv equivalent)
Multiple series that share the same index
"""
import pandas as pd
import numpy as np

In [10]:
from numpy.random import randn
np.random.seed(101)

In [13]:
r_mat2 = np.random.rand(5, 4)

In [14]:
print(r_mat2)

[[0.68530633 0.51786747 0.04848454 0.13786924]
 [0.18696743 0.9943179  0.5206654  0.57878954]
 [0.73481906 0.54196177 0.91315356 0.80792015]
 [0.40299783 0.35722434 0.95287671 0.34363158]
 [0.86509982 0.83027771 0.53816145 0.92246937]]


In [18]:
df = pd.DataFrame(data=r_mat2, 
                 index="A B C D E".split())

In [49]:
"""
Conditionals
"""

cond = df[0] > 0.5
cond2 = df[2] > 0.2
df[ (cond) & (cond2)]

df
df.reset_index()

Unnamed: 0,index,0,1,2,3
0,A,0.685306,0.517867,0.048485,0.137869
1,B,0.186967,0.994318,0.520665,0.57879
2,C,0.734819,0.541962,0.913154,0.80792
3,D,0.402998,0.357224,0.952877,0.343632
4,E,0.8651,0.830278,0.538161,0.922469


In [82]:
states = ['NY', 'PA', 'WY', 'OR', 'FL']
df['States'] = states
df.set_index('States')
print(df)
print("\n")
df.info()

          0         1         2         3 States
A  0.685306  0.517867  0.048485  0.137869     NY
B  0.186967  0.994318  0.520665  0.578790     PA
C  0.734819  0.541962  0.913154  0.807920     WY
D  0.402998  0.357224  0.952877  0.343632     OR
E  0.865100  0.830278  0.538161  0.922469     FL


<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       5 non-null      float64
 1   1       5 non-null      float64
 2   2       5 non-null      float64
 3   3       5 non-null      float64
 4   States  5 non-null      object 
dtypes: float64(4), object(1)
memory usage: 412.0+ bytes


In [61]:
df.describe()

Unnamed: 0,0,1,2,3
count,5.0,5.0,5.0,5.0
mean,0.575038,0.64833,0.594668,0.558136
std,0.274782,0.257823,0.36631,0.32334
min,0.186967,0.357224,0.048485,0.137869
25%,0.402998,0.517867,0.520665,0.343632
50%,0.685306,0.541962,0.538161,0.57879
75%,0.734819,0.830278,0.913154,0.80792
max,0.8651,0.994318,0.952877,0.922469


In [62]:
ser_w = df[0] > 0.2

In [64]:
ser_w.value_counts()

True     4
False    1
Name: 0, dtype: int64

A     True
B    False
C     True
D     True
E     True
Name: 0, dtype: bool

In [75]:
"""
Group by operations
"""
# we may watn to perform analyses based on values of a specific column.
# group by operations -> split, apply, combine.
# group by operation is like the SQL equivalent
categories = ['A', 'A', 'B', 'B', 'C', 'D']
data_values = [10, 5, 2, 4, 12, 6]
# a dataframe can contain a hashmap data value w/o including additional 
# index and parameter values like so: 
dd = {
    "Company": ["GOOG", "GOOG", "MSFT", "FB", "FB", "MSFT"], 
    "Person":  ["Sam", "Charlie", "Amy", "Vanessa", "Carl", "Sarah"], 
    "Sales": [200, 120, 300, 120, 440, 200]
}
data2 = pd.DataFrame(data=dd)
data = pd.DataFrame(data=data_values, 
                    index=categories)
data2

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,300
3,FB,Vanessa,120
4,FB,Carl,440
5,MSFT,Sarah,200


In [77]:
# group by values here. 
data2.groupby("Company")
data2.groupby("Company").mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,280
GOOG,160
MSFT,250


In [83]:
data2.groupby("Company").mean().T

Company,FB,GOOG,MSFT
Sales,280,160,250


In [102]:
"""
Pandas operations
"""
import pandas as pd
import base64
df = pd.DataFrame({
    'col1': [1, 2,3,4], 
    'col2': [45,23,12,53], 
    'col3': ['abc', 'sdwe', 'asdf', 'gser']
})
values = []
for i in range(len(df['col3'])): 
    b64_string = base64.b64encode(df['col3'][i].encode('utf-8'))
    values.append(b64_string)
df['col3'] = values
df.head()

Unnamed: 0,col1,col2,col3
0,1,45,b'YWJj'
1,2,23,b'c2R3ZQ=='
2,3,12,b'YXNkZg=='
3,4,53,b'Z3Nlcg=='


In [114]:
df['col1'].apply(times_two)
df['col4'] = df['col2'].apply(lambda x: x * 2)
df

Unnamed: 0,col1,col2,col3,col4
0,1,45,90,90
1,2,23,46,46
2,3,12,24,24
3,4,53,106,106


In [115]:
del df['col4']

In [116]:
df

Unnamed: 0,col1,col2,col3
0,1,45,90
1,2,23,46
2,3,12,24
3,4,53,106


In [119]:
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [120]:
"""
Data Inputs and Outputs
"""
import pandas as pd
import numpy as np

# pandas reads in tabular data formats


AttributeError: 'DataFrame' object has no attribute 'rows'