# How to define a Pandas DataFrame whose rows and columns are labeled

In [5]:
import numpy
import pandas as pd

In [2]:
myarray = numpy.array([[10,30,20], [50,40,60],[1000,2000,3000]])   # 3x3 numpy array of integers

In [3]:
rownames = ['apples', 'oranges', 'beer']
colnames = ['January', 'February', 'March']

In [6]:
# initialize a dataframe mydf using pandas
mydf = pandas.DataFrame(myarray, index=rownames,columns=colnames)

In [7]:
print(mydf)  # describes the array

         January  February  March
apples        10        30     20
oranges       50        40     60
beer        1000      2000   3000


In [8]:
print(mydf.describe())   # describes the statistical parameters of data

           January     February        March
count     3.000000     3.000000     3.000000
mean    353.333333   690.000000  1026.666667
std     560.386771  1134.504297  1709.073823
min      10.000000    30.000000    20.000000
25%      30.000000    35.000000    40.000000
50%      50.000000    40.000000    60.000000
75%     525.000000  1020.000000  1530.000000
max    1000.000000  2000.000000  3000.000000


# Define a Pandas DataFrame whose rows and columns are numbers (but the column labels are characters).

In [10]:
import numpy as np

In [11]:
df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A','B','C','D'])

In [12]:
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A','B','C'])

In [13]:
df3 = df1 + df2

In [14]:
df3

Unnamed: 0,A,B,C,D
0,-0.592236,0.004988,-1.805667,
1,-0.566545,1.868322,-1.854218,
2,-2.337748,0.346702,-0.48707,
3,-3.556327,0.870339,-0.931521,
4,-1.704423,-0.086805,0.715384,
5,1.819839,-1.839404,0.514738,
6,-0.391817,0.25503,-1.343797,
7,,,,
8,,,,
9,,,,


# Series  align the Series index on the DataFrame columns; this results in a row-wise output.

In [15]:
names = pd.Series(['SF', 'San Jose', 'Sacramento'])
sizes = pd.Series([852469, 1015785, 485199])

In [16]:
df = pd.DataFrame({ 'Cities': names, 'Size': sizes})

In [17]:
df

Unnamed: 0,Cities,Size
0,SF,852469
1,San Jose,1015785
2,Sacramento,485199


In [19]:
df = pd.DataFrame({ 'City name': names,'sizes':sizes })
df

Unnamed: 0,City name,sizes
0,SF,852469
1,San Jose,1015785
2,Sacramento,485199


# To define a Pandas DataFrame whose rows and columns are Boolean values.

In [20]:
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1,1] }, dtype=bool)
df2 = pd.DataFrame({'a' : [0, 1, 1], 'b' : [1, 1,0] }, dtype=bool)

In [22]:
print("df1 & df2:")
print(df1 & df2)   # LOGICAL AND OPERATION

df1 & df2:
       a      b
0  False  False
1  False   True
2   True  False


In [23]:
print("df1 | df2:")
print(df1 | df2)    # LOGICAL OR OPERATION

df1 | df2:
      a     b
0  True  True
1  True  True
2  True  True


In [24]:
print("df1 ^ df2:")
print(df1 ^ df2)   # NEGATION OPERATION

df1 ^ df2:
       a      b
0   True   True
1   True  False
2  False   True


# To generate the transpose of a Pandas DataFrame, similar to a NumPy ndarray.

In [27]:
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1,1] }, dtype=int)
df1

Unnamed: 0,a,b
0,1,0
1,0,1
2,1,1


In [26]:
print("df1.T:")
print(df1.T)

df1.T:
   0  1  2
a  1  0  1
b  0  1  1


# SUM OF df1 and df2

In [29]:
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1,1] }, dtype=int)
df2 = pd.DataFrame({'a' : [3, 3, 3], 'b' : [5, 5,5] }, dtype=int)
df1,df2

(   a  b
 0  1  0
 1  0  1
 2  1  1,
    a  b
 0  3  5
 1  3  5
 2  3  5)

In [30]:
print("df1 + df2:")
print(df1 + df2)

df1 + df2:
   a  b
0  4  5
1  3  6
2  4  6


# To create a Pandas DataFrame with random numbers.

In [31]:
df = pd.DataFrame(np.random.randint(1, 5, size=(5,2)), columns=['a','b'])

In [34]:
df = pd.concat([df, df.agg(['sum', 'mean'])])
df

Unnamed: 0,a,b
0,2.0,1.0
1,2.0,2.0
2,4.0,1.0
3,3.0,2.0
4,1.0,3.0
sum,12.0,9.0
mean,2.4,1.8
sum,26.4,19.8
mean,3.771429,2.828571


# How to combine Pandas DataFrames.

In [35]:
df = pd.DataFrame({'foo1' : np.random.randn(5),'foo2' :np.random.randn(5)})

In [36]:
print("contents of df:")
print(df)

contents of df:
       foo1      foo2
0 -0.135170 -1.881154
1  0.071286 -0.075956
2 -1.179866 -0.714924
3  1.408913  0.643586
4 -0.507195 -0.399507


In [37]:
print("contents of foo1:")
print(df.foo1)

contents of foo1:
0   -0.135170
1    0.071286
2   -1.179866
3    1.408913
4   -0.507195
Name: foo1, dtype: float64


In [38]:
print("contents of foo2:")
print(df.foo2)

contents of foo2:
0   -1.881154
1   -0.075956
2   -0.714924
3    0.643586
4   -0.399507
Name: foo2, dtype: float64
