# Introduction

Now we have learned Numpy as a package to deal with multi-dimensional data. If we are going to deal with tabular data, we will need Pandas, in particular, the DataFrame.

Let's get started!

In [1]:
import numpy as np
import pandas as pd

# Creation

## 99% of time you are going to create a dataframe from a database (a csv, excel, or a json file etc). We here create a dataframe for demonstration purpose.

In [2]:
df = pd.DataFrame({
    'Item':['Apple', "Orange", 'Banana', 'Watermelon'],
    'Quantity':[12, 10, 100, 3],
    'Price':[1, 2, 0.5, 9],
})

In [3]:
df

Unnamed: 0,Item,Quantity,Price
0,Apple,12,1.0
1,Orange,10,2.0
2,Banana,100,0.5
3,Watermelon,3,9.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Item      4 non-null      object 
 1   Quantity  4 non-null      int64  
 2   Price     4 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 228.0+ bytes


In [5]:
df.columns

Index(['Item', 'Quantity', 'Price'], dtype='object')

In [6]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
df.size

12

In [8]:
df.shape

(4, 3)

In [9]:
df.dtypes

Unnamed: 0,0
Item,object
Quantity,int64
Price,float64


# Accessing

In [10]:
df['Item']

Unnamed: 0,Item
0,Apple
1,Orange
2,Banana
3,Watermelon


In [11]:
df['Price']

Unnamed: 0,Price
0,1.0
1,2.0
2,0.5
3,9.0


In [12]:
df['Quantity']

Unnamed: 0,Quantity
0,12
1,10
2,100
3,3


In [13]:
df['Item'][0]

'Apple'

In [14]:
df['Price'][3]

np.float64(9.0)

In [15]:
df.loc[0]

Unnamed: 0,0
Item,Apple
Quantity,12
Price,1.0


In [16]:
df.loc[2]

Unnamed: 0,2
Item,Banana
Quantity,100
Price,0.5


#Selection

In [17]:
df['Price'] > 5

Unnamed: 0,Price
0,False
1,False
2,False
3,True


In [18]:
df[df['Price'] > 5]

Unnamed: 0,Item,Quantity,Price
3,Watermelon,3,9.0


In [19]:
df['Quantity'] < 20

Unnamed: 0,Quantity
0,True
1,True
2,False
3,True


In [20]:
df[df['Quantity'] < 20]

Unnamed: 0,Item,Quantity,Price
0,Apple,12,1.0
1,Orange,10,2.0
3,Watermelon,3,9.0


In [21]:
result = df['Price'] < 5
result

Unnamed: 0,Price
0,True
1,True
2,True
3,False


In [22]:
df[result]

Unnamed: 0,Item,Quantity,Price
0,Apple,12,1.0
1,Orange,10,2.0
2,Banana,100,0.5



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



#Adding

In [23]:
df['Total'] = df['Price'] * df['Quantity']
df

Unnamed: 0,Item,Quantity,Price,Total
0,Apple,12,1.0,12.0
1,Orange,10,2.0,20.0
2,Banana,100,0.5,50.0
3,Watermelon,3,9.0,27.0


#Statistics

In [24]:
total = df['Total']
total

Unnamed: 0,Total
0,12.0
1,20.0
2,50.0
3,27.0


In [25]:
total.min(), total.max()

(12.0, 50.0)

In [26]:
total.mean()

np.float64(27.25)

In [27]:
total.std()

16.357974609753292

In [28]:
total.var()

267.5833333333333

In [29]:
total.median()

23.5

In [30]:
total.quantile(.25)

np.float64(18.0)

In [31]:
total.quantile([.25, .50, .75])

Unnamed: 0,Total
0.25,18.0
0.5,23.5
0.75,32.75


In [32]:
df.describe()

Unnamed: 0,Quantity,Price,Total
count,4.0,4.0,4.0
mean,31.25,3.125,27.25
std,45.995471,3.966001,16.357975
min,3.0,0.5,12.0
25%,8.25,0.875,18.0
50%,11.0,1.5,23.5
75%,34.0,3.75,32.75
max,100.0,9.0,50.0


# Load from a csv file

In [33]:
file = '/content/sample_data/california_housing_test.csv'
df = pd.read_csv(file)

In [34]:
df.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
5,-119.56,36.51,37.0,1018.0,213.0,663.0,204.0,1.6635,67000.0
6,-121.43,38.63,43.0,1009.0,225.0,604.0,218.0,1.6641,67000.0
7,-120.65,35.48,19.0,2310.0,471.0,1341.0,441.0,3.225,166900.0
8,-122.84,38.4,15.0,3080.0,617.0,1446.0,599.0,3.6696,194400.0
9,-118.02,34.08,31.0,2402.0,632.0,2830.0,603.0,2.3333,164200.0


In [35]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,-119.5892,35.63539,28.845333,2599.578667,529.950667,1402.798667,489.912,3.807272,205846.275
std,1.994936,2.12967,12.555396,2155.593332,415.654368,1030.543012,365.42271,1.854512,113119.68747
min,-124.18,32.56,1.0,6.0,2.0,5.0,2.0,0.4999,22500.0
25%,-121.81,33.93,18.0,1401.0,291.0,780.0,273.0,2.544,121200.0
50%,-118.485,34.27,29.0,2106.0,437.0,1155.0,409.5,3.48715,177650.0
75%,-118.02,37.69,37.0,3129.0,636.0,1742.75,597.25,4.656475,263975.0
max,-114.49,41.92,52.0,30450.0,5419.0,11935.0,4930.0,15.0001,500001.0


In [None]:
# play with this dataframe!

In [36]:
file = '/content/sample_data/mnist_train_small.csv'
df = pd.read_csv(file)

In [37]:
df

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19995,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19996,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19997,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
df.head(10)

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
df.columns

Index(['6', '0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8',
       ...
       '0.581', '0.582', '0.583', '0.584', '0.585', '0.586', '0.587', '0.588',
       '0.589', '0.590'],
      dtype='object', length=785)

In [40]:
df.index

RangeIndex(start=0, stop=19999, step=1)

In [41]:
df.describe()

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
count,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,...,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0
mean,4.470124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.215011,0.087704,0.036502,0.013651,0.032602,0.006,0.0,0.0,0.0,0.0
std,2.892807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.314821,3.921664,2.712527,0.950818,2.718102,0.600333,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,253.0,254.0,253.0,79.0,254.0,62.0,0.0,0.0,0.0,0.0
