web leaning : https://realpython.com/pandas-dataframe/

# Introducing the Pandas Dataframe

In [64]:
import pandas as pd

In [65]:
# Make a dict incude data
data = {
    'name': ['Xavier', 'Ann', 'Jana', 'Yi', 'Robin', 'Amal', 'Nori'],
    'city': ['Mexico City', 'Toronto', 'Prague', 'Shanghai','Manchester', 'Cairo', 'Osaka'],
    'age': [41, 28, 33, 34, 38, 31, 37],
    'py-score': [88.0, 79.0, 81.0, 80.0, 68.0, 61.0, 84.0]
}
#Make row labels
row_labels = [101, 102, 103, 104, 105, 106, 107]

In [66]:
#Make a DataFrame
df=pd.DataFrame(data=data,index=row_labels)
df

Unnamed: 0,name,city,age,py-score
101,Xavier,Mexico City,41,88.0
102,Ann,Toronto,28,79.0
103,Jana,Prague,33,81.0
104,Yi,Shanghai,34,80.0
105,Robin,Manchester,38,68.0
106,Amal,Cairo,31,61.0
107,Nori,Osaka,37,84.0


In [67]:
#Can use .head() to show the first few items and .tail() to show the last few items, n = number of rows
df.head(n=2)
df.tail(n=2)

Unnamed: 0,name,city,age,py-score
106,Amal,Cairo,31,61.0
107,Nori,Osaka,37,84.0


In [68]:
# can access a column in a Pandas DataFrame
cities=df['city']
cities

101    Mexico City
102        Toronto
103         Prague
104       Shanghai
105     Manchester
106          Cairo
107          Osaka
Name: city, dtype: object

In [69]:
#Each column of a Pandas DataFrame is an instance of pandas.Series
#You can get a single item of a Series object the same way you would with a dictionary
cities[102]

'Toronto'

In [70]:
#You can also access a whole row with .loc[]:
df.loc[103]

name          Jana
city        Prague
age             33
py-score      81.0
Name: 103, dtype: object

# Creating a Pandas DataFrame

In [71]:
import numpy as np

Creating a Pandas DataFrame With Dictionaries

In [72]:
d={'x':[1,2,3],'y':np.array([2,5,8]),'z':100}
pd.DataFrame(data=d)

Unnamed: 0,x,y,z
0,1,2,100
1,2,5,100
2,3,8,100


In [73]:
#It’s possible to control the order of the columns with the columns parameter and the row labels with index:
pd.DataFrame(data=d,index=(100,200,300),columns=['z','x','y'])

Unnamed: 0,z,x,y
100,100,1,2
200,100,2,5
300,100,3,8


Creating a Pandas DataFrame With Lists

In [74]:
#use a list of dictionaries
l=[{'x':1,'y':2,'z':300},
   {'x':1,'y':4,'z':300},
   {'x':1,'y':2,'z':300},
  ]

In [75]:
pd.DataFrame(data=l)

Unnamed: 0,x,y,z
0,1,2,300
1,1,4,300
2,1,2,300


Creating a Pandas DataFrame With NumPy Arrays

In [76]:
#can pass a two-dimensional NumPy array to the DataFrame constructor

In [77]:
arr = np.array([[1, 2, 100],
                [2, 4, 100],
                [3, 8, 100]])

In [78]:
df_=pd.DataFrame(arr,columns=['x','y','x'])
df_

Unnamed: 0,x,y,x.1
0,1,2,100
1,2,4,100
2,3,8,100


In [79]:
#When copy is set to False (its default setting),If you modify the array, then your DataFrame will change too
arr[0, 0] = 1000
df_

Unnamed: 0,x,y,x.1
0,1000,2,100
1,2,4,100
2,3,8,100


# Retrieving Labels and Data

Pandas DataFrame Labels as Sequences

In [80]:
#can get the DataFrame’s row labels with .index and its column labels with .columns:
#labels of rows
df.index

Int64Index([101, 102, 103, 104, 105, 106, 107], dtype='int64')

In [81]:
#labels of columns
df.columns

Index(['name', 'city', 'age', 'py-score'], dtype='object')

In [82]:
#As you can with any other Python sequence, you can get a single item
df.index[1]
df.columns[1:3]

Index(['city', 'age'], dtype='object')

In [83]:
#can also modify the labels (if you try to modify a particular item of .index or .columns, then you’ll get a TypeError)
df.index=np.arange(10,17)
df

Unnamed: 0,name,city,age,py-score
10,Xavier,Mexico City,41,88.0
11,Ann,Toronto,28,79.0
12,Jana,Prague,33,81.0
13,Yi,Shanghai,34,80.0
14,Robin,Manchester,38,68.0
15,Amal,Cairo,31,61.0
16,Nori,Osaka,37,84.0


Data as NumPy Arrays

In [84]:
# extract data from a Pandas DataFrame without its labels
df.to_numpy()

array([['Xavier', 'Mexico City', 41, 88.0],
       ['Ann', 'Toronto', 28, 79.0],
       ['Jana', 'Prague', 33, 81.0],
       ['Yi', 'Shanghai', 34, 80.0],
       ['Robin', 'Manchester', 38, 68.0],
       ['Amal', 'Cairo', 31, 61.0],
       ['Nori', 'Osaka', 37, 84.0]], dtype=object)

Data Types

In [85]:
#can get the data types for each column of a Pandas DataFrame with .dtypes:
df.dtypes

name         object
city         object
age           int64
py-score    float64
dtype: object

In [86]:
#If you want to modify the data type of one or more columns, then you can use .astype():
df_=df.astype(dtype={'age':np.int32,'py-score':np.float32})
df_.dtypes

name         object
city         object
age           int32
py-score    float32
dtype: object

Pandas DataFrame Size

In [87]:
#The attributes .ndim, .size, and .shape

# Accessing and Modifying Data

Getting Data With Accessors

In [88]:
#the accessor .iloc[], which retrieves a row or column by its integer index
df.loc[10]

name             Xavier
city        Mexico City
age                  41
py-score           88.0
Name: 10, dtype: object

In [89]:
df.iloc[0]

name             Xavier
city        Mexico City
age                  41
py-score           88.0
Name: 10, dtype: object

In [90]:
#can use .loc and .iloc to access a column
df.loc[:,'city']

10    Mexico City
11        Toronto
12         Prague
13       Shanghai
14     Manchester
15          Cairo
16          Osaka
Name: city, dtype: object

In [91]:
df.iloc[:,1]

10    Mexico City
11        Toronto
12         Prague
13       Shanghai
14     Manchester
15          Cairo
16          Osaka
Name: city, dtype: object

In [92]:
#you can provide slices along with lists or arrays instead of indices to get multiple rows or columns:
#can use .loc and .iloc to access n columns and rows
df.loc[12:15,['city','name']]

Unnamed: 0,city,name
12,Prague,Jana
13,Shanghai,Yi
14,Manchester,Robin
15,Cairo,Amal


In [93]:
df.iloc[2:6,[0,1]]

Unnamed: 0,name,city
12,Jana,Prague
13,Yi,Shanghai
14,Robin,Manchester
15,Amal,Cairo


In [94]:
#You can skip rows and columns with .iloc[] using construct
df.iloc[1:6:2,0] # start:stop:step,index of column

11     Ann
13      Yi
15    Amal
Name: name, dtype: object

In [95]:
#you could also use the built-in Python class slice(), as well as numpy.s_[] or pd.IndexSlice[]
df.iloc[slice(1,6,2),0]

11     Ann
13      Yi
15    Amal
Name: name, dtype: object

In [96]:
#when you need only a single value. You use the specialized accessors .at[] and .iat[]
df.at[12,'name']

'Jana'

In [97]:
df.iat[2,0]

'Jana'

Setting Data With Accessors

In [98]:
df.loc[:,'py-score']

10    88.0
11    79.0
12    81.0
13    80.0
14    68.0
15    61.0
16    84.0
Name: py-score, dtype: float64

In [99]:
#You can use accessors to modify parts of a Pandas DataFrame 
df.loc[:13,'py-score']=[40,50,60,70]

In [100]:
df.loc[14:,'py-score']=0

In [101]:
df['py-score']

10    40.0
11    50.0
12    60.0
13    70.0
14     0.0
15     0.0
16     0.0
Name: py-score, dtype: float64

In [102]:
# You can do the same with .iloc[]
df.iloc[:,-1]=np.arange(80,87,1)

In [103]:
df['py-score']

10    80
11    81
12    82
13    83
14    84
15    85
16    86
Name: py-score, dtype: int32

# Inserting and Deleting Data

Inserting and Deleting Rows

In [104]:
#making a new Series object
john=pd.Series(data=['John','Boston',37,79],index=df.columns,name=17)

In [105]:
john

name          John
city        Boston
age             37
py-score        79
Name: 17, dtype: object

In [106]:
john.name

17

In [107]:
#You can add john as a new row to the end of df with .append():
df=df.append(john)

In [108]:
df

Unnamed: 0,name,city,age,py-score
10,Xavier,Mexico City,41,80
11,Ann,Toronto,28,81
12,Jana,Prague,33,82
13,Yi,Shanghai,34,83
14,Robin,Manchester,38,84
15,Amal,Cairo,31,85
16,Nori,Osaka,37,86
17,John,Boston,37,79


In [111]:
#you can delete it with a single call to .drop():
df=df.drop(labels=[17])

In [112]:
df

Unnamed: 0,name,city,age,py-score
10,Xavier,Mexico City,41,80
11,Ann,Toronto,28,81
12,Jana,Prague,33,82
13,Yi,Shanghai,34,83
14,Robin,Manchester,38,84
15,Amal,Cairo,31,85
16,Nori,Osaka,37,86


Inserting and Deleting Columns

In [114]:
# Insert a column
df['js-score']=np.array([71.0, 95.0, 88.0, 79.0, 91.0, 91.0, 80.0])
df

Unnamed: 0,name,city,age,py-score,js-score
10,Xavier,Mexico City,41,80,71.0
11,Ann,Toronto,28,81,95.0
12,Jana,Prague,33,82,88.0
13,Yi,Shanghai,34,83,79.0
14,Robin,Manchester,38,84,91.0
15,Amal,Cairo,31,85,91.0
16,Nori,Osaka,37,86,80.0


In [115]:
df['total-score']=0.0
df

Unnamed: 0,name,city,age,py-score,js-score,total-score
10,Xavier,Mexico City,41,80,71.0,0.0
11,Ann,Toronto,28,81,95.0,0.0
12,Jana,Prague,33,82,88.0,0.0
13,Yi,Shanghai,34,83,79.0,0.0
14,Robin,Manchester,38,84,91.0,0.0
15,Amal,Cairo,31,85,91.0,0.0
16,Nori,Osaka,37,86,80.0,0.0


In [116]:
# You can use .insert to allow you to specify the location of the new column
df.insert(loc=4,column='django-score'
         ,value=np.array([86.0, 81.0, 78.0, 88.0, 74.0, 70.0, 81.0]))
df

Unnamed: 0,name,city,age,py-score,django-score,js-score,total-score
10,Xavier,Mexico City,41,80,86.0,71.0,0.0
11,Ann,Toronto,28,81,81.0,95.0,0.0
12,Jana,Prague,33,82,78.0,88.0,0.0
13,Yi,Shanghai,34,83,88.0,79.0,0.0
14,Robin,Manchester,38,84,74.0,91.0,0.0
15,Amal,Cairo,31,85,70.0,91.0,0.0
16,Nori,Osaka,37,86,81.0,80.0,0.0


In [117]:
#delect column
del df['total-score']
#You can also remove one or more columns with .drop()
df = df.drop(labels='age', axis=1)
df

Unnamed: 0,name,city,py-score,django-score,js-score
10,Xavier,Mexico City,80,86.0,71.0
11,Ann,Toronto,81,81.0,95.0
12,Jana,Prague,82,78.0,88.0
13,Yi,Shanghai,83,88.0,79.0
14,Robin,Manchester,84,74.0,91.0
15,Amal,Cairo,85,70.0,91.0
16,Nori,Osaka,86,81.0,80.0
