In [1]:
import numpy as np, pandas as pd

In [2]:
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(data = iris.data, columns = iris.feature_names) #define by pd.DataFrame() columns = .feature_names/.columns

# DataFrame Operations

## DataFrame Skimming

In [5]:
df.shape

(150, 4)

In [7]:
df.index

RangeIndex(start=0, stop=150, step=1)

In [8]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [9]:
df.head(), df.tail()

(   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                5.1               3.5                1.4               0.2
 1                4.9               3.0                1.4               0.2
 2                4.7               3.2                1.3               0.2
 3                4.6               3.1                1.5               0.2
 4                5.0               3.6                1.4               0.2,
      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 145                6.7               3.0                5.2               2.3
 146                6.3               2.5                5.0               1.9
 147                6.5               3.0                5.2               2.0
 148                6.2               3.4                5.4               2.3
 149                5.9               3.0                5.1               1.8)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [11]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [15]:
df['petal length (cm)'].unique

<bound method Series.unique of 0      1.4
1      1.4
2      1.3
3      1.5
4      1.4
      ... 
145    5.2
146    5.0
147    5.2
148    5.4
149    5.1
Name: petal length (cm), Length: 150, dtype: float64>

In [18]:
df['petal length (cm)'].value_counts(ascending=True)

petal length (cm)
3.0     1
6.4     1
3.7     1
6.9     1
3.8     1
1.1     1
1.0     1
3.6     1
6.6     1
6.3     1
5.9     2
4.3     2
5.4     2
3.5     2
5.2     2
6.0     2
5.3     2
1.9     2
1.2     2
6.7     2
3.3     2
3.9     3
6.1     3
5.8     3
4.1     3
5.5     3
5.7     3
4.6     3
4.8     4
5.0     4
4.4     4
4.2     4
1.7     4
4.0     5
4.9     5
4.7     5
5.6     6
1.6     7
1.3     7
4.5     8
5.1     8
1.5    13
1.4    13
Name: count, dtype: int64

## Column Operations

### column naming 

In [28]:
# .columns change : by full list of column names
colnames = list(df.columns)
colnames[3] = 'fdzz'
df_named = df.copy()
df_named.columns = colnames
df_named

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),fdzz
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [30]:
# df.rename() method : dictionary structured renaming
df_named = df.copy()
df_named.rename(columns = {'sepal length (cm)':'sl', 'petal length (cm)':'pl'}, inplace = True)
df_named

Unnamed: 0,sl,sepal width (cm),pl,petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### df[((a column or list of columns))] : column selection by names

In [None]:
#(1 column)
df['sepal length (cm)']

In [None]:
#(2 columns)
df[['sepal length (cm)', 'sepal width (cm)']]

### df.iloc[:, ((indices))] : column selection by indices

In [None]:
df.iloc[:, 0]

In [None]:
#df.iloc[:, ['sepal length (cm)', 'sepal width (cm)']] : 사용불가
df.iloc[:, [0, 1]]

### df.apply(func) : create new columns by mutating existing ones

In [None]:
df['len'] = df['petal length (cm)'].apply(round)
df

### df.drop() to drop unusing columns

In [20]:
df2 = pd.read_csv('Data/PimaIndiansDiabetes2.csv')
df2

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,diabetes
0,6,148.0,72.0,35.0,,33.6,0.627,50,pos
1,1,85.0,66.0,29.0,,26.6,0.351,31,neg
2,8,183.0,64.0,,,23.3,0.672,32,pos
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,neg
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,pos
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,neg
764,2,122.0,70.0,27.0,,36.8,0.340,27,neg
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,neg
766,1,126.0,60.0,,,30.1,0.349,47,pos


In [21]:
df2.drop(['pregnant'], axis = 1)

Unnamed: 0,glucose,pressure,triceps,insulin,mass,pedigree,age,diabetes
0,148.0,72.0,35.0,,33.6,0.627,50,pos
1,85.0,66.0,29.0,,26.6,0.351,31,neg
2,183.0,64.0,,,23.3,0.672,32,pos
3,89.0,66.0,23.0,94.0,28.1,0.167,21,neg
4,137.0,40.0,35.0,168.0,43.1,2.288,33,pos
...,...,...,...,...,...,...,...,...
763,101.0,76.0,48.0,180.0,32.9,0.171,63,neg
764,122.0,70.0,27.0,,36.8,0.340,27,neg
765,121.0,72.0,23.0,112.0,26.2,0.245,30,neg
766,126.0,60.0,,,30.1,0.349,47,pos


## Row Operations

### df[df[column] == True] : filtering rows by conditions

In [None]:
df[df['sepal length (cm)'] == 0]

### df.iloc[,:] : filtering by indices

In [None]:
df.iloc[0:50, :]

### df.sort_values() : arrange by row rank

In [17]:
df.sort_values()

TypeError: DataFrame.sort_values() missing 1 required positional argument: 'by'

## Grouped Data analysis

## DataFrames into 1 DataFrame

In [None]:
dfm1 = pd.DataFrame({'k':[10, 11],'a':[1,2], 'b':[3,4]})
dfm2 = pd.DataFrame({'k':[10, 11], 'c':[5,6], 'd':[7,8]})

### Data pasting by concat()

In [None]:
#행결합 
dfm3 = pd.concat([dfa, dfb], axis = 0) #axis 불필요
dfm3

In [None]:
#열결합 
dfm3 = pd.concat([dfa, dfb], axis=1)
dfm3

### Data joining by merge()

In [None]:
dfm3 = pd.merge(dfa, dfb, 'inner', 'k')
dfm3

In [1]:
# 2 key columns

구조 : (df1, df2, methods, key column)

methods = 'inner', 'left', 'right', 'outer'