In [1]:
import pandas as pd
from io import StringIO # 문자열을 파일처럼 사용할 수 있도록 지원

In [2]:
# Small in-memory CSV used as the missing-value example: the second data row
# has an empty field for C and the third data row has an empty field for D.
csv_data = """A,B,C,D
    1.0,2.0,3.0,4.0
    5.0,6.0,,8.0
    9.0,10.0,11.0,
"""
# StringIO wraps the text in a file-like object that read_csv can consume.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,


In [12]:
# df.isnull()
# df.isna()
# df.notnull()
# df.notna()
# df.isnull().sum(axis=1) # number of NA values in each row (each NA counts as 1)
# df.isnull().sum(axis=0)
# df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       3 non-null      float64
 2   C       2 non-null      float64
 3   D       2 non-null      float64
dtypes: float64(4)
memory usage: 224.0 bytes


In [18]:
# Removing NaN
# df.dropna() # drop every row that contains a NaN
# df.dropna(axis=1) # drop every column that contains a NaN
# df.dropna(how="all") # drop rows in which all values are NaN (how: "any" or "all")
# df.dropna(thresh=4) # drop rows with fewer than 4 non-NaN values
# df.dropna(subset=['C']) # drop rows that have a NaN in column C

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,9.0,10.0,11.0,


In [19]:
from sklearn.impute import SimpleImputer
import numpy as np

In [20]:
# Replace every NaN with a constant placeholder value.
# Other available strategies: "mean", "median", "most_frequent".
simr = SimpleImputer(missing_values=np.nan,
                     strategy="constant",
                     fill_value=100)
# Fit and transform on the same ndarray. Fitting on the DataFrame and then
# transforming df.values makes scikit-learn (>= 1.0) warn that the input has
# no feature names although the imputer was fitted with them.
imputed_data = simr.fit_transform(df.values)
imputed_data

array([[  1.,   2.,   3.,   4.],
       [  5.,   6., 100.,   8.],
       [  9.,  10.,  11., 100.]])

In [24]:
# Small example frame: three string-valued columns plus a numeric price.
# Column names are supplied directly to the constructor.
df2 = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [26]:
# Build a label -> integer-code mapping from the distinct class labels.
# np.unique returns the labels sorted, so the codes follow that order.
# A hand-written equivalent would look like:
#   class_mapping = { 'class1' : 0, 'class2' : 1, 'class3' : 2 }
unique_labels = np.unique(df2['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))

# Overwrite the string labels in df2 with their integer codes.
df2['classlabel'] = df2['classlabel'].map(class_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0


In [28]:
# Invert the mapping (integer code -> original label) so the encoding
# applied to df2['classlabel'] can be undone.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
print(inv_class_mapping)

# Restore the original string labels in df2.
df2['classlabel'] = df2['classlabel'].map(inv_class_mapping)
df2

{0: 'class1', 1: 'class2'}


Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [30]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder learns the distinct labels and assigns each an integer code.
class_le = LabelEncoder()

# Encode the labels and write the codes back into df2.
# (Assign the result to a separate variable instead, e.g. `y`, to leave
# the frame untouched.)
raw_labels = df2['classlabel'].values
encoded_labels = class_le.fit_transform(raw_labels)
df2['classlabel'] = encoded_labels
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0


In [31]:
# Map the integer codes back to the labels the encoder was fitted on.
current_codes = df2['classlabel'].values
df2['classlabel'] = class_le.inverse_transform(current_codes)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [40]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Take the 'color' column as a 2-D array (one column).
X = df2[['color']].values
print(X)

# Integer-encode the colors; the codes overwrite the strings held in X.
color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes
print(X)

# One-hot encoding: create one column per distinct category value; in each
# row only the position of that row's category is 1, all others are 0.
X2 = df2[['color']].values
ohe = OneHotEncoder()
encoded_sparse = ohe.fit_transform(X2)
encoded = encoded_sparse.toarray()
print(encoded)

[['green']
 ['red']
 ['blue']]
[[1]
 [2]
 [0]]
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [41]:
# One-hot encode the string column 'color'; the numeric 'price' column is
# passed through unchanged.
# dtype=int keeps the dummy columns as 0/1 integers. Since pandas 2.0 the
# default dummy dtype is bool, which would display True/False instead.
pd.get_dummies(df2[['price', 'color']], dtype=int)

Unnamed: 0,price,color_blue,color_green,color_red
0,10.1,0,1,0
1,13.5,0,0,1
2,15.3,1,0,0


In [47]:
# Column names for the wine data; the file is read with header=None, i.e.
# its first line is treated as data, so the names are supplied here.
wine_columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                'Alcalinity of ash', 'Magnesium', 'Total phenols',
                'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']

df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=wine_columns,
)

print('Class labels', np.unique(df_wine['Class label']))
# Only the last expression of a cell is rendered, so a bare df_wine.head()
# placed before it would be evaluated and silently discarded. Put
# df_wine.head() in its own cell to preview the rows.
df_wine.shape

Class labels [1 2 3]


(178, 14)

In [45]:
from sklearn.model_selection import train_test_split

# Features are every column after the first; the first column is the target.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# Hold out 30% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((124, 13), (54, 13), (124,), (54,))