## Machine Learning 실습

### Regression With Python

In [1]:
# Autocompletion: turn off Jedi for the IPython completer
%config Completer.use_jedi = False

In [2]:
# 1. 패키지 호출
import numpy as np
import pandas as pd


In [3]:
# 2. Load the data set into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='./01SR_Data.csv')

In [4]:
# 3-1. Peek at the data: show the first three rows
df.head(n=3)

Unnamed: 0,Country,Age,Year,Salary
0,Spain,27.0,3.0,48000
1,Spain,,6.0,52000
2,Germany,30.0,2.0,54000


In [5]:
# 3-2. Inspect column dtypes and non-null counts
# String columns are reported with dtype `object`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  10 non-null     object 
 1   Age      9 non-null      float64
 2   Year     7 non-null      float64
 3   Salary   10 non-null     int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 448.0+ bytes


In [6]:
# 3-3. Summary statistics
# include='all' summarizes every column, not only the numeric ones.
df.describe(include = 'all')

Unnamed: 0,Country,Age,Year,Salary
count,10,9.0,7.0,10.0
unique,3,,,
top,France,,,
freq,4,,,
mean,,38.777778,9.142857,63500.0
std,,7.693793,6.817345,11597.413505
min,,27.0,2.0,48000.0
25%,,35.0,4.5,55000.0
50%,,38.0,7.0,61000.0
75%,,44.0,12.5,70750.0


In [7]:
# 4. Split the frame into features (every column but the last) and the
#    label (the last column, Salary).
# `.copy()` makes `feature` an independent frame: a later cell assigns into
# it with `.iloc`, and writing into a bare slice of `df` is the pattern that
# triggers pandas' SettingWithCopyWarning.
feature = df.iloc[:, :-1].copy()
label = df.iloc[:, -1]
feature.head()

Unnamed: 0,Country,Age,Year
0,Spain,27.0,3.0
1,Spain,,6.0
2,Germany,30.0,2.0
3,France,35.0,
4,Spain,38.0,


In [8]:
# 5. Fill missing values with the column mean (my own approach; kept
#    commented out in favour of the imputer used in the next cell).
# NOTE: `mean` has to be *called* -- passing the bound method `.mean`
#       would not fill with the numeric mean.
# feature.isnull().sum()
# feature['Age'].fillna(feature['Age'].mean(), inplace = True)
# feature['Age'].isnull().sum()
# feature['Year'].fillna(feature['Year'].mean(), inplace = True)
# feature['Year'].isnull().sum()
# feature.isnull().sum()

In [9]:
# 5-1. Fill missing values with the column mean (instructor's approach)
from sklearn.impute import SimpleImputer

## update, calculation
# Imputer that replaces each missing entry with the mean of its column.
mean_imputer = SimpleImputer(strategy='mean')

## Transform the data
# Select every row and every column after the first (column 0 is the text
# column Country). fit_transform learns the per-column means and fills the
# gaps in one call.
# NOTE(review): the means are learned from all rows before the train/test
# split done in a later cell, so test rows contribute to them -- confirm
# this is acceptable for the exercise.
feature.iloc[:, 1:] = mean_imputer.fit_transform(feature.iloc[:, 1:])
feature.isnull().sum()

Country    0
Age        0
Year       0
dtype: int64

In [10]:
# 5-2. Check the values after imputation
feature.describe()

Unnamed: 0,Age,Year
count,10.0,10.0
mean,38.777778,9.142857
std,7.253777,5.566339
min,27.0,2.0
25%,35.5,6.25
50%,38.388889,9.142857
75%,43.0,9.785714
max,50.0,21.0


In [11]:
# 6. One-hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0; remainder='passthrough' carries the columns that
# were not listed through unchanged.
ct = ColumnTransformer(
    transformers=[('one_hot_encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

# Fit the transformer and replace `feature` with the encoded array.
feature = ct.fit_transform(feature)
print(feature)

[[ 0.          0.          1.         27.          3.        ]
 [ 0.          0.          1.         38.77777778  6.        ]
 [ 0.          1.          0.         30.          2.        ]
 [ 1.          0.          0.         35.          9.14285714]
 [ 0.          0.          1.         38.          9.14285714]
 [ 0.          1.          0.         40.         10.        ]
 [ 1.          0.          0.         37.          7.        ]
 [ 1.          0.          0.         44.         15.        ]
 [ 1.          0.          0.         48.          9.14285714]
 [ 0.          1.          0.         50.         21.        ]]


In [12]:
# 7. Split the data into training and test sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature,
    label,
    test_size=0.2,
    random_state=180,
)
print(X_train, y_train, sep='\n')

[[ 0.          1.          0.         30.          2.        ]
 [ 1.          0.          0.         44.         15.        ]
 [ 0.          1.          0.         50.         21.        ]
 [ 1.          0.          0.         48.          9.14285714]
 [ 0.          1.          0.         40.         10.        ]
 [ 1.          0.          0.         35.          9.14285714]
 [ 1.          0.          0.         37.          7.        ]
 [ 0.          0.          1.         38.77777778  6.        ]]
2    54000
7    72000
9    83000
8    79000
5    61000
3    58000
6    67000
1    52000
Name: Salary, dtype: int64


In [13]:
# 8. Train a linear regression model

from sklearn.linear_model import LinearRegression

# The model fits y = w*x + b, where b is the bias (intercept) term.
# LinearRegression(fit_intercept=False) drops b; per the original note, use that only when the label should be 0 at the origin.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)


LinearRegression()

In [14]:
# 8-1. Train a second model: decision tree regressor

from sklearn.tree import DecisionTreeRegressor

# Pin random_state so the fit is reproducible: the tree permutes features at
# each split, so when several candidate splits are equally good the chosen
# one (and therefore the predictions) can differ between runs.
tree_model = DecisionTreeRegressor(random_state=180).fit(X_train, y_train)

In [15]:
# 9. Score
# Predict the test rows with the linear model, then print the predictions
# followed by the true labels for comparison.
y_pred = linear_model.predict(X_test)
print(y_pred, y_test, sep='\n')

[34855.43405324 50666.69395455]
0    48000
4    61000
Name: Salary, dtype: int64


In [16]:
# 10. Evaluate (linear model)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae_l = mean_absolute_error(y_test, y_pred)
# Root mean squared error. Taken as sqrt(MSE) explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_l = np.sqrt(mean_squared_error(y_test, y_pred))
# Legacy name: this variable has always held the RMSE, not the MSE.
mse_l = rmse_l
r2_l = r2_score(y_test, y_pred)

print(mae_l)
print(rmse_l)
print(r2_l)

11738.935996104708
11822.792135482256
-2.308364825534227


In [17]:
# Predict the test rows with the decision-tree model
y_pred_tree = tree_model.predict(X_test)

In [18]:
# 10. Evaluate (decision tree)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae_t = mean_absolute_error(y_test, y_pred_tree)
# Root mean squared error. Taken as sqrt(MSE) explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_t = np.sqrt(mean_squared_error(y_test, y_pred_tree))
# Legacy name: this variable has always held the RMSE, not the MSE.
mse_t = rmse_t
r2_t = r2_score(y_test, y_pred_tree)

print(mae_t)
print(rmse_t)
print(r2_t)

2000.0
2828.42712474619
0.8106508875739645
