In [115]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [108]:
# Logistic regression trained with batch gradient descent (GD).
class LogisticRegression:
    """Binary logistic-regression classifier fitted by batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the gradient on every iteration.
    threshold : float
        Training stops early once the mean cross-entropy cost drops below
        this value.
    max_iterations : int
        Upper bound on the number of gradient steps.
    fit_intercept : bool
        When True, a column of ones is prepended to the features so that the
        first weight acts as the intercept.
    verbose : bool
        When truthy, the current cost is printed every 100 iterations.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms so
    # that a saturated sigmoid (exactly 0.0 or 1.0) cannot produce inf/NaN.
    _EPS = 1e-15

    def __init__(self, learning_rate=0.01, threshold=0.01, max_iterations=100000, fit_intercept=True, verbose=False):
        self._learning_rate = learning_rate  # gradient step size
        self._max_iterations = max_iterations  # maximum number of iterations
        self._threshold = threshold  # early-stopping cost threshold
        self._fit_intercept = fit_intercept  # whether to add an intercept term
        self._verbose = verbose  # whether to print training progress

    def _check_fitted(self):
        """Raise AttributeError with a readable message if fit() has not run yet."""
        if not hasattr(self, '_W'):
            raise AttributeError(
                "This LogisticRegression instance is not fitted yet; call fit() first."
            )

    def get_coeff(self):
        """Return the learned weight vector (intercept first when it is enabled)."""
        self._check_fitted()
        return self._W

    def add_intercept(self, x_data):
        """Return ``x_data`` with a leading column of ones (the intercept feature)."""
        x_data = np.asarray(x_data)
        intercept = np.ones((x_data.shape[0], 1))
        return np.concatenate((intercept, x_data), axis=1)

    def sigmoid(self, z):
        """Logistic function: maps any real value into the open interval (0, 1)."""
        return 1 / (1 + np.exp(-z))

    def cost(self, h, y):
        """Mean binary cross-entropy between predictions ``h`` and labels ``y``.

        The cost function is what the optimizer minimizes. Maximizing the
        likelihood of the parameters is equivalent to minimizing the negative
        log-likelihood, which is exactly this expression.
        """
        # Clip so that log(0) never occurs: without it, 0 * log(0) evaluates to NaN.
        h = np.clip(np.asarray(h, dtype=float), self._EPS, 1 - self._EPS)
        y = np.asarray(y, dtype=float)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, x_data, y_data):
        """Learn the weights from ``x_data`` (n_samples, n_features) and ``y_data``.

        ``y_data`` may be 1-D or a single column; it is flattened to length
        n_samples. Returns ``False`` when training stops early because the cost
        fell below the threshold and ``None`` when the iteration budget is
        exhausted (kept as-is for backward compatibility).

        Raises
        ------
        ValueError
            If ``x_data`` is not 2-D, is empty, or its row count does not
            match the number of labels.
        """
        x_data = np.asarray(x_data, dtype=float)
        # Flatten so an (n, 1) target cannot broadcast against the (n,) predictions.
        y_data = np.asarray(y_data, dtype=float).ravel()

        if x_data.ndim != 2:
            raise ValueError("x_data must be 2-dimensional (n_samples, n_features)")
        num_examples = x_data.shape[0]
        if num_examples == 0:
            raise ValueError("x_data must contain at least one sample")
        if y_data.shape[0] != num_examples:
            raise ValueError("x_data and y_data must contain the same number of samples")

        if self._fit_intercept:
            x_data = self.add_intercept(x_data)

        self._W = np.zeros(x_data.shape[1])

        for i in range(self._max_iterations):
            # Linear score: dot product of the features with the weights.
            z = np.dot(x_data, self._W)
            # Squash the score into (0, 1) to obtain the predicted probability (y_hat).
            hypothesis = self.sigmoid(z)

            # Difference between prediction and truth.
            diff = hypothesis - y_data
            cost = self.cost(hypothesis, y_data)

            # Differentiating the negative log-likelihood with respect to the
            # weights simplifies to X^T (h - y) / n, which is the gradient below.
            gradient = np.dot(x_data.transpose(), diff) / num_examples
            self._W -= self._learning_rate * gradient

            if cost < self._threshold:
                return False

            if self._verbose and i % 100 == 0:
                print('cost :', cost)

    def predict_prob(self, x_data):
        """Return the predicted probability of the positive class for each row."""
        self._check_fitted()
        x_data = np.asarray(x_data, dtype=float)
        if self._fit_intercept:
            x_data = self.add_intercept(x_data)

        return self.sigmoid(np.dot(x_data, self._W))

    def predict(self, x_data):
        """Return hard class labels (0.0 or 1.0) for each row.

        Rounding converts the probability in (0, 1) into one of the two class
        labels, i.e. it applies a 0.5 decision threshold.
        """
        return self.predict_prob(x_data).round()

In [64]:
# Shell escape: list the files in the notebook's working directory.
!ls

[31mSocial_Network_Ads.csv[m[m     [31mlogistic regression.ipynb[m[m
[31madvertising.csv[m[m            [31mlogistic_tutorial_01.ipynb[m[m
[1m[36mcache[m[m


In [42]:
# Load the dataset from the CSV in the working directory and preview its last rows.
data_raw = pd.read_csv('Social_Network_Ads.csv')
data_raw.tail()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0
399,15594041,Female,49,36000,1


In [43]:
# (rows, columns) of the raw table.
data_raw.shape

(400, 5)

In [44]:
# Column dtypes and non-null counts of the raw table.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.7+ KB


### preprocessing

```
1. scaling
2. train, test
3. validation curve
```

In [87]:
from sklearn.preprocessing import MinMaxScaler

# Scaler instance; it is fitted later, when the feature table is transformed.
scaler = MinMaxScaler()

In [82]:
# Encode Gender as two 0/1 indicator columns using vectorized comparisons
# rather than a per-row Python lambda.
# NOTE(review): if Gender only ever takes these two values, the columns are
# perfectly complementary and one of them is redundant next to an intercept.
data_raw['Male'] = data_raw['Gender'].eq('Male').astype('int64')
data_raw['Female'] = data_raw['Gender'].eq('Female').astype('int64')

In [83]:
# Preview the last rows to check the Male/Female indicator columns.
data_raw.tail()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased,Male,Female
395,15691863,Female,46,41000,1,0,1
396,15706071,Male,51,23000,1,1,0
397,15654296,Female,50,20000,1,0,1
398,15755018,Male,36,33000,0,1,0
399,15594041,Female,49,36000,1,0,1


In [84]:
# Drop the identifier and the raw Gender text, then separate the target
# column (Purchased) from the remaining feature columns.
data_revise = data_raw.drop(columns=['User ID', 'Gender'])
y_data = data_revise['Purchased']
X_data = data_revise.drop(columns='Purchased')
data_revise.tail()

Unnamed: 0,Age,EstimatedSalary,Purchased,Male,Female
395,46,41000,1,0,1
396,51,23000,1,1,0
397,50,20000,1,0,1
398,36,33000,0,1,0
399,49,36000,1,0,1


In [116]:
# Scale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima leak into the training features. Fitting on
# the training split only would avoid that.

# Cast to float first: the feature columns are int64, and integer input makes
# some scikit-learn versions emit a dtype-conversion warning.
X_scaled = scaler.fit_transform(X_data.astype(float))
# Keep the original index so the rows stay aligned with y_data.
X_data = pd.DataFrame(data=X_scaled, columns=X_data.columns, index=X_data.index)
X_data.tail()

  return self.partial_fit(X, y)


Unnamed: 0,Age,EstimatedSalary,Male,Female
395,0.666667,0.192593,0.0,1.0
396,0.785714,0.059259,1.0,0.0
397,0.761905,0.037037,0.0,1.0
398,0.428571,0.133333,1.0,0.0
399,0.738095,0.155556,0.0,1.0


In [117]:
# Sanity-check the shapes and container types of the features and the target.
X_data.shape, y_data.shape, type(X_data), type(y_data)

((400, 4), (400,), pandas.core.frame.DataFrame, pandas.core.series.Series)

In [118]:
# Split into train and test sets (33% held out; fixed random_state for reproducibility).

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.33, random_state=17)

In [121]:
# Fit the gradient-descent model on the training split and report the
# fraction of test rows whose predicted label matches the true label.
LR_1 = LogisticRegression(max_iterations=10000, verbose=False)
LR_1.fit(X_train, y_train)
y_pred = LR_1.predict(X_test)
(y_pred == y_test).mean()

0.8409090909090909