# Exercise with real data

In [5]:
import pandas as pd
import numpy as np

## Before the real exercise

### Partitioning a dataset into separate training and test sets

In [3]:
# Load the UCI Wine dataset straight from the archive.
# header=None: the first row of the file is read as data, not as column names.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/'
    'machine-learning-databases/wine/wine.data',
    header=None,
)

In [4]:
# Attach readable names: the class label first, then the 13 measurements.
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [6]:
# Sorted distinct class labels present in the dataset.
np.unique(df_wine['Class label'].to_numpy())

array([1, 2, 3])

In [7]:
# Preview the first rows to check the column names were applied.
df_wine.head(n=5)

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [8]:
from sklearn.model_selection import train_test_split

In [11]:
# Feature matrix: every column except the first (the class label).
X = df_wine.iloc[:, 1:].to_numpy()

In [13]:
# Target vector: the first column holds the class label.
y = df_wine.iloc[:, 0].to_numpy()

In [15]:
# Hold out 30% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [16]:
# (n_samples, n_features) of the training split.
np.shape(X_train)

(124, 13)

In [17]:
# (n_samples, n_features) of the test split.
np.shape(X_test)

(54, 13)

### Bringing features onto the same scale

In [18]:
# Toy vector 0..5 used to demonstrate scaling by hand.
ex = np.arange(6)

In [22]:
# Min-max scaling computed by hand, without a library:
# (x - min) / (max - min), where np.ptp gives max - min.
(ex - ex.min()) / np.ptp(ex)

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

In [23]:
from sklearn.preprocessing import MinMaxScaler

In [24]:
# Small two-feature sample (one row per observation) for the scaler demos.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

In [25]:
# Min-max scaler with the default target range written out explicitly.
mms = MinMaxScaler(feature_range=(0, 1))

In [26]:
# Learn per-column min/max from `data`, then rescale the same data.
mms.fit(data).transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [27]:
from sklearn.preprocessing import StandardScaler

In [28]:
# Standardizer with the default centering and scaling written out explicitly.
stds = StandardScaler(with_mean=True, with_std=True)

In [29]:
# Learn per-column mean/std from `data`, then standardize the same data.
stds.fit(data).transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

# A complete example

### KNN Classifier

In [32]:
import pandas as pd

In [33]:
# Loan training set, read from the current working directory.
training = pd.read_csv(filepath_or_buffer='loan_training.csv')

In [34]:
# Loan test set, read from the current working directory.
test = pd.read_csv(filepath_or_buffer='loan_test.csv')

In [35]:
# Preview the raw training rows before any preprocessing.
training.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Target
0,LP001032,Male,No,0,Graduate,No,4950,0.0,125,360,1,Urban,Y
1,LP001824,Male,Yes,1,Graduate,No,2882,1843.0,123,480,1,Semiurban,Y
2,LP002928,Male,Yes,0,Graduate,No,3000,3416.0,56,180,1,Semiurban,Y
3,LP001814,Male,Yes,2,Graduate,No,9703,0.0,112,360,1,Urban,Y
4,LP002244,Male,Yes,0,Graduate,No,2333,2417.0,136,360,1,Urban,Y


In [36]:
#KNN without preprocessing
from sklearn.neighbors import KNeighborsClassifier

Categorical (non-numeric) columns cause an error when passed to KNN, so start with the numeric features only.

In [37]:
# Columns that are already numeric and can be fed to KNN without encoding.
numeric_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [38]:
# k-nearest-neighbours classifier; k=5 is the default, written out explicitly.
knn = KNeighborsClassifier(n_neighbors=5)

In [39]:
# Fitting == learning the model's parameters from the training data

In [40]:
# Baseline: fit on the numeric columns only, with no scaling or encoding.
knn.fit(X=training[numeric_features], y=training['Target'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [41]:
#evaluation

In [42]:
from sklearn.metrics import accuracy_score

In [43]:
# Accuracy of the numeric-only baseline on the held-out test file.
accuracy_score(
    y_true=test['Target'],  # ground-truth labels
    y_pred=knn.predict(test[numeric_features]),
)

0.6145833333333334

### Handling categorical values

In [44]:
# Ordinal encoding for the Dependents column: the string categories are mapped
# to increasing integers so their order is preserved.
dependents_mapping = {'0': 1, '1': 2, '2': 3, '3+': 4}

In [45]:
# Replace the string Dependents categories with their ordinal codes.
training = training.assign(
    Dependents=training['Dependents'].map(dependents_mapping)
)

In [46]:
# Apply the same ordinal mapping to the test set.
test = test.assign(
    Dependents=test['Dependents'].map(dependents_mapping)
)

In [47]:
# Check that Dependents now holds the mapped integer codes.
training.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Target
0,LP001032,Male,No,1,Graduate,No,4950,0.0,125,360,1,Urban,Y
1,LP001824,Male,Yes,2,Graduate,No,2882,1843.0,123,480,1,Semiurban,Y
2,LP002928,Male,Yes,1,Graduate,No,3000,3416.0,56,180,1,Semiurban,Y
3,LP001814,Male,Yes,3,Graduate,No,9703,0.0,112,360,1,Urban,Y
4,LP002244,Male,Yes,1,Graduate,No,2333,2417.0,136,360,1,Urban,Y


### Encoding class labels

In [49]:
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the Target labels into integer codes.
class_le = LabelEncoder()

In [50]:
# Learn the label-to-integer mapping from the training targets only.
class_le.fit(training['Target'].to_numpy())

LabelEncoder()

In [51]:
# Overwrite the training labels with their integer codes.
training['Target'] = class_le.transform(training['Target'].to_numpy())

In [52]:
# Encode the test labels with the encoder fitted on the training labels.
test['Target'] = class_le.transform(test['Target'].to_numpy())

### Performing one-hot encoding on nominal features

In [54]:
# Unordered categorical columns that will be one-hot encoded.
nominal_features = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]

In [55]:
# One-hot encode the nominal columns. drop_first=True drops the first level of
# each feature, leaving k-1 indicator columns per k-level feature.
training_dummies = pd.get_dummies(
    data=training[nominal_features],
    drop_first=True,
)

In [56]:
# Inspect the generated indicator columns.
training_dummies.head(n=5)

Unnamed: 0,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,1,0,0,0,0,1
1,1,1,0,0,1,0
2,1,1,0,0,1,0
3,1,1,0,0,0,1
4,1,1,0,0,0,1


In [57]:
# Append the indicator columns side by side with the existing columns.
training = pd.concat(objs=[training, training_dummies], axis='columns')

In [58]:
# The frame now holds both the original nominal columns and their dummies.
training.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Target,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP001032,Male,No,1,Graduate,No,4950,0.0,125,360,1,Urban,1,1,0,0,0,0,1
1,LP001824,Male,Yes,2,Graduate,No,2882,1843.0,123,480,1,Semiurban,1,1,1,0,0,1,0
2,LP002928,Male,Yes,1,Graduate,No,3000,3416.0,56,180,1,Semiurban,1,1,1,0,0,1,0
3,LP001814,Male,Yes,3,Graduate,No,9703,0.0,112,360,1,Urban,1,1,1,0,0,0,1
4,LP002244,Male,Yes,1,Graduate,No,2333,2417.0,136,360,1,Urban,1,1,1,0,0,0,1


In [59]:
# The original string columns are redundant once encoded; remove them.
training = training.drop(nominal_features, axis='columns')

In [60]:
# Check the frame after the nominal columns were dropped.
training.head(n=5)

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Target,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP001032,1,4950,0.0,125,360,1,1,1,0,0,0,0,1
1,LP001824,2,2882,1843.0,123,480,1,1,1,1,0,0,1,0
2,LP002928,1,3000,3416.0,56,180,1,1,1,1,0,0,1,0
3,LP001814,3,9703,0.0,112,360,1,1,1,1,0,0,0,1
4,LP002244,1,2333,2417.0,136,360,1,1,1,1,0,0,0,1


In [61]:
# One-hot encode the test set so that it ends up with exactly the indicator
# columns produced for the training set.
#
# Encoding the test set independently with drop_first=True is fragile: if a
# category is missing from the test file, a different level gets dropped, and
# if an unseen category appears, an extra column is created. Either way the
# scaler and model would receive features that do not match the training ones.
#
# Instead, build indicators for every level present in the test set, then keep
# only the columns found in `training_dummies`, in the same order. Levels
# absent from the test set become all-zero columns.
test_dummies = (
    pd.get_dummies(test[nominal_features])
    .reindex(columns=training_dummies.columns, fill_value=0)
)
test = pd.concat([test, test_dummies], axis=1)
test = test.drop(columns=nominal_features)

In [62]:
# Check that the test frame has the same layout as the training frame.
test.head(n=5)

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Target,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP002684,1,3400,0,95,360,1,0,0,0,1,0,0,0
1,LP001907,1,14583,0,436,360,1,1,1,1,0,0,1,0
2,LP001205,1,2500,3796,120,360,1,1,1,1,0,0,0,1
3,LP001275,2,3988,0,50,240,1,1,1,1,0,0,0,1
4,LP002455,3,3859,0,96,360,1,1,1,1,0,0,1,0


### Scaling

In [69]:
from sklearn.preprocessing import MinMaxScaler

In [70]:
# Feature frames: everything except the row identifier and the label.
x_training = training.drop(['Loan_ID', 'Target'], axis='columns')
x_test = test.drop(['Loan_ID', 'Target'], axis='columns')

In [71]:
# Encoded labels for each split.
y_training, y_test = training['Target'], test['Target']

In [72]:
# Scaler for the loan features; (0, 1) is the default range, written out explicitly.
min_max = MinMaxScaler(feature_range=(0, 1))

In [73]:
# Learn per-feature min and max from the training features only.
min_max.fit(X=x_training)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [76]:
# Rescale the training features with the fitted scaler.
training_minmax = min_max.transform(X=x_training)

In [80]:
# Rescale the test features with the scaler fitted on the training features.
# NOTE (translated from the original comment): an error occurs here when the
# test set's data has not been handled the same way as the training set's.
test_minmax = min_max.transform(x_test)

In [81]:
# For cross-validation all of the data is processed together; this case is dynamic, so the preprocessing fitted on the training set is reused on the test set.

In [82]:
# Refit the same classifier, now on the encoded and min-max-scaled features.
knn.fit(X=training_minmax, y=y_training)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [83]:
# Test accuracy after preprocessing, for comparison with the numeric-only baseline.
accuracy_score(y_true=y_test, y_pred=knn.predict(test_minmax))

0.6875