In [0]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import cross_val_score, KFold
from sklearn import preprocessing as prep
from sklearn import utils

In [0]:
# Wine data from the UCI repository. header=None makes pandas treat every row
# as data and number the columns positionally; names are assigned in a later cell.
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
data = pd.read_csv(dataset_url,  header=None)

In [119]:
# Attribute list in its numbered "N) Name" layout. The class label is not part
# of this list, so 'Class' is prepended as the first column name.
feature_names_raw = """1) Alcohol
2) Malic acid
3) Ash
4) Alcalinity of ash
5) Magnesium
6) Total phenols
7) Flavanoids
8) Nonflavanoid phenols
9) Proanthocyanins
10)Color intensity
11)Hue
12)OD280/OD315 of diluted wines
13)Proline"""
# Keep only the text after the ")" and drop the optional space that follows it.
feature_names = ['Class'] + [
    numbered.partition(")")[2].lstrip(" ")
    for numbered in feature_names_raw.splitlines()
]

feature_names

['Class',
 'Alcohol',
 'Malic acid',
 'Ash',
 'Alcalinity of ash',
 'Magnesium',
 'Total phenols',
 'Flavanoids',
 'Nonflavanoid phenols',
 'Proanthocyanins',
 'Color intensity',
 'Hue',
 'OD280/OD315 of diluted wines',
 'Proline']

In [120]:
# Attach the parsed names to the positional columns (mutates `data` in place).
data.columns = feature_names
data

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [0]:
def get_outliers_count(data, outliers_level = 2):
    """Count, per column, the values at or above ``outliers_level`` times the median.

    The threshold is a plain multiple of the column median, so it is only a
    meaningful "outlier" cut-off for columns whose median is positive.

    Args:
        data: DataFrame with numeric columns.
        outliers_level: Multiplier applied to each column's median to obtain
            the threshold.

    Returns:
        dict mapping each column name to the number of values ``>= threshold``.
        Each ``name count`` pair is also printed, one per line.
    """
    counts = {}
    for afeat in data.columns:
        threshold = data[afeat].median() * outliers_level
        # Vectorised comparison: NaN compares False here, so missing values are
        # not counted. The former row-wise `0 if x < t else 1` counted NaN as 1.
        counts[afeat] = int((data[afeat] >= threshold).sum())
        print(afeat, counts[afeat])
    return counts


In [122]:
# Per-column count of values at or above twice the column median.
get_outliers_count(data, 2)

Class 0
Alcohol 0
Malic acid 28
Ash 0
Alcalinity of ash 0
Magnesium 0
Total phenols 0
Flavanoids 1
Nonflavanoid phenols 0
Proanthocyanins 2
Color intensity 11
Hue 0
OD280/OD315 of diluted wines 0
Proline 7


In [123]:
# Separate the class label from the 13 feature columns.
y = data['Class']
X = data.drop(columns=['Class'])
X.shape, y.shape

((178, 13), (178,))

In [124]:
# Baseline 3-nearest-neighbours classifier fitted on the features as loaded
# (standardisation is applied only in a later cell).
from sklearn.neighbors import KNeighborsClassifier
clf  = KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [0]:
# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
clf_kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

# Reuse the splitter defined above instead of constructing an identical second one.
scores = cross_val_score(clf, X, y, cv=clf_kfold)

In [0]:
def get_cross_val_scores_knn(n_neighbors):
    """Cross-validate k-NN classifiers for every k from 1 to ``n_neighbors``.

    Reads the feature matrix ``X`` and the labels ``y`` from the notebook's
    global namespace at call time.

    Args:
        n_neighbors: Largest neighbour count to evaluate (inclusive).

    Returns:
        dict mapping each k to the per-fold score array returned by
        ``cross_val_score`` (5 shuffled folds, ``random_state=42``).
    """
    # One splitter for all k: with an integer seed every split() call yields
    # the same folds, so every candidate is scored on identical partitions.
    folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
    scores = {}
    for neighbors in range(1, n_neighbors + 1):
        # No explicit fit here: cross_val_score fits its own clone of the
        # estimator on each fold, so fitting on the full data first is wasted work.
        clf = KNeighborsClassifier(n_neighbors=neighbors)
        scores[neighbors] = cross_val_score(clf, X, y, cv=folds)
    return scores

In [0]:
# Score k = 1..50, then average the five fold scores for each k.
scores = get_cross_val_scores_knn(50)
scores_by_k = pd.DataFrame(scores).transpose()  # one row per k, one column per fold
df_scores = scores_by_k.mean(axis="columns")

In [128]:
# Best neighbour count and its mean cross-validation score.
best_k = df_scores.idxmax()
best_k, df_scores.loc[best_k]

(1, 0.7304761904761905)

In [129]:
# Inspect the feature matrix before it is standardised in the next cell.
X

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [0]:
from sklearn import preprocessing

# Keep the original features in X_old and rebind X to a standardised copy.
# Note: the scaling statistics are computed over all rows, i.e. before any
# cross-validation split is made.
X_old = X.copy()
X = pd.DataFrame(preprocessing.scale(X_old), columns=X_old.columns)


In [0]:
# Repeat the k = 1..50 sweep, now on the standardised features.
scores = get_cross_val_scores_knn(50)
scores_by_k = pd.DataFrame(scores).transpose()  # one row per k, one column per fold
df_scores = scores_by_k.mean(axis="columns")

In [132]:
# Best neighbour count and its mean cross-validation score after scaling.
best_k = df_scores.idxmax()
best_k, df_scores.loc[best_k]

(29, 0.9776190476190475)

In [0]:
# NOTE(review): `data` is rebound here from the wine DataFrame to the Boston
# dataset bundle, so earlier cells that use `data` cannot be re-run afterwards.
# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 — confirm
# the scikit-learn version this notebook is expected to run on.
data = datasets.load_boston()

In [134]:
# Print the dataset's bundled description text.
print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [135]:
# List the fields available on the loaded dataset bundle.
data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [0]:
# Wrap the raw arrays in DataFrames: named feature columns, and a one-column
# 'Target' frame for the response.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.DataFrame({'Target': data.target})

In [137]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [0]:
# Standardise every feature column (zero mean, unit variance).
X = pd.DataFrame(prep.scale(X), columns = data.feature_names)
# NOTE(review): the target is passed through LabelEncoder, which replaces each
# distinct value with its index among the sorted unique values. This lets the
# classifier-based helper further down accept it, but the MSE scores are then
# computed on those indices rather than on the original target values — confirm
# this is intended (a KNeighborsRegressor on the raw target is the usual setup).
# Also note this cell cannot be re-run as is: `y` is rebound to an ndarray,
# which has no `.Target` attribute.
label_encoder = prep.LabelEncoder()
y = label_encoder.fit_transform(y.Target)

In [139]:
# Check how scikit-learn classifies the encoded target.
print(utils.multiclass.type_of_target(y))

multiclass


In [140]:
# Sanity-check the scaling: means should be ~0 and standard deviations ~1.
X.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,-8.787437000000001e-17,-6.343191e-16,-2.682911e-15,4.701992e-16,2.490322e-15,-1.14523e-14,-1.407855e-15,9.210902e-16,5.441409e-16,-8.868619e-16,-9.205636e-15,8.163101e-15,-3.370163e-16
std,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099
min,-0.4197819,-0.4877224,-1.557842,-0.2725986,-1.465882,-3.880249,-2.335437,-1.267069,-0.9828429,-1.31399,-2.707379,-3.907193,-1.531127
25%,-0.4109696,-0.4877224,-0.8676906,-0.2725986,-0.9130288,-0.5686303,-0.837448,-0.8056878,-0.6379618,-0.767576,-0.4880391,0.2050715,-0.79942
50%,-0.3906665,-0.4877224,-0.2110985,-0.2725986,-0.1442174,-0.1084655,0.3173816,-0.2793234,-0.5230014,-0.4646726,0.274859,0.3811865,-0.1812536
75%,0.00739656,0.04877224,1.015999,-0.2725986,0.598679,0.4827678,0.9067981,0.6623709,1.661245,1.530926,0.8065758,0.433651,0.6030188
max,9.933931,3.804234,2.422565,3.668398,2.732346,3.555044,1.117494,3.960518,1.661245,1.798194,1.638828,0.4410519,3.548771


In [141]:
# Inspect the label-encoded target array.
y

array([137, 113, 196, 193, 203, 165, 126, 156,  64,  87,  52,  87, 114,
       102,  80,  97, 128,  73, 100,  80,  40,  94,  54,  48,  57,  42,
        65,  50,  82, 108,  32,  48,  36,  35,  39,  87,  98, 108, 144,
       176, 197, 153, 149, 144, 110,  91,  98,  65,  47,  92,  95, 103,
       146, 131,  87, 200, 144, 181, 130,  94,  85,  59, 119, 146, 189,
       132,  92, 117,  72, 107, 139, 114, 125, 131, 138, 111,  98, 106,
       110, 101, 159, 136, 145, 126, 136, 153, 122, 119, 133, 165, 123,
       117, 126, 146, 104, 162, 111, 211, 219, 191, 157, 152,  84,  91,
        99,  93,  93, 102,  96,  92, 114, 125,  86,  85,  83,  81, 110,
        90, 102,  91, 117, 101, 103,  71,  86, 111,  58,  61,  78,  46,
        90,  94, 127,  82,  57,  79,  72,  69,  37,  76,  43,  47,  38,
        57,  25,  41,  57,  49,  76,  56, 112,  94,  55,  92,  68,  57,
        35, 213, 140, 130, 155, 228, 228, 228, 124, 146, 228, 135, 135,
       120,  72,  89, 128, 133, 123, 168, 129, 143, 171, 207, 21

In [142]:
def get_cross_val_scores_knn_parametric(parameter_name, parameter_values, **kwargs):
    """Cross-validate a k-NN classifier while sweeping one constructor parameter.

    Reads the feature matrix ``X`` and the target ``y`` from the notebook's
    global namespace at call time.

    NOTE(review): the estimator is a KNeighborsClassifier but the scoring is
    'neg_mean_squared_error', so the error is computed on predicted class
    labels — confirm this pairing is intended.

    Args:
        parameter_name: Name of the KNeighborsClassifier keyword to vary.
        parameter_values: Iterable of values to try for that keyword.
        **kwargs: Fixed keyword arguments passed to every classifier. A key
            equal to ``parameter_name`` is overridden by the swept value.

    Returns:
        dict mapping each tested value to the per-fold score array returned by
        ``cross_val_score`` (5 shuffled folds, ``random_state=42``).
    """
    print("testing parameters:", kwargs)
    # One splitter for the whole sweep: with an integer seed every split() call
    # yields the same folds, so all values are scored on identical partitions.
    folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
    scores = {}
    for test_value in parameter_values:
        params = {**kwargs, parameter_name: test_value}
        # No explicit fit here: cross_val_score fits its own clone of the
        # estimator on each fold, so fitting on the full data first is wasted work.
        clf = KNeighborsClassifier(**params)
        scores[test_value] = cross_val_score(clf, X, y, cv=folds, scoring='neg_mean_squared_error')
    return scores

# Sweep the Minkowski exponent `p` over 200 evenly spaced values in [1, 10],
# keeping the remaining k-NN settings fixed.
parameter_name = 'p'
parameter_values = np.linspace(start=1, stop=10, num=200)
model_parameters = dict(n_neighbors=5, weights='distance', metric='minkowski')
result = get_cross_val_scores_knn_parametric(
    parameter_name,
    parameter_values,
    **model_parameters,
)

testing parameters: {'n_neighbors': 5, 'weights': 'distance', 'metric': 'minkowski'}


In [0]:
# One row per tested parameter value, one column per fold.
df_scores = pd.DataFrame(data=result).transpose()

In [0]:
# Mean score per tested value. Derived from `result` rather than from
# `df_scores` itself so the cell can be re-run safely: the self-referential
# form raised on a second run, because a Series has no axis 1.
df_scores = pd.DataFrame(result).T.mean(axis=1)

In [145]:
# Scores are negated MSE, so the maximum marks the parameter value with the
# smallest mean squared error.
best_p = df_scores.idxmax()
best_p, df_scores.loc[best_p]

(1.0452261306532664, -746.4670743544942)

# Feature normalization

In [0]:
# Load the perceptron test and train splits; header=None treats every row as data.
# NOTE(review): both paths are hard-coded under /content/drive, which presumably
# requires a mounted Google Drive (Colab); no mount step is visible in this
# notebook — confirm before running elsewhere.
dataset_url = '/content/drive/My Drive/coursera/Week 2/perceptron-test.csv'
test = pd.read_csv(dataset_url,  header=None)
dataset_url = '/content/drive/My Drive/coursera/Week 2/perceptron-train.csv'
train = pd.read_csv(dataset_url, header=None)

In [147]:
# Features are every column except column 0, which the next cell uses as the label.
X_train, X_test = (split.drop(columns=[0]) for split in (train, test))
X_train.shape, X_test.shape

((300, 2), (200, 2))

In [148]:
# Column 0 of each split holds the target.
y_train, y_test = train[0], test[0]
y_train.shape, y_test.shape

((300,), (200,))

In [164]:
# Perceptron trained on the unscaled features. The seed is fixed for
# reproducibility; with tol=None there is no tolerance-based stopping, so
# training is bounded only by max_iter=5.
from sklearn.linear_model import Perceptron
clf = Perceptron(random_state=241,max_iter=5, tol=None)
clf.fit(X_train, y_train)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=5, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=241, shuffle=True, tol=None,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [165]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the perceptron trained on the unscaled features.
non_scaled_predictions = clf.predict(X_test)
non_scaled_accuracy = accuracy_score(y_test, non_scaled_predictions)
print(non_scaled_accuracy)

0.655


In [0]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [167]:
# Refit the same estimator object on the standardised features. It was created
# with warm_start=False (see the repr above), so fit() starts from scratch
# rather than continuing from the unscaled fit. After this cell `clf` is the
# scaled model, so the unscaled-accuracy cell must not be re-run out of order.
clf.fit(X_train_scaled, y_train)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=5, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=241, shuffle=True, tol=None,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [168]:
# Test-set accuracy of the perceptron trained on the standardised features.
scaled_predictions = clf.predict(X_test_scaled)
scaled_accuracy = accuracy_score(y_test, scaled_predictions)
print(scaled_accuracy)

0.84


In [169]:
# Accuracy gained by standardising the features, rounded to three decimals.
round(scaled_accuracy - non_scaled_accuracy, 3)

0.185