In [64]:
from sklearn.linear_model import LinearRegression , Ridge , LogisticRegression , Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import datasets
import pandas as pd
import numpy as np

In [11]:
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version before re-running this notebook.
boston = datasets.load_boston()

In [12]:
# List the fields exposed by the loaded dataset object.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [14]:
# Show the dataset's bundled description (attribute list and background).
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [16]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's
# feature names, then preview the first rows.
boston_data = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
)
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [17]:
# Derive an integer room count from the RM column.
# astype(int) truncates toward zero exactly like int(x) did, but performs the
# cast in a single vectorised step instead of one Python call per row.
boston_data['ROOM'] = boston_data['RM'].astype(int)
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,ROOM
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,6
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,6
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,7


In [19]:
# Remove the continuous RM column now that the integer ROOM column exists.
# `columns=` already names the axis, so no separate axis argument is passed.
# NOTE: this reassigns boston_data, so re-running the cell without re-running
# the earlier ones raises a KeyError (RM is already gone).
boston_data = boston_data.drop(columns=['RM'])
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,ROOM
0,0.00632,18.0,2.31,0.0,0.538,65.2,4.09,1.0,296.0,15.3,396.9,4.98,6
1,0.02731,0.0,7.07,0.0,0.469,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,6
2,0.02729,0.0,7.07,0.0,0.469,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,7
3,0.03237,0.0,2.18,0.0,0.458,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,6
4,0.06905,0.0,2.18,0.0,0.458,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,7


In [22]:
# One-hot encode the integer room count. drop_first=True omits the indicator
# for the lowest level, which then acts as the reference category.
# dtype=int is requested explicitly: pandas >= 2.0 returns boolean indicator
# columns by default, and mixing bool with float columns would make the later
# np.array(...) conversion produce an object array instead of a numeric one.
boston_data_dummy = pd.get_dummies(
    boston_data['ROOM'],
    drop_first=True,
    dtype=int,
)
boston_data_dummy.head()

Unnamed: 0,4,5,6,7,8
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,0,1,0
3,0,0,1,0,0
4,0,0,0,1,0


In [23]:
# Put the room-count indicator columns side by side with the other features.
boston_data1 = pd.concat(
    objs=[boston_data, boston_data_dummy],
    axis='columns',
)

In [24]:
# Preview the combined frame: original features plus the indicator columns.
boston_data1.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,ROOM,4,5,6,7,8
0,0.00632,18.0,2.31,0.0,0.538,65.2,4.09,1.0,296.0,15.3,396.9,4.98,6,0,0,1,0,0
1,0.02731,0.0,7.07,0.0,0.469,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,6,0,0,1,0,0
2,0.02729,0.0,7.07,0.0,0.469,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,7,0,0,0,1,0
3,0.03237,0.0,2.18,0.0,0.458,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,6,0,0,1,0,0
4,0.06905,0.0,2.18,0.0,0.458,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,7,0,0,0,1,0


In [25]:
# The indicator columns now encode the room count, so the integer ROOM column
# itself is removed.
boston_data1 = boston_data1.drop(columns=['ROOM'])

In [26]:
# Preview the final feature frame after removing ROOM.
boston_data1.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,4,5,6,7,8
0,0.00632,18.0,2.31,0.0,0.538,65.2,4.09,1.0,296.0,15.3,396.9,4.98,0,0,1,0,0
1,0.02731,0.0,7.07,0.0,0.469,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,0,0,1,0,0
2,0.02729,0.0,7.07,0.0,0.469,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,0,0,0,1,0
3,0.03237,0.0,2.18,0.0,0.458,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,0,0,1,0,0
4,0.06905,0.0,2.18,0.0,0.458,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,0,0,0,1,0


In [98]:
# Target vector: the dataset's target array as provided by the loader.
y = boston.target
# Feature matrix: the engineered frame converted to a plain NumPy array.
X = np.array(boston_data1)

In [99]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [100]:
# Ridge regression with a very small penalty, scored on the held-out split.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before re-running.
ridge_reg = Ridge(alpha=0.001, normalize=True).fit(X_train, y_train)
ridge_reg.score(X_test, y_test)

0.7651020039480987

In [101]:
# Fitted ridge coefficients, one per column of X.
ridge_reg.coef_

array([-1.32969704e-01,  1.77089579e-02,  9.43728625e-02,  2.95085290e+00,
       -1.64647985e+01, -6.64796163e-03, -1.02525630e+00,  2.40408295e-01,
       -9.77366898e-03, -7.27987468e-01,  8.66406784e-03, -6.31159911e-01,
        5.76303674e-02, -4.55492550e+00, -4.31169522e+00,  4.30091943e+00,
        1.02644832e+01])

In [102]:
# Unpenalised least-squares fit on the same training split, scored on the
# held-out rows.
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

0.7650591780134637

In [94]:
# Fitted coefficients of `reg`, one per column of X.
reg.coef_

array([-1.33458222e-01,  1.79289808e-02,  9.59748097e-02,  2.94660908e+00,
       -1.65381289e+01, -6.65820052e-03, -1.02962723e+00,  2.43638428e-01,
       -9.93350799e-03, -7.29748444e-01,  8.70752314e-03, -6.31831347e-01,
       -4.32692408e-01, -5.04789628e+00, -4.80838680e+00,  3.80033266e+00,
        9.76731673e+00])

In [95]:
# Toy frame with deliberately missing entries, written row by row so the
# literal mirrors the printed table. Every column contains at least one NaN.
data2 = pd.DataFrame(
    [
        [np.nan, 5, 10, 11],
        [0, 6, 7, 34],
        [3, 9, 4, 67],
        [6, 2, 9, 95],
        [9, 5, 8, np.nan],
        [5, np.nan, np.nan, np.nan],
    ],
    columns=['col1', 'col2', 'col3', 'col4'],
)
print(data2)

   col1  col2  col3  col4
0   NaN   5.0  10.0  11.0
1   0.0   6.0   7.0  34.0
2   3.0   9.0   4.0  67.0
3   6.0   2.0   9.0  95.0
4   9.0   5.0   8.0   NaN
5   5.0   NaN   NaN   NaN


In [53]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column. fit_transform learns the
# column means and applies them in one call. The result is a NumPy array, so
# `data2` no longer carries column labels after this cell.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
data2 = imp.fit_transform(data2)
print(data2)

[[ 4.6   5.   10.   11.  ]
 [ 0.    6.    7.   34.  ]
 [ 3.    9.    4.   67.  ]
 [ 6.    2.    9.   95.  ]
 [ 9.    5.    8.   51.75]
 [ 5.    5.4   7.6  51.75]]


# Imputing with a pipeline

In [68]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Two-stage pipeline: mean imputation followed by a Lasso model. The second
# step is labelled 'linear_regression' even though the estimator is Lasso.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the installed version before re-running.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
reg = Lasso(alpha=0.005, normalize=True)

steps = [
    ('imputation', imp),
    ('linear_regression', reg),
]

pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('imputation', SimpleImputer()),
                ('linear_regression', Lasso(alpha=0.005, normalize=True))])

In [62]:
# Score the fitted imputation + Lasso pipeline on the held-out split.
pipeline.score(X_test,y_test)

0.7649605698773048

# Scaling in scikit-learn

In [69]:
from sklearn.preprocessing import scale

In [108]:
# Standardise the raw feature matrix with sklearn.preprocessing.scale.
X_scaled = scale(boston.data)

In [109]:
# Overall mean and standard deviation (taken over every entry of the matrix),
# before and after scaling: raw mean, scaled mean, raw std, scaled std.
print(
    boston.data.mean(),
    X_scaled.mean(),
    boston.data.std(),
    X_scaled.std(),
)

70.07396704469443 -1.1147462804871136e-15 145.1555388220164 0.9999999999999994


# Scaling in a pipeline

In [112]:
from sklearn.preprocessing import StandardScaler

# Re-create the 75/25 split with the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Standardise the features, then fit ordinary least squares.
steps = [
    ('scale', StandardScaler()),
    ('reg', LinearRegression()),
]
pipeline = Pipeline(steps=steps)

# NOTE: despite the `knn_` prefix, this is a StandardScaler + LinearRegression
# pipeline; no nearest-neighbour model is involved in this cell.
knn_scaled = pipeline.fit(X_train, y_train)
knn_scaled.score(X_test, y_test)

0.7650591780134637

In [113]:
# Baseline without any scaling step, scored on the same held-out split.
# NOTE: despite the `knn_` prefix, this is a plain LinearRegression.
knn_unscaled = LinearRegression()
knn_unscaled.fit(X_train, y_train)
knn_unscaled.score(X_test, y_test)

0.7650591780134637

## Classification algorithm

In [122]:
# Load the breast-cancer dataset that ships with scikit-learn.
cancer = datasets.load_breast_cancer()

In [123]:
# List the fields exposed by the loaded dataset object.
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [124]:
# Class labels in index order (index 0 first, index 1 second).
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [125]:
# Rebind X and y to the breast-cancer features and labels. From here on they
# no longer refer to the Boston data.
X, y = cancer.data, cancer.target

In [140]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

# Hold out 20% of the samples for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

# Scale the features, then classify with k-nearest neighbours.
steps = [
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps=steps)

# Candidate neighbour counts 1..14; the 'knn__' prefix routes the parameter
# to the pipeline step named 'knn'.
parameter = {'knn__n_neighbors': np.arange(1, 15)}

# 5-fold cross-validated search over the grid, fitted on the training split.
cv = GridSearchCV(pipeline, param_grid=parameter, cv=5)
cv.fit(X_train, y_train)


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])})

In [141]:
# Score the fitted grid search on the held-out test split.
cv.score(X_test,y_test)

0.9824561403508771

In [143]:
# Predicted labels, plus the predicted probability of the class in column 1,
# both taken from the fitted grid search.
y_pred_prob = cv.predict_proba(X_test)[:, 1]
y_pred = cv.predict(X_test)

# Per-class precision/recall summary first, then the confusion matrix.
for summary in (
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
):
    print(summary)

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        39
           1       0.97      1.00      0.99        75

    accuracy                           0.98       114
   macro avg       0.99      0.97      0.98       114
weighted avg       0.98      0.98      0.98       114

[[37  2]
 [ 0 75]]


In [144]:
# Area under the ROC curve, computed from the predicted probabilities rather
# than the hard labels.
print(roc_auc_score(y_test,y_pred_prob))

0.9991452991452991
