In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression, RANSACRegressor, LogisticRegression
from sklearn.metrics import mean_squared_error

# Linear and Logistic Regression
## Live Demos

## Linear Regression

In [3]:
boston_data = pd.read_fwf(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", header = None)

In [4]:
boston_data.shape

(506, 14)

In [5]:
boston_data.columns = ["crime_rate", "zoned_land", "industry", "bounds_river", "nox_conc", "rooms", "age", "distance",  "highways", "tax", "pt_ratio", "b_estimator", "pop_status", "price"]

In [6]:
boston_data.head()

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [7]:
boston_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crime_rate,506.0,1.71629,2.65351,0.00632,0.0819,0.250895,2.326718,9.96654
zoned_land,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
industry,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
bounds_river,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
nox_conc,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
rooms,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
age,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
distance,506.0,3.696228,1.999689,0.5857,2.0737,3.1073,5.112625,9.2229
highways,506.0,4.332016,1.417166,1.0,4.0,4.0,5.0,8.0
tax,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


In [8]:
attributes = boston_data.drop("price", axis = 1)
target = boston_data.price

In [9]:
boston_data.corr()

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price
crime_rate,1.0,-0.300774,0.590822,0.013922,0.634679,-0.190197,0.482013,-0.495148,-0.088451,0.793392,0.362615,-0.377013,0.481907,-0.362077
zoned_land,-0.300774,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.56666,-0.11929,-0.314563,-0.391679,0.17552,-0.412995,0.360445
industry,0.590822,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.678498,-0.087615,0.72076,0.383248,-0.356977,0.6038,-0.483725
bounds_river,0.013922,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.09095,0.079105,-0.035587,-0.121515,0.048788,-0.053929,0.17526
nox_conc,0.634679,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.748872,0.009217,0.668023,0.188933,-0.380051,0.590879,-0.427321
rooms,-0.190197,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.225052,0.088753,-0.292048,-0.355501,0.128069,-0.613808,0.69536
age,0.482013,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.713313,0.019658,0.506456,0.261515,-0.273534,0.602339,-0.376955
distance,-0.495148,0.56666,-0.678498,-0.09095,-0.748872,0.225052,-0.713313,1.0,0.00303,-0.541369,-0.26914,0.293621,-0.479158,0.264325
highways,-0.088451,-0.11929,-0.087615,0.079105,0.009217,0.088753,0.019658,0.00303,1.0,-0.049221,-0.116969,0.040705,-0.069828,0.113519
tax,0.793392,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.541369,-0.049221,1.0,0.460853,-0.441808,0.543993,-0.468536


In [10]:
model = LinearRegression()

In [11]:
model.fit(attributes, target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
model.coef_

array([ 2.09281375e-01,  1.49403979e-02,  1.27164577e-02,  3.00565375e+00,
       -1.55234852e+01,  4.29955958e+00,  2.84848139e-03, -1.08366345e+00,
        1.93258621e-01, -2.42034372e-03, -9.65535221e-01,  9.43510233e-03,
       -5.25242783e-01])

In [13]:
model.intercept_

28.30511075009888

In [14]:
model.score(attributes, target)

0.7198065414937174

In [15]:
model_no_intercept = LinearRegression(fit_intercept = False)

In [16]:
model_no_intercept.fit(attributes, target)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [17]:
model_no_intercept.coef_

array([ 2.32266653e-02,  2.77104570e-02,  1.27058681e-02,  3.01506936e+00,
       -3.68298191e+00,  5.82571154e+00, -2.57508598e-03, -7.26850898e-01,
        3.10921428e-01, -3.14922442e-03, -5.07184355e-01,  1.39593084e-02,
       -4.43092704e-01])

In [18]:
model_no_intercept.intercept_

0.0

In [19]:
model_no_intercept.score(attributes, target)

0.7036175611789843

In [20]:
test_data = boston_data.sample(20)

In [21]:
test_attributes = test_data.drop("price", axis = 1)

In [22]:
model.predict(test_attributes)

array([28.54030418, 18.13706705, 18.66705389, 20.76974663, 13.82919695,
       32.37653591, 22.17313202, 31.71111339, 18.75511091, 23.47600838,
       16.09778633, 23.6319635 , 14.22490657, 29.04535626, 15.6806582 ,
       20.88652496, 10.71588058, 23.54226476, 20.13129295, 42.63034174])

In [23]:
test_data["predicted_price"] = model.predict(test_attributes)

In [24]:
test_data

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price,predicted_price
305,0.05479,33.0,2.18,0,0.472,6.616,58.1,3.37,7,222.0,18.4,393.36,8.93,28.4,28.540304
407,1.9511,0.0,18.1,0,0.659,5.608,100.0,1.2852,4,666.0,20.2,332.09,12.13,27.9,18.137067
31,1.35472,0.0,8.14,0,0.538,6.072,100.0,4.175,4,307.0,21.0,376.73,13.04,14.5,18.667054
107,0.13117,0.0,8.56,0,0.52,6.127,85.2,2.1224,5,384.0,20.9,387.69,14.09,20.4,20.769747
405,7.9208,0.0,18.1,0,0.693,5.683,100.0,1.4254,4,666.0,20.2,384.97,22.98,5.0,13.829197
275,0.09604,40.0,6.41,0,0.447,6.854,42.8,4.2673,4,254.0,17.6,396.9,2.98,32.0,32.376536
205,0.13642,0.0,10.59,0,0.489,5.891,22.3,3.9454,4,277.0,18.6,396.9,10.87,22.6,22.173132
369,5.66998,0.0,18.1,1,0.631,6.683,96.8,1.3567,4,666.0,20.2,375.33,3.73,50.0,31.711113
470,4.34879,0.0,18.1,0,0.58,6.167,84.0,3.0334,4,666.0,20.2,396.9,16.29,19.9,18.755111
330,0.04544,0.0,3.24,0,0.46,6.144,32.2,5.8736,4,430.0,16.9,368.57,9.09,19.8,23.476008


In [25]:
np.sqrt(mean_squared_error(test_data.price, test_data.predicted_price))

5.824308110099783

## RANSAC

In [26]:
ransac = RANSACRegressor(LinearRegression(), min_samples = 50, max_trials = 100, residual_threshold = 5.0)

In [27]:
ransac.fit(attributes, target)

RANSACRegressor(base_estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                                n_jobs=None, normalize=False),
                is_data_valid=None, is_model_valid=None, loss='absolute_loss',
                max_skips=inf, max_trials=100, min_samples=50,
                random_state=None, residual_threshold=5.0, stop_n_inliers=inf,
                stop_probability=0.99, stop_score=inf)

In [28]:
ransac.estimator_.coef_

array([-2.18109825e-01, -5.74409285e-03,  3.79102863e-02,  1.62880980e+00,
       -5.52986664e+00,  7.93968301e+00, -2.50424247e-02, -5.88454814e-01,
        9.41802681e-02, -4.67142590e-03, -7.37652347e-01,  1.39543152e-02,
       -2.59429233e-01])

In [29]:
ransac.score(attributes, target)

0.6590145319733779

In [30]:
inlier_attributes = attributes[ransac.inlier_mask_]
inlier_target = target[ransac.inlier_mask_]

In [31]:
ransac.score(inlier_attributes, inlier_target)

0.9159124289754914

In [32]:
outlier_attributes = attributes[~ransac.inlier_mask_]
outlier_target = target[~ransac.inlier_mask_]

In [33]:
ransac.score(outlier_attributes, outlier_target)

0.18933764566698955

## Polynomial Regression

In [34]:
polynomial_transformer = PolynomialFeatures()

In [35]:
polynomial_transformer.fit(attributes)

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C')

In [36]:
second_degree_attributes = polynomial_transformer.transform(attributes)

In [37]:
second_degree_attributes.shape

(506, 105)

In [38]:
second_degree_model = LinearRegression()

In [39]:
second_degree_model.fit(second_degree_attributes, target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [40]:
second_degree_model.score(second_degree_attributes, target)

0.9006055895297789

## Logistic Regression

In [41]:
iris = load_iris()

In [42]:
iris_model = LogisticRegression(C = 1e9)

In [43]:
iris_model.fit(iris.data, iris.target)

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
iris_model.score(iris.data, iris.target)

0.9866666666666667

In [45]:
iris_poly = PolynomialFeatures(degree = 4).fit_transform(iris.data)

In [46]:
iris_poly.shape

(150, 70)

In [47]:
iris_model_poly = LogisticRegression()

In [48]:
iris_model_poly.fit(iris_poly.data, iris.target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
iris_model_poly.score(iris_poly, iris.target)

0.9866666666666667

In [50]:
iris_testing_data = iris.data[:10]

In [51]:
iris_model.predict(iris_testing_data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [52]:
iris.target[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [53]:
iris_model.predict_proba(iris_testing_data)

array([[1.00000000e+00, 2.09849026e-31, 3.24053933e-58],
       [1.00000000e+00, 1.23433269e-24, 8.81060983e-50],
       [1.00000000e+00, 6.42931955e-28, 7.69223239e-54],
       [1.00000000e+00, 8.92914017e-23, 1.75765533e-47],
       [1.00000000e+00, 3.66133853e-32, 3.70905957e-59],
       [1.00000000e+00, 4.56465373e-31, 1.52474263e-56],
       [1.00000000e+00, 1.43611153e-27, 9.23659749e-53],
       [1.00000000e+00, 4.98676049e-28, 4.93460074e-54],
       [1.00000000e+00, 2.52600099e-21, 1.20632234e-45],
       [1.00000000e+00, 1.41306652e-24, 2.13277204e-50]])

In [54]:
iris_model.coef_

array([[  7.35271466,  20.39778454, -30.26348739, -14.14337754],
       [ -2.44376492,  -6.85843959,  10.41704506,  -2.07138612],
       [ -4.90894974, -13.53934495,  19.84644233,  16.21476366]])

In [55]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [56]:
"Iris-setosa" -> LabelEncoder

SyntaxError: invalid syntax (<ipython-input-56-85b1b6542e04>, line 1)

## Quiz

In [57]:
wine_data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep = ";")

In [58]:

wine_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [66]:
anwine_model = LinearRegression()
attributes_wine = wine_data.drop("quality", axis = 1)
target_wine = wine_data.quality
model.fit(attributes_wine, target_wine)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [67]:
model.intercept_

21.965208449451673

In [68]:
model.score(attributes, target)

0.36055170303868833

In [69]:
polynomial_transformer.fit(attributes_wine)

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C')

In [70]:
wine_poly = polynomial_transformer.transform(attributes_wine)

In [71]:
wine_poly.shape

(1599, 78)

In [72]:
new_model = LinearRegression()

In [73]:
new_model.fit(wine_poly, target_wine)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [74]:
new_model.score(wine_poly, target_wine)

0.4346357868787144