## Linear Regression

In [1]:
# House pricing dataset in sklearn 
from sklearn.datasets import load_boston


In [2]:
# import basic libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df=load_boston()
type(df)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

sklearn.utils.Bunch

In [4]:
# data is in the form of key value pair; Data, target & feature names are given
# We need to combine the data in a proper way in the form of a dataframe. 
df
## independent features
dataset=pd.DataFrame(df.data) # 12 features
dataset.columns=df.feature_names
dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
## Dependent features (Price) Create feature name as price and assign it with the target value
dataset['Price']=df.target

In [6]:
# price column adds with feature
dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [7]:
## Dividing the dataset into independent & dependent features
X=dataset.iloc[:,:-1] # independent features (all rows & all columns except last column)
y=dataset.iloc[:,-1]  # dependent features (allrows & last column)

In [8]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [9]:
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: Price, dtype: float64

In [10]:
## Algorithm 1: LINEAR REGRESSION
# import libraries for linear regression
from sklearn.linear_model import LinearRegression
# For cross validation import cross_val_score to perform train test split multiple times
from sklearn.model_selection import cross_val_score
# initialize lin_reg
lin_reg=LinearRegression()

# Next apply cross validation
'''In cross validation we divide the train & test data in such a way that every combination of train & test data is taken 
by the model & whosever accurac is better those are combined '''
# compute mean square error; We will get 5 MSE as 5 cross validation is performed.
mse=cross_val_score(lin_reg,X,y,scoring='neg_mean_squared_error',cv=5)
print(mse)
# take average of 5 mse
mean_mse=np.mean(mse)
print(mean_mse)

[-12.46030057 -26.04862111 -33.07413798 -80.76237112 -33.31360656]
-37.131807467699055


## Ridge & Lasso Regression
With respect to linear regression we can't modify much with the parameter; So to overcome overfitting & do feature selection we use rigde & lasso regression. 

# Rigde regression

In [11]:
# imoort libraies
from sklearn.linear_model import Ridge
# for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [25]:
# Now define the model
ridge=Ridge()
# B4 performing gridsearch define the parameters in the form of dictionaries
params={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
# In grid search all the combination of alpha values r taken & whereever the model performs well 
# it is going to take that specific parameter & tell that this is the best fit parameter which got selected. 
# next apply the gridsearch CV (model,parameters,scoring,cv)
ridge_regressor=GridSearchCV(ridge,params,scoring='neg_mean_squared_error',cv=10)

ridge_regressor.fit(X,y)
                        

GridSearchCV(cv=10, estimator=Ridge(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40, 45, 50, 55, 100]},
             scoring='neg_mean_squared_error')

In [26]:
# Now get the best parameters
print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)
# previously -37 was obtd, but due to ridge regression mse icreased to -29; Performance not good so go for lasso


{'alpha': 100}
-29.61522009733517


# Lasso Regression

In [23]:
# imoort libraies
from sklearn.linear_model import Lasso
# for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
# Now define the model
lasso=Lasso()
# B4 performing gridsearch define the parameters in the form of dictionaries
params={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
# In grid search all the combination of alpha values r taken & whereever the model performs well 
# it is going to take that specific parameter & tell that this is the best fit parameter which got selected. 
# next apply the gridsearch CV (model,parameters,scoring,cv)
lasso_regressor=GridSearchCV(lasso,params,scoring='neg_mean_squared_error',cv=10)

lasso_regressor.fit(X,y)
                        

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40, 45, 50, 55, 100]},
             scoring='neg_mean_squared_error')

In [24]:
# Now get the best parameters
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)

{'alpha': 1}
-35.531580220694856


For both lasso & ridge the mse is increasing; So perform a train test split & try

In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=42) # test=33%; Train=77%



In [38]:
## Algorithm 1: LINEAR REGRESSION
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
lin_reg=LinearRegression()
lin_reg.fit(X_train,y_train)
mse=cross_val_score(lin_reg,X_train,y_train,scoring='neg_mean_squared_error',cv=5)
print(mse)
mean_mse=np.mean(mse)
print(mean_mse)

[-33.75185215 -21.4641199  -27.97099777 -17.7140812  -25.03832267]
-25.187874739284958


In [39]:
y_pred=lin_reg.predict(X_test)
from sklearn.metrics import r2_score
r2_score1=r2_score(y_pred,y_test)
print(r2_score1)

0.6709558976744439


In [31]:
# Algorithm 2:Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge=Ridge()
params={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
ridge_regressor=GridSearchCV(ridge,params,scoring='neg_mean_squared_error',cv=10)
ridge_regressor.fit(X_train,y_train)

print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)
                        



{'alpha': 0.01}
-25.47206736336775


In [36]:
y_pred=ridge_regressor.predict(X_test)
from sklearn.metrics import r2_score
r2_score1=r2_score(y_pred,y_test)
print(r2_score1)

0.6708743257533072


In [33]:
# Algorithm 3: Lasso Regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
lasso=Lasso()
params={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
lasso_regressor=GridSearchCV(lasso,params,scoring='neg_mean_squared_error',cv=10)

lasso_regressor.fit(X_train,y_train)

print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)
                        



  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


{'alpha': 1e-08}
-25.473094572833244


In [34]:
y_pred=lasso_regressor.predict(X_test)
from sklearn.metrics import r2_score
r2_score1=r2_score(y_pred,y_test)
print(r2_score1)

# LOGISTIC REGRESSION

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
# take new dataset
from sklearn.datasets import load_breast_cancer

In [51]:
df=load_breast_cancer()
## Independent features
X=pd.DataFrame(df['data'],columns=df['feature_names'])

In [52]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [53]:
# dependent features
y=pd.DataFrame(df['target'],columns=['Target'])
y

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0
...,...
564,0
565,0
566,0
567,0


In [54]:
# check if y column is balanced or imbalanced
y['Target'].value_counts()

1    357
0    212
Name: Target, dtype: int64

In [55]:
## Train_test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=42) # test=33%; Train=77%

In [56]:
params=[{'C':[1,5,10]},{'max_iter':[100,150]}]

In [57]:
model1=LogisticRegression(C=100,max_iter=100) 

In [58]:
model=GridSearchCV(model1,param_grid=params,scoring='f1',cv=5)

In [59]:
model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation f

GridSearchCV(cv=5, estimator=LogisticRegression(C=100),
             param_grid=[{'C': [1, 5, 10]}, {'max_iter': [100, 150]}],
             scoring='f1')

In [60]:
model.best_params_

{'max_iter': 150}

In [61]:
model.best_score_

0.9558624887109136

In [62]:
y_pred=model.predict(X_test)

In [63]:
y_pred

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1])

In [68]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


In [69]:
confusion_matrix(y_pred,y_test)

array([[ 64,   3],
       [  3, 118]])

In [70]:
accuracy_score(y_pred,y_test)

0.9680851063829787

In [71]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96        67
           1       0.98      0.98      0.98       121

    accuracy                           0.97       188
   macro avg       0.97      0.97      0.97       188
weighted avg       0.97      0.97      0.97       188

