<h1 style='text-align: center; color: blue;'>DS 203 Assignment 6: Linear and Logistic Regression</h1>
<h3 style='text-align: right; color: red;'>~ Shubham Lohiya, 18D100020</h3>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from IPython.display import display

seed = 123
np.random.seed(seed)

# Exercise 1: (Linear Regression)

In [2]:
df = pd.read_excel('Real estate valuation data set.xlsx')
df = df.drop(columns=['No']).iloc[1:]
df.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
1,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1
5,2012.666667,7.1,2175.03,3,24.96305,121.51254,32.1


In [3]:
def get_data(df, test_size=0.2, random_state=seed, verbose=True):
    def get_dataset_from_df(df):
        arr = df.to_numpy()
        X, y = arr[:, :-1], arr[:, -1]
        return X, y

    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    train_X, train_y = get_dataset_from_df(train_df)
    test_X, test_y = get_dataset_from_df(test_df)
    
    if verbose:
        print(f'Trainset: X - {train_X.shape}, Y - {train_y.shape}')
        print(f'Testset: X - {test_X.shape}, Y - {test_y.shape}')
    
    return train_X, train_y, test_X, test_y


def fit_reg_model(train_X, train_y, test_X=None, test_y=None, model_type='ridge', alpha=1.0, verbose=True):
    mse, r2 = None, None
    models = {'simple': linear_model.LinearRegression,
          'ridge': linear_model.Ridge,
          'lasso': linear_model.Lasso}
    if model_type.lower() not in models.keys():
        raise ValueError("Invalid model type, choose from {'simple', 'ridge', 'lasso'}")
    model = models[model_type]() if model_type == 'simple' else models[model_type](alpha=alpha)
    model.fit(train_X, train_y)
    if verbose:
        print(f'Trained model:\n\n Coefficients:\n{model.coef_}\n\n Intercept: {model.intercept_}\n')
    if test_X is not None and test_y is not None:
        preds = model.predict(test_X)
        mse, r2 = np.round(mean_squared_error(test_y, preds), 4), np.round(r2_score(test_y, preds), 4)
        if verbose:
            print(f'Test set results:\n')
            print(f'MSE: {mse}')
            print(f'r2 Score: {r2}')
            
    return model, (mse, r2)

### Prepare Dataset

In [4]:
train_X, train_y, test_X, test_y = get_data(df, test_size=0.2)

Trainset: X - (330, 6), Y - (330,)
Testset: X - (83, 6), Y - (83,)


### Linear Regression

In [5]:
logs = fit_reg_model(train_X, train_y, test_X=test_X, test_y=test_y, model_type='simple')

Trained model:

 Coefficients:
[ 6.08718711e+00 -2.52176180e-01 -4.77065632e-03  9.86505903e-01
  2.07070866e+02 -1.01317070e+01]

 Intercept: -16149.885202552152

Test set results:

MSE: 48.4298
r2 Score: 0.6961


### Ridge Regression

In [6]:
logs = fit_reg_model(train_X, train_y, test_X=test_X, test_y=test_y, model_type='ridge')

Trained model:

 Coefficients:
[ 6.51803603e+00 -2.39993266e-01 -5.72696190e-03  1.05465753e+00
  6.82784606e+00 -1.06281086e+00]

 Intercept: -13118.946312609578

Test set results:

MSE: 55.6732
r2 Score: 0.6506


### Lasso Regression

In [7]:
logs = fit_reg_model(train_X, train_y, test_X=test_X, test_y=test_y, model_type='lasso')

Trained model:

 Coefficients:
[ 0.         -0.22531194 -0.00580186  0.94447023  0.         -0.        ]

 Intercept: 44.499131730628115

Test set results:

MSE: 56.6011
r2 Score: 0.6448


### Effect of data split ratio and regularization parameter

In [8]:
splits = [0.4, 0.3, 0.1]
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
split_headings = [f'{int(100*(1-i))}:{int(100*i)}' for i in splits]

#### Linear Regression

In [9]:
res = {split: dict() for split in split_headings}
for split, heading in zip(splits, split_headings):
    train_X, train_y, test_X, test_y = get_data(df, test_size=split, verbose=False)
    _, (mse, r2) = fit_reg_model(train_X, train_y, test_X=test_X, test_y=test_y,
                             model_type='simple', verbose=False)
    res[heading]['mse'] = mse
    res[heading]['r2'] = r2

print('Linear Regression - effect of data split ratio')
display(pd.DataFrame(res))

Linear Regression - effect of data split ratio


Unnamed: 0,60:40,70:30,90:10
mse,56.104,59.0785,48.3609
r2,0.6382,0.6418,0.6748


#### Ridge Regression

In [10]:
res = np.empty((len(alphas), len(splits)), dtype=str).tolist()
for i, alpha in enumerate(alphas):
    for j, split in enumerate(splits):
        train_X, train_y, test_X, test_y = get_data(df, test_size=split, verbose=False)
        _, (mse, r2) = fit_reg_model(train_X, train_y, test_X=test_X, test_y=test_y,
                                 model_type='ridge', alpha = alpha, verbose=False)
        res[i][j] = f'mse: {mse}, r2: {r2}'

print('Ridge Regression - effect of data split ratio and regularization parameter')
display(pd.DataFrame(data=res, index=alphas, columns=split_headings))

Ridge Regression - effect of data split ratio and regularization parameter


Unnamed: 0,60:40,70:30,90:10
0.001,"mse: 56.0063, r2: 0.6389","mse: 59.133, r2: 0.6415","mse: 48.3696, r2: 0.6748"
0.005,"mse: 55.7886, r2: 0.6403","mse: 59.3852, r2: 0.6399","mse: 48.4521, r2: 0.6742"
0.01,"mse: 55.7377, r2: 0.6406","mse: 59.7247, r2: 0.6379","mse: 48.6157, r2: 0.6731"
0.05,"mse: 56.5974, r2: 0.6351","mse: 61.6428, r2: 0.6263","mse: 49.9688, r2: 0.664"
0.1,"mse: 57.2845, r2: 0.6306","mse: 62.6805, r2: 0.62","mse: 50.8876, r2: 0.6578"
0.5,"mse: 58.2777, r2: 0.6242","mse: 64.0929, r2: 0.6114","mse: 52.3477, r2: 0.648"


#### Lasso Regression

In [11]:
res = np.empty((len(alphas), len(splits)), dtype=str).tolist()
for i, alpha in enumerate(alphas):
    for j, split in enumerate(splits):
        train_X, train_y, test_X, test_y = get_data(df, test_size=split, verbose=False)
        _, (mse, r2) = fit_reg_model(train_X, train_y, test_X=test_X, test_y=test_y,
                                 model_type='lasso', alpha = alpha, verbose=False)
        res[i][j] = f'mse: {mse}, r2: {r2}'

print('Lasso Regression - effect of data split ratio and regularization parameter')
display(pd.DataFrame(data=res, index=alphas, columns=split_headings))

Lasso Regression - effect of data split ratio and regularization parameter


Unnamed: 0,60:40,70:30,90:10
0.001,"mse: 55.9502, r2: 0.6392","mse: 59.1764, r2: 0.6412","mse: 48.4315, r2: 0.6744"
0.005,"mse: 55.7275, r2: 0.6407","mse: 59.6281, r2: 0.6385","mse: 48.6645, r2: 0.6728"
0.01,"mse: 55.786, r2: 0.6403","mse: 60.5392, r2: 0.6329","mse: 49.3349, r2: 0.6683"
0.05,"mse: 58.4947, r2: 0.6228","mse: 64.3884, r2: 0.6096","mse: 52.731, r2: 0.6454"
0.1,"mse: 58.3026, r2: 0.6241","mse: 64.1018, r2: 0.6113","mse: 52.5384, r2: 0.6467"
0.5,"mse: 59.566, r2: 0.6159","mse: 64.6232, r2: 0.6082","mse: 53.295, r2: 0.6417"


# Exercise 2: (Logistic Regression)

In [12]:
df = pd.read_csv('haberman.data', header=None, names=['age', 'op_yr - 1900', 'axillary nodes', 'Y'])
df.Y = df.Y.apply(lambda x: 1 if x==1 else 0)
df.head()

Unnamed: 0,age,op_yr - 1900,axillary nodes,Y
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [13]:
def fit_logistic_model(train_X, train_y, test_X = None, test_y=None, verbose=True):
    error = None

    model = linear_model.LogisticRegression()
    model.fit(train_X, train_y)

    if verbose: print(f'Trained model:\n\n Coefficients:\n{model.coef_}\n\n Intercept: {model.intercept_}\n')
    
    if test_X is not None and test_y is not None:
        preds = model.predict(test_X)
        error = np.round(np.mean(preds != test_y), 4)
        if verbose:
            print(f'Error: {error}')
    
    return model, error

In [14]:
train_X, train_y, test_X, test_y = get_data(df, test_size=0.2)

Trainset: X - (244, 3), Y - (244,)
Testset: X - (62, 3), Y - (62,)


In [15]:
logs = fit_logistic_model(train_X, train_y, test_X=test_X, test_y=test_y)

Trained model:

 Coefficients:
[[-0.02633861  0.01323499 -0.08835826]]

 Intercept: [1.95297432]

Error: 0.2581


In [16]:
res = {split: dict() for split in split_headings}
for split, heading in zip(splits, split_headings):
    train_X, train_y, test_X, test_y = get_data(df, test_size=split, verbose=False)
    _, error = fit_logistic_model(train_X, train_y, test_X=test_X, test_y=test_y, verbose=False)
    res[heading]['classification error'] = error

print('Logistic Regression - effect of data split ratio')
display(pd.DataFrame(res))

Logistic Regression - effect of data split ratio


Unnamed: 0,60:40,70:30,90:10
classification error,0.3008,0.2935,0.1613


In [17]:
m = linear_model.Ridge()


In [None]:
m.fit()