In [1]:
import pathlib
import itertools
import collections

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, model_selection
from sklearn import cross_decomposition
from sklearn import decomposition
from sklearn import pipeline

In [2]:
data_folder = pathlib.Path("./../data/spam")

In [3]:
list(data_folder.iterdir())

[PosixPath('../data/spam/spam.info.txt'),
 PosixPath('../data/spam/spam.data'),
 PosixPath('../data/spam/spam.traintest')]

In [4]:
info_file = data_folder / "spam.info.txt"
data_file = data_folder / "spam.data"

### Data info

In [5]:
print(info_file.read_text())

1. Title:  SPAM E-mail Database

2. Sources:
   (a) Creators: Mark Hopkins, Erik Reeber, George Forman, Jaap Suermondt
        Hewlett-Packard Labs, 1501 Page Mill Rd., Palo Alto, CA 94304
   (b) Donor: George Forman (gforman at nospam hpl.hp.com)  650-857-7835
   (c) Generated: June-July 1999

3. Past Usage:
   (a) Hewlett-Packard Internal-only Technical Report. External forthcoming.
   (b) Determine whether a given email is spam or not.
   (c) ~7% misclassification error.
       False positives (marking good mail as spam) are very undesirable.
       If we insist on zero false positives in the training/testing set,
       20-25% of the spam passed through the filter.

4. Relevant Information:
        The "spam" concept is diverse: advertisements for products/web
        sites, make money fast schemes, chain letters, pornography...
	Our collection of spam e-mails came from our postmaster and 
	individuals who had filed spam.  Our collection of non-spam 
	e-mails came from filed work a

In [6]:
df = pd.read_csv(data_file, header=None, sep=" ")
df.rename({57: "label"}, axis=1, inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [7]:
train_df, test_df = model_selection.train_test_split(df, train_size=0.3)
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [8]:
def cal_z_score(df, y_hat, train_cols, label_col, coefficients):
    y = df.loc[:, label_col].values
    X = df.loc[:, train_cols].values
    num_ele = y.shape[0]
    num_preds = X.shape[1]
    
    S = np.linalg.inv(np.matmul(X.T, X))
    var = np.square(y-y_hat).sum()/(num_ele - num_preds - 1)
    
    z_scores = []
    for i, coeff in enumerate(coefficients):
        z_score = coeff/(S[i][i]*np.sqrt(S[i][i]))
        z_scores.append(z_score)
        
    return z_scores

cal_rss = lambda y, y_hat: np.square(y - y_hat).sum()/y.shape[0]

In [9]:
def train_OLS(train_df, preds_cols, label_col):
    reg = linear_model.LinearRegression(n_jobs=-1)
    reg.fit(train_df.loc[:, preds_cols], train_df.loc[:, label_col])
    return reg, preds_cols, label_col

In [10]:
reg_dict = {}

In [11]:
target = "label"
predictors = list(train_df.columns)
predictors.remove("label")

# OLS

In [12]:
ols_set = train_OLS(train_df, predictors, target)
reg_dict["ols"] = {"reg": ols_set[0], "cols": ols_set[1], "coeff": ols_set[0].coef_}

# Best subset selector

In [13]:
best_set = None
best_set_rss = -1
best_n = 3
for i in range(1, best_n):
    for cols in itertools.combinations(predictors, i):
        fit_set = train_OLS(train_df, cols, target)
        
        ## CALCULATE METRICS ON TEST SET
        y = test_df.loc[:, fit_set[2]]
        X = test_df.loc[:, fit_set[1]]
        
        y_hat = fit_set[0].predict(X)
        fit_rss = cal_rss(y, y_hat)
        if fit_rss > best_set_rss:
            best_set = fit_set
            best_set_rss = fit_rss

In [14]:
## Cal best subset scores
X = test_df.loc[:, best_set[1]]
y = test_df.loc[:, best_set[2]]

reg_dict["best subset"] = {"reg": best_set[0], "cols": best_set[1], "coeff": best_set[0].coef_}

# Ridge

In [15]:
ridge = linear_model.Ridge()
ridge.fit(train_df.loc[:, predictors], train_df.loc[:, target])

Ridge()

In [16]:
reg_dict["ridge"] = {"reg": ridge, "cols": predictors, "coeff": ridge.coef_}

# Lasso

In [17]:
lasso = linear_model.Lasso()
lasso.fit(train_df.loc[:, predictors], train_df.loc[:, target])

Lasso()

In [18]:
reg_dict["lasso"] = {"reg": lasso, "cols": predictors, "coeff": lasso.coef_}

# PCR

In [19]:
n_comps = len(predictors)
pcr = pipeline.Pipeline([
    ('pca', decomposition.PCA(n_components=n_comps)), 
    ('ols', linear_model.LinearRegression()
    )]
        )
pcr.fit(train_df.loc[:, predictors], train_df.loc[:, target])

Pipeline(steps=[('pca', PCA(n_components=57)), ('ols', LinearRegression())])

In [20]:
reg_dict["pcr"] = {"reg": pcr, "cols": predictors, "coeff": pcr.named_steps["ols"].coef_}

# PLS

In [21]:
n_comps = len(predictors)
pls = cross_decomposition.PLSRegression(n_components=n_comps)
pls.fit(train_df.loc[:, predictors], train_df.loc[:, target])

PLSRegression(n_components=57)

In [36]:
reg_dict["pls"] = {"reg": pls, "cols": predictors, "coeff": [coef[0] for coef in pls.coef_]}

# Collate results

In [42]:
results_dict = {key: {
                        value["cols"][idx]: value["coeff"][idx] 
                        for idx in range(len(value["cols"]))
                     }
                for key, value in reg_dict.items()}

results_dict = {key: {**value, **{"avg" : sum(value.values())}}
                for key, value in results_dict.items()}

In [43]:
results = pd.DataFrame.from_dict(results_dict)

In [44]:
results

Unnamed: 0,ols,best subset,ridge,lasso,pcr,pls
0,-0.043206,,-0.043765,0.0,0.00021,-0.012157
1,-0.012887,,-0.012716,-0.0,0.000843,-0.017152
2,0.014509,,0.015208,0.0,-0.002523,0.006994
3,0.009416,,0.009455,0.0,-0.025712,0.009189
4,0.090294,,0.089911,0.0,-0.084523,0.056738
5,0.128448,,0.12794,0.0,0.029253,0.033803
6,0.213599,,0.213686,0.0,-0.010038,0.081964
7,0.084915,,0.084343,0.0,0.103894,0.03861
8,0.079565,,0.080278,0.0,-0.051765,0.02095
9,0.032111,,0.032245,0.0,0.028895,0.019806
