In [22]:
import pandas as pd
import numpy as np

In [23]:
# Read the modelling CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='NPPE1_ModelBuilding3.csv')
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.087472,0.002829,1.768235,0.188396,0.117071,0.148148,0.478165,0.720443,0.967195,0.281563,0.428571,0.959596,1.0,0.0,23.358097
1,0.378379,0.022079,1.115629,0.091974,0.066089,0.851852,0.911759,0.785321,0.885001,0.424648,0.285714,0.868687,0.0,1.0,17.268768
2,0.066901,0.003828,-0.536262,0.221188,0.255671,0.296296,0.228024,0.406472,0.980184,0.274376,0.428571,0.767677,0.0,1.0,27.776974
3,0.140645,0.011132,1.323366,0.422514,0.153103,0.148148,0.410679,0.200319,0.861371,0.305006,0.142857,0.848485,0.0,1.0,16.12196
4,0.144225,0.204918,-0.93079,0.148694,0.17749,0.259259,0.146832,0.111429,0.983448,0.286322,0.285714,0.616162,1.0,0.0,23.129426


In [24]:
from sklearn.model_selection import train_test_split

# Column '14' is used as the target vector; every other column is a feature.
y = df['14']
X = df.drop(columns='14')

# 70/30 split with a fixed seed so the same rows land in each partition on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [25]:
from sklearn.linear_model import Ridge

# Ridge regression with regularisation strength alpha=10, solved with SAGA;
# the seed makes the solver's data shuffling repeatable.
model = Ridge(alpha=10, solver='saga', tol=1e-4, random_state=42)

# fit() returns the estimator itself, so the test-split score (R^2 for
# regressors) can be chained and shown as the cell's output.
model.fit(X_train, y_train).score(X_test, y_test)

0.6613547575262211

In [26]:
# Display the fitted coefficient vector of `model` (one weight per feature column).
model.coef_

array([ -0.4825441 ,   3.74601838,  -0.73583331,   0.54199933,
        -9.89014109,   5.80114296,  -5.06099736,  -9.45015598,
         4.73124885, -23.51321982,  11.31863371,   0.49450664,
        -0.89196134,   0.89196134])

In [27]:
# Positional index of the coefficient with the largest magnitude, i.e. the
# feature ranked most influential by |coef|.
np.argmax(np.abs(model.coef_))

9

In [28]:
# Positional index of the coefficient with the smallest magnitude, i.e. the
# feature ranked least influential by |coef|.
np.argmin(np.abs(model.coef_))

0

#### SGD

In [29]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

# Search space: 2 penalties x 5 alphas x 4 tolerances = 40 candidates,
# each evaluated with 5-fold cross-validation.
parameters = {
    'penalty' : ['l1', 'l2'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    'tol' : [1e-4, 1e-3, 1e-2, 1e-1],
}

# Seeded base estimator so every candidate fit is repeatable.
model = SGDRegressor(random_state=42)

# The scorer is the negated MAE, so "higher is better" holds during selection.
clf = GridSearchCV(
    model,
    parameters,
    scoring='neg_mean_absolute_error',
    cv=5,
)
clf.fit(X_train, y_train)

In [30]:
# Parameter combination that achieved the best mean cross-validated score.
clf.best_params_

{'alpha': 0.001, 'penalty': 'l2', 'tol': 0.0001}

In [31]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the held-out split, using the estimator that the
# grid search refitted with its best parameters.
y_pred = clf.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.8131121797994014

#### Lasso Pipeline

In [32]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

In [33]:
# Two-step pipeline: PCA projection feeding a Lasso regressor.
pipe = Pipeline(steps=[('pca', PCA()), ('lasso', Lasso())])

# `<step>__<param>` keys route each grid value to the named pipeline step.
# A float n_components keeps just enough components to explain that share
# of the variance.
parameters = {
    'pca__n_components': [0.9, 0.95],
    'lasso__alpha': [10, 1, 0.01, 0.001],
}

# 5-fold search scored by negated MAE, parallelised over all available cores.
clf = GridSearchCV(
    pipe,
    parameters,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
clf.fit(X_train, y_train)
clf.best_params_

{'lasso__alpha': 0.01, 'pca__n_components': 0.95}

In [34]:
from sklearn.metrics import r2_score

# R^2 on the held-out split for the best pipeline found by the grid search.
y_pred = clf.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.6288625430197571

In [35]:
# Eigenvalues of the training-data covariance matrix, one per principal
# component in decreasing order; the first entry is the first component's.
from sklearn.decomposition import PCA
pca = PCA().fit(X_train)
pca.explained_variance_

array([1.16350757e+00, 1.51097446e-01, 1.13215850e-01, 7.01973446e-02,
       3.86638789e-02, 2.77066927e-02, 2.51577221e-02, 2.00974792e-02,
       1.75339483e-02, 1.14959742e-02, 1.02977735e-02, 9.11833250e-03,
       5.54734000e-03, 3.11646023e-32])

#### Polynomial Feature Pipeline

In [36]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline

# Degree-2 polynomial expansion (squares as well as pairwise products, since
# interaction_only is off) followed by a Lasso regressor.
pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, interaction_only=False)),
    ('lasso', Lasso(alpha=1, random_state=0, warm_start=True)),
])

pipe.fit(X_train, y_train)

In [37]:
from sklearn.metrics import r2_score

# R^2 of the polynomial + Lasso pipeline on the held-out split.
y_pred = pipe.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.157678032410551

In [48]:
""" 
If you eliminate 1 feature with recursive feature elimination, which feature will be eliminated?
Type the index of the eliminated feature (index starts from 0).
Use  LinearRegression model with default parameters as an estimator.
Use processed training data.
"""

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Eliminating one feature per round down to a single survivor makes ranking_
# record the full removal order: rank 1 is the feature kept to the end and
# the largest rank marks the feature dropped in the very first round.
rfe = RFE(LinearRegression(), n_features_to_select=1).fit(X_train, y_train)

rfe.ranking_

array([10,  8, 14, 13,  4,  6,  5,  3,  7,  1,  2, 11,  9, 12])

In [49]:
# (rows, columns) of the training feature matrix.
X_train.shape

(2800, 14)