## Separate noise from features

The file `mystery.dat` contains pairs $(x, y)$, where $x \in \mathbb{R}^{100}$ and $y \in \mathbb{R}$. There is one data point per line, with comma-separated values; the very last number in each line is the $y$-value.

In this data set, $y$ is a linear function of just ten of the features in $x$, plus some noise. The problem is to identify those ten features.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the comma-separated data file. It has no header row, so pandas
# labels the columns with consecutive integers starting at 0.
data = pd.read_csv(
    './mystery.dat',
    header=None,  # the first line is already a data point, not column names
)
# Preview the first rows as a quick sanity check of the parse.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-1.14558,-1.29249,0.84911,0.36008,0.26068,2.51167,2.31855,0.60805,0.3428,-0.28903,...,0.16212,-0.60644,1.57021,-1.40166,-0.65263,-0.08357,0.78098,1.5532,-1.42135,1.19238
1,1.38724,-1.00201,-0.3337,0.32802,0.1586,-0.5706,-0.05989,0.91503,0.00859,-2.01566,...,-0.07138,-0.12137,1.20241,-2.84456,-1.17875,0.0449,0.59131,0.81903,0.39286,-3.44094
2,1.47233,0.8488,-0.33866,-0.45366,0.9859,1.31009,-0.71319,-0.03236,-0.38447,0.72138,...,-0.62333,1.74732,0.06279,0.62084,-0.9935,-0.10799,-0.87872,0.08911,-1.72476,3.75006
3,-1.3405,-0.90731,2.07849,-1.18061,-3.23561,0.37517,-0.05829,0.38018,2.28377,0.18249,...,-0.5136,-0.95735,0.91067,-0.36184,0.36378,2.13123,0.00686,0.68222,0.79878,-8.60734
4,0.11286,-0.72004,0.04818,0.34493,0.49831,0.10448,0.37044,-1.46027,0.77638,-1.9551,...,0.25205,1.25199,-0.19591,0.44213,-2.14802,-1.96095,-0.9568,1.07276,0.28898,0.28307


---
Since $y$ is a linear function of only a few of the features in $x$, I'll use L1 (Lasso) regularization, which drives the coefficients of unimportant features to exactly zero and should therefore ideally separate the true features from the noise.

In [3]:
from sklearn.linear_model import Lasso

In [4]:
# Baseline: fit a Lasso model with the library's default penalty strength.
# Columns 0-99 hold the candidate features, column 100 holds the target y.
x_values = data.iloc[:, :100].values
y_values = data.iloc[:, 100].values
lasso1 = Lasso()
lasso1.fit(x_values, y_values)
# Count how many coefficients survived the L1 penalty (i.e. are non-zero).
n_used = (lasso1.coef_ != 0).sum()
print("Number of features used: {}".format(n_used))

Number of features used: 4


In [5]:
# Coarse sweep: weaken the L1 penalty step by step and report how many
# coefficients remain non-zero at each strength.
x_values = data.iloc[:, :100].values
y_values = data.iloc[:, 100].values
for alpha in (1., 0.8, 0.7, 0.6, 0.5):
    lasso = Lasso(alpha=alpha).fit(x_values, y_values)
    n_used = np.sum(lasso.coef_ != 0)
    print("Number of features used: {}".format(n_used))

Number of features used: 4
Number of features used: 8
Number of features used: 8
Number of features used: 9
Number of features used: 11


In [6]:
# Finer sweep between alpha=0.6 and alpha=0.5, the two strengths from the
# coarse sweep that bracket ten surviving features (9 and 11 respectively).
report = "Number of features used: {}"
for strength in [0.6, 0.575, 0.55, 0.525, 0.5]:
    lasso = Lasso(alpha=strength)
    lasso.fit(data.iloc[:, :100].values, data.iloc[:, 100].values)
    print(report.format(np.sum(lasso.coef_ != 0)))

Number of features used: 9
Number of features used: 11
Number of features used: 11
Number of features used: 11
Number of features used: 11


In [7]:
# Zoom in just below alpha=0.6, where the previous sweep jumped from 9 to
# 11 surviving features, to find a strength that keeps exactly ten.
inputs, target = data.iloc[:, :100].values, data.iloc[:, 100].values
for alpha in [0.6, 0.59, 0.58, 0.57]:
    lasso = Lasso(alpha=alpha).fit(inputs, target)
    is_selected = lasso.coef_ != 0
    print("Number of features used: {}".format(np.sum(is_selected)))

Number of features used: 9
Number of features used: 10
Number of features used: 10
Number of features used: 11


In [8]:
# Refit at alpha=0.59, one of the strengths that left exactly ten non-zero
# coefficients in the sweep above; `lasso` is the model used from here on.
chosen_alpha = 0.59
lasso = Lasso(alpha=chosen_alpha)
lasso.fit(data.iloc[:, :100].values, data.iloc[:, 100].values)
n_used = (lasso.coef_ != 0).sum()
print("Number of features used: {}".format(n_used))

Number of features used: 10


In [9]:
# Show the fitted weight of every feature (a copy, so the model's own
# coefficient array is never exposed for accidental modification).
lasso.coef_.copy()

array([ 0.        ,  0.10299376,  0.00652474,  0.        ,  0.2893491 ,
       -0.        ,  0.29522954, -0.        , -0.        , -0.        ,
        0.42515876, -0.        ,  0.25173601,  0.        , -0.        ,
       -0.        ,  0.56205934, -0.        ,  0.39238243,  0.        ,
        0.        , -0.        ,  0.40961863,  0.        , -0.        ,
       -0.        , -0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        , -0.        , -0.        ,
       -0.        , -0.        ,  0.        , -0.        ,  0.        ,
       -0.        ,  0.        , -0.        , -0.        ,  0.        ,
       -0.        , -0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        ,  0.        , -0.        ,
        0.        , -0.        , -0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.        , -0.        ,
        0.        , -0.        , -0.        ,  0.        ,  0.  

In [10]:
# Rank the features (1-based) from most to least influential.
# The ranking uses the absolute value of each coefficient: sorting on the
# signed value would push a feature with a large *negative* weight to the
# very end of the list even though it matters as much as a positive one.
# Features whose coefficient is zero are all tied, so their relative order
# in the tail of the result carries no meaning.
np.argsort(-np.abs(lasso.coef_)) + 1

array([ 17,  11,  23,  19,   7,   5,  13,   2,  81,   3,  68,  69,   1,
        67,  72,  73,  74,  70,  71,  65,  75,  64,  63,  62,  61,  60,
        59,  58,  57,  66,  76,  82,  78,  98,  97,  96,  95,  94,  93,
        92,  91,  90,  89,  88,  87,  86,  85,  84,  83,  56,  80,  79,
        77,  55,  50,  53,  29,  28,  27,  26,  25,  24,  22,  21,  20,
        18,  16,  15,  14,  12,  10,   9,   8,   6,   4,  30,  54,  31,
        33,  52,  51,  99,  49,  48,  47,  46,  45,  44,  43,  42,  41,
        40,  39,  38,  37,  36,  35,  34,  32, 100])

In [11]:
# Collect the 1-based positions of every feature whose Lasso weight is
# non-zero; these are the features the model did not discard as noise.
nonzero_positions = np.flatnonzero(lasso.coef_)
features = (nonzero_positions + 1).tolist()
print("Indices that aren't noise: {}".format(features))

Indices that aren't noise: [2, 3, 5, 7, 11, 13, 17, 19, 23, 81]
