In [1]:
# Switch to the parent directory so the relative paths used below (e.g. 'data/train.csv') resolve.
# NOTE(review): not idempotent - re-running this cell climbs one more directory level.
cd ..

/home/scott/Documents/git/bite-me


In [2]:
import sklearn
import pandas as pd
import numpy as np
import datetime

In [3]:
from IPython.display import HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

  "The Gtk3Agg backend is known to not work on Python 3.x with pycairo. "


In [4]:
import sklearn.linear_model
import sklearn.cross_validation

In [5]:
import sklearn.decomposition
import sklearn.manifold

In [6]:
from eat_it import StratifiedPercentileKFold
from eat_it import scalers
from eat_it import params

In [7]:
import imp

In [8]:
def do_cv(cv, clf, X, y, y_transform=None):
    """Cross-validate `clf` and report root-mean-squared errors (RMSE).

    Parameters
    ----------
    cv : iterable of (train_index, test_index) pairs
        Index arrays used to slice `X` and `y` along their first axis.
    clf : object with `fit(X, y)` and `predict(X)`
        Re-fitted in place on every training split.
    X, y : numpy arrays
        Samples and targets; both are indexed with the fold index arrays.
    y_transform : callable, optional
        Applied to both the predictions and the true targets before the
        error is computed. Defaults to the identity.

    Returns
    -------
    all_score : float
        RMSE pooled over every sample that appeared in at least one test
        fold. Samples never predicted are excluded instead of being
        compared against a placeholder prediction of zero.
    scores : numpy array
        RMSE of each individual fold, in iteration order.
    """
    if y_transform is None:
        y_transform = lambda x: x

    def rmse(y_true, y_hat):
        # Computed directly with NumPy: `sklearn.metrics` is never imported
        # explicitly in this notebook, so attribute access on it is fragile.
        residual = np.asarray(y_true, dtype=float) - np.asarray(y_hat, dtype=float)
        return float(np.sqrt(np.mean(residual ** 2)))

    fold_scores = []
    all_pred = np.zeros(y.shape)
    # Tracks which samples actually received an out-of-fold prediction
    was_predicted = np.zeros(y.shape[0], dtype=bool)
    for train_index, test_index in cv:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Fit model on the training split
        clf.fit(X_train, y_train)
        # Predict the held-out samples
        y_pred = y_transform(clf.predict(X_test))
        # Keep the out-of-fold predictions for the pooled score
        all_pred[test_index] = y_pred
        was_predicted[test_index] = True
        # RMSE on this test split
        fold_scores.append(rmse(y_transform(y_test), y_pred))
    # Pooled RMSE across all predicted samples
    y_true_all = np.asarray(y_transform(y))
    all_score = rmse(y_true_all[was_predicted], all_pred[was_predicted])
    return all_score, np.asarray(fold_scores)

In [9]:
def get_cv_results(clf, X, y, n_folds=10, n_samp=25):
    """Repeat cross-validation with `n_samp` different seeds and summarise.

    For each seed in `range(n_samp)` a `StratifiedPercentileKFold` splitter
    is built on `y` and passed to `do_cv`; only the pooled score of each
    repeat is kept.

    Returns
    -------
    (mean, sem) : tuple of floats
        Mean of the pooled scores and their standard error. The standard
        error divides by the square root of the number of repeats (the
        number of values the std is taken over), not by `n_folds`.
    """
    all_scores = []
    for seed in range(n_samp):
        cv = StratifiedPercentileKFold.StratifiedPercentileKFold(
            y, n_folds=n_folds, shuffle=True, random_state=seed, shuffle_windows=True)
        pooled_score, _ = do_cv(cv, clf, X=X, y=y)
        all_scores.append(pooled_score)
    return np.mean(all_scores), np.std(all_scores) / np.sqrt(len(all_scores))

def get_mean_cv_score(*args, **kwargs):
    """Forward every argument to `get_cv_results` and return only its first element (the mean score)."""
    mean_score = get_cv_results(*args, **kwargs)[0]
    return mean_score

In [10]:
### Recursive feature addition
from sklearn.base import clone

def rfa(clf, X, y, n_folds=10, n_samp=25, col_names=None, verbosity=2):
    """Greedy forward feature selection ("recursive feature addition").

    Starting from an empty set, every round scores each not-yet-selected
    column together with the already-selected ones via `get_mean_cv_score`
    and adds the column with the LOWEST score (the score is minimised).
    The search stops once all columns are selected, or when the best
    candidate of a round scores strictly worse than the previous best.

    Parameters
    ----------
    clf : estimator
        Cloned with `sklearn.base.clone` each round; the clone is what gets scored.
    X : 2-D numpy array
        Indexed as `X[:, features]`.
    y : array
        Targets, forwarded to `get_mean_cv_score`.
    n_folds, n_samp : int
        Forwarded to `get_mean_cv_score`.
    col_names : sequence, optional
        Labels used in the printed messages. Defaults to the column indices.
    verbosity : int
        0 is silent, >=1 prints additions and the summary, >=2 also prints
        every candidate score.

    Returns
    -------
    support_ : bool array of shape (n_features,)
        True for the selected columns.
    ranking_ : int array of shape (n_features,)
        Order in which each column was added (1 = first). Columns never
        added keep the initial value `n_features`.
    """
    n_features = X.shape[1]
    n_features_to_select = n_features
    step = 1

    if col_names is None:
        col_names = np.arange(n_features)
    # Cast to str so ', '.join(...) also works for the default integer labels
    col_names = np.asarray(col_names).astype(str)

    # Builtin bool/int: the np.bool / np.int aliases were removed from NumPy
    support_ = np.zeros(n_features, dtype=bool)
    ranking_ = n_features * np.ones(n_features, dtype=int)
    last_score = None

    # Feature addition
    while np.sum(support_) < n_features_to_select:
        # Previously added features
        features_already = np.flatnonzero(support_)
        # Features to test
        features_to_test = np.flatnonzero(np.logical_not(support_))

        # Fresh, unfitted copy of the estimator for this round
        estimator = clone(clf)

        # Score each candidate together with the already-selected features
        scores = np.zeros(len(features_to_test))
        for feature_index, test_feature in enumerate(features_to_test):
            features = np.union1d(features_already, [test_feature])
            scores[feature_index] = get_mean_cv_score(estimator, X[:, features], y, n_folds=n_folds, n_samp=n_samp)
            if verbosity>=2:
                print("\tScored %.2f with %s" % (scores[feature_index], ', '.join(col_names[features])))

        # Sort the scores in ascending order (best first: we MINIMISE the score)
        score_order_index = np.argsort(scores)
        ordered_scores   = scores[score_order_index]
        ordered_features = features_to_test[score_order_index]

        # Break if no features can improve score
        if last_score is not None and last_score < ordered_scores[0]:
            if verbosity:
                print('No more improvement possible from {} to {} features'.format(
                        len(features_already),len(features_already)+1))
            break

        # Only add `step` many features if it doesn't take us past the target
        n_selected = int(np.sum(support_))
        n_add = min(step, n_features_to_select - n_selected)

        # Only add features which don't make performance go down.
        # count_nonzero counts the qualifying candidates; len(np.nonzero(...))
        # was the length of a tuple and therefore always 1.
        if last_score is not None:
            n_add = min(n_add, int(np.count_nonzero(ordered_scores <= last_score)))

        # Nothing qualifies (e.g. every score is NaN): stop rather than loop forever
        if n_add < 1:
            if verbosity:
                print('No more improvement possible from {} to {} features'.format(
                        len(features_already),len(features_already)+1))
            break

        # Select best.
        features_to_add = ordered_features[0:n_add]
        for i in range(n_add):
            if verbosity:
                print('Adding feature {} (scored {})'.format(col_names[ordered_features[i]], ordered_scores[i]))

        # Add the features; ranks continue from the count selected BEFORE this round,
        # one rank per added feature (np.arange takes the count, not the index array)
        support_[features_to_add] = True
        ranking_[features_to_add] = n_selected + 1 + np.arange(n_add)

        # Update score monitor
        last_score = ordered_scores[0]

    if verbosity:
        print("Best score is {} with features:\n\t{}".format(last_score,', '.join(col_names[support_])))

    return support_, ranking_

In [11]:
train = pd.read_csv('data/train.csv', encoding="utf-8")

# Age in days, counted back from the fixed reference date
end_dt = datetime.datetime.strptime('2015-1-1', "%Y-%m-%d")
train['Age'] = [
    (end_dt - datetime.datetime.strptime(opened, "%m/%d/%Y")).days
    for opened in train['Open Date']
]

# City-size flag
train['isBig'] = train['City Group'].eq('Big Cities')
# One flag per big city
train['isIstanbul'] = train['City'].eq('İstanbul')
train['isAnkara'] = train['City'].eq('Ankara')
train['isIzmir'] = train['City'].eq('İzmir')
# Restaurant-type flag
train['isIL'] = train['Type'].eq('IL')
# Flag rows in which every `params.xor_cols` field is zero (treated as missing)
train['missingSource'] = (train[params.xor_cols] == 0).all(axis=1)

In [12]:
gtest = pd.read_csv('data/genuinetest.csv', encoding="utf-8")

# Age in days, counted back from the fixed reference date
end_dt = datetime.datetime.strptime('2015-1-1', "%Y-%m-%d")
gtest['Age'] = [
    (end_dt - datetime.datetime.strptime(opened, "%m/%d/%Y")).days
    for opened in gtest['Open Date']
]

# City-size flag
gtest['isBig'] = gtest['City Group'].eq('Big Cities')
# One flag per big city
gtest['isIstanbul'] = gtest['City'].eq('İstanbul')
gtest['isAnkara'] = gtest['City'].eq('Ankara')
gtest['isIzmir'] = gtest['City'].eq('İzmir')
# Restaurant-type flag
gtest['isIL'] = gtest['Type'].eq('IL')
# Flag rows in which every `params.xor_cols` field is zero (treated as missing)
gtest['missingSource'] = (gtest[params.xor_cols] == 0).all(axis=1)

In [13]:
# Stack the train and test rows into one frame with a fresh 0..n-1 index.
# Revenue is not available for every row here (presumably only for the `train` rows - TODO confirm).
unlabelled_data = pd.concat((train, gtest), ignore_index=True)

In [14]:
# Drop every row whose Type is 'DT'.
# The index is not reset afterwards, so index labels may have gaps.
unlabelled_data = unlabelled_data[unlabelled_data['Type']!='DT']

In [15]:
# Attach the known revenues of the public test rows (joined on 'Id')
gtestrevenue = pd.read_csv('data/genuinetestrevenue.csv', encoding="utf-8")
labelled_test = gtest.merge(gtestrevenue, on='Id')
# Stack every labelled row (train + labelled test) into one frame with a fresh index
data = pd.concat([train, labelled_test], ignore_index=True)

In [16]:
# Drop every labelled row whose Type is 'DT' (same filter as applied to `unlabelled_data`).
# The index is not reset afterwards, so index labels may have gaps.
data = data[data['Type']!='DT']

In [17]:
# Column groups used to build the feature matrices
Pcols = ['P%d' % n for n in range(1, 38)]                   # P1 .. P37
PMcols = params.xor_cols                                    # P columns that are sometimes all-zero ("missing")
PVcols = [name for name in Pcols if name not in PMcols]     # P columns that are always valid
Gcols = ['Age']
Ocols = ['isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
cols = Pcols + Gcols + Ocols

In [18]:
# Regression targets: the revenue of every labelled row, as a NumPy array in `data` row order
y = data['revenue'].values

In [19]:
# Ids of the labelled rows, in `data` row order
X_indices = data['Id'].values

In [20]:
# Ids of all rows (labelled or not), in `unlabelled_data` row order
uX_indices = unlabelled_data['Id'].values

In [21]:
# Boolean mask over `unlabelled_data` rows: True where the row's Id also occurs among the labelled Ids
_labelled_ids = set(X_indices)
index_is_labelled = np.asarray([row_id in _labelled_ids for row_id in uX_indices])

In [22]:
# Sanity check: the Ids of the rows flagged as labelled match `X_indices` element for element,
# so `unlabelled_data` rows selected with `index_is_labelled` line up with the rows of `data`.
np.array_equal(uX_indices[index_is_labelled], X_indices)

True

In [23]:
# Row masks that are True where `missingSource` is False,
# i.e. where the `params.xor_cols` fields are not all zero.
unlabelled_data_nomissing = np.logical_not(unlabelled_data['missingSource'].values)
data_nomissing = np.logical_not(data['missingSource'].values)

In [24]:
# Build the scaled feature matrices. Each scaler is fitted on `unlabelled_data`
# rows and then applied to the labelled rows in `data`.
# `df[cols].values` / builtin `float` replace `DataFrame.as_matrix` / `np.float`,
# which have been removed from pandas and NumPy; the resulting arrays are the same.

# Other (already one-hot columns) can stay as they are
XO = data[Ocols].values.astype(float)

# Need to take logs because sometimes Age can't be mapped correctly by BoxCox
u = np.log(unlabelled_data[Gcols].values.astype(float))
d = np.log(data[Gcols].values.astype(float))
XG = scalers.BoxCoxScaler().fit(u).transform(d)

# Valid-always columns
u = unlabelled_data[PVcols].values.astype(float)
d = data[PVcols].values.astype(float)
s = scalers.BoxCoxScaler().fit(u)
XPV = s.transform(d)
uXPV = s.transform(u)

# Missing-sometimes columns: fit only on rows where these fields are present
u = unlabelled_data[PMcols].values.astype(float)[unlabelled_data_nomissing]
d = data[PMcols].values.astype(float)
s = scalers.BoxCoxScaler(known_min=0).fit(u)
XPM = s.transform(d)
# NOTE: uXPM only covers the non-missing rows of `unlabelled_data`
uXPM = s.transform(u)

# All columns
XPA = np.concatenate((XPV,XPM),axis=1)

## Model testing

Linear Regression
```
2252669 BoxCox, no dim reduction
2218005 PCA   2229214 with extra reduction
2231867 ICA   2214917 with extra reduction
2210257 NMF   2206676 with extra reduction
2277587 FA    2277545 with extra reduction
```

In [54]:
# Baseline: scaled always-valid P columns with no dimensionality reduction
X_ = np.hstack([XPV, XG, XO])
cols_ = PVcols + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P19', 'P20', 'P21', 'P22', 'P23', 'P28', 'P29', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2493883.02 with P1
	Scored 2459371.14 with P2
	Scored 2520706.57 with P3
	Scored 2517812.42 with P4
	Scored 2523299.81 with P5
	Scored 2465725.19 with P6
	Scored 2509271.38 with P7
	Scored 2501598.90 with P8
	Scored 2515763.28 with P9
	Scored 2513614.27 with P10
	Scored 2498809.36 with P11
	Scored 2517824.99 with P12
	Scored 2515057.29 with P13
	Scored 2530112.02 with P19
	Scored 2504898.56 with P20
	Scored 2489590.07 with P21
	Scored 2522997.84 with P22
	Scored 2523959.68 with P23
	Scored 2443131.66 with P28
	Scored 2488209.60 with P29
	Scored 2383281.78 with Age
	Scored 2421061.88 with isBig
	Scored 2406686.31 with isIstanbul
	Scored 2524844.04 with isAnkara
	Scored 2531819.78 with isIzmir
	Scored 2509921.33 with isIL
	Scored 2524877.55 with missingSource
Adding feature

In [55]:
# PCA (all components, no whitening) of the always-valid P columns.
# Fitted on the labelled rows only.
XPV_ = sklearn.decomposition.PCA().fit_transform(XPV)
PVcols_ = ['PV_PCA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_PCA_0', 'PV_PCA_1', 'PV_PCA_2', 'PV_PCA_3', 'PV_PCA_4', 'PV_PCA_5', 'PV_PCA_6', 'PV_PCA_7', 'PV_PCA_8', 'PV_PCA_9', 'PV_PCA_10', 'PV_PCA_11', 'PV_PCA_12', 'PV_PCA_13', 'PV_PCA_14', 'PV_PCA_15', 'PV_PCA_16', 'PV_PCA_17', 'PV_PCA_18', 'PV_PCA_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2516380.11 with PV_PCA_0
	Scored 2426867.77 with PV_PCA_1
	Scored 2517777.00 with PV_PCA_2
	Scored 2500838.17 with PV_PCA_3
	Scored 2509691.74 with PV_PCA_4
	Scored 2524592.50 with PV_PCA_5
	Scored 2507348.25 with PV_PCA_6
	Scored 2528131.73 with PV_PCA_7
	Scored 2497650.08 with PV_PCA_8
	Scored 2515095.22 with PV_PCA_9
	Scored 2491133.00 with PV_PCA_10
	Scored 2547004.53 with PV_PCA_11
	Scored 2516129.93 with PV_PCA_12
	Scored 2518222.95 with PV_PCA_13
	Scored 2521307.95 with PV_PCA_14
	Scored 2507892.55 with PV_PCA_15
	Scored 2531250.49 with PV_PCA_16
	Scored 2519919.81 with PV_PCA_17
	Scored 2525499.82 with PV_PCA_18
	Scored 2520299.41 with PV_PCA_19
	

In [79]:
# PCA (all components, no whitening) of the always-valid P columns.
# Fitted on all rows (uXPV), then applied to the labelled rows (XPV).
XPV_ = sklearn.decomposition.PCA().fit(uXPV).transform(XPV)
PVcols_ = ['PV_PCA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_PCA_0', 'PV_PCA_1', 'PV_PCA_2', 'PV_PCA_3', 'PV_PCA_4', 'PV_PCA_5', 'PV_PCA_6', 'PV_PCA_7', 'PV_PCA_8', 'PV_PCA_9', 'PV_PCA_10', 'PV_PCA_11', 'PV_PCA_12', 'PV_PCA_13', 'PV_PCA_14', 'PV_PCA_15', 'PV_PCA_16', 'PV_PCA_17', 'PV_PCA_18', 'PV_PCA_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2515854.65 with PV_PCA_0
	Scored 2432691.76 with PV_PCA_1
	Scored 2514604.63 with PV_PCA_2
	Scored 2504366.18 with PV_PCA_3
	Scored 2504635.01 with PV_PCA_4
	Scored 2504714.07 with PV_PCA_5
	Scored 2520321.90 with PV_PCA_6
	Scored 2515359.06 with PV_PCA_7
	Scored 2496477.89 with PV_PCA_8
	Scored 2534148.44 with PV_PCA_9
	Scored 2489775.16 with PV_PCA_10
	Scored 2522229.07 with PV_PCA_11
	Scored 2530040.82 with PV_PCA_12
	Scored 2502651.92 with PV_PCA_13
	Scored 2537806.52 with PV_PCA_14
	Scored 2539661.06 with PV_PCA_15
	Scored 2498294.92 with PV_PCA_16
	Scored 2514376.42 with PV_PCA_17
	Scored 2527347.67 with PV_PCA_18
	Scored 2516217.60 with PV_PCA_19
	

In [58]:
# PCA of the always-valid P columns, truncated to 11 components, no whitening.
# Fitted on the labelled rows only.
XPV_ = sklearn.decomposition.PCA(n_components=11).fit_transform(XPV)
PVcols_ = ['PV_PCA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_PCA_0', 'PV_PCA_1', 'PV_PCA_2', 'PV_PCA_3', 'PV_PCA_4', 'PV_PCA_5', 'PV_PCA_6', 'PV_PCA_7', 'PV_PCA_8', 'PV_PCA_9', 'PV_PCA_10', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2516380.11 with PV_PCA_0
	Scored 2426867.77 with PV_PCA_1
	Scored 2517777.00 with PV_PCA_2
	Scored 2500838.17 with PV_PCA_3
	Scored 2509691.74 with PV_PCA_4
	Scored 2524592.50 with PV_PCA_5
	Scored 2507348.25 with PV_PCA_6
	Scored 2528131.73 with PV_PCA_7
	Scored 2497650.08 with PV_PCA_8
	Scored 2515095.22 with PV_PCA_9
	Scored 2491133.00 with PV_PCA_10
	Scored 2383281.78 with Age
	Scored 2421061.88 with isBig
	Scored 2406686.31 with isIstanbul
	Scored 2524844.04 with isAnkara
	Scored 2531819.78 with isIzmir
	Scored 2509921.33 with isIL
	Scored 2524877.55 with missingSource
Adding feature Age (scored 2383281.7835882343)
	Scored 2388303.47 with PV_PCA_0, Age
	Scored 2350895.24 with PV_PCA_1, Age
	Scored 2391208.97 with PV_PCA_2, Age
	Scored 2378190.69 with PV_PCA_3, Age


In [56]:
# PCA (all components) of the always-valid P columns, WITH whitening.
# Fitted on the labelled rows only.
XPV_ = sklearn.decomposition.PCA(whiten=True).fit_transform(XPV)
PVcols_ = ['PV_PCA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_PCA_0', 'PV_PCA_1', 'PV_PCA_2', 'PV_PCA_3', 'PV_PCA_4', 'PV_PCA_5', 'PV_PCA_6', 'PV_PCA_7', 'PV_PCA_8', 'PV_PCA_9', 'PV_PCA_10', 'PV_PCA_11', 'PV_PCA_12', 'PV_PCA_13', 'PV_PCA_14', 'PV_PCA_15', 'PV_PCA_16', 'PV_PCA_17', 'PV_PCA_18', 'PV_PCA_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2516380.11 with PV_PCA_0
	Scored 2426867.77 with PV_PCA_1
	Scored 2517777.00 with PV_PCA_2
	Scored 2500838.17 with PV_PCA_3
	Scored 2509691.74 with PV_PCA_4
	Scored 2524592.50 with PV_PCA_5
	Scored 2507348.25 with PV_PCA_6
	Scored 2528131.73 with PV_PCA_7
	Scored 2497650.08 with PV_PCA_8
	Scored 2515095.22 with PV_PCA_9
	Scored 2491133.00 with PV_PCA_10
	Scored 2547004.53 with PV_PCA_11
	Scored 2516129.93 with PV_PCA_12
	Scored 2518222.95 with PV_PCA_13
	Scored 2521307.95 with PV_PCA_14
	Scored 2507892.55 with PV_PCA_15
	Scored 2531250.49 with PV_PCA_16
	Scored 2519919.81 with PV_PCA_17
	Scored 2525499.82 with PV_PCA_18
	Scored 2520299.41 with PV_PCA_19
	

In [64]:
# Test ICA: fit FastICA on the labelled rows and print the transformed matrix as a quick check.
# The seed is fixed so the decomposition is repeatable.
XPV_ = sklearn.decomposition.FastICA(random_state=888, max_iter=1000, tol=0.000001).fit_transform(XPV)
print(XPV_)

[[-0.03771645  0.09976607  0.07139857 ...,  0.00619079  0.01141806
   0.07637586]
 [ 0.00689887 -0.03413443  0.00594177 ..., -0.02749194  0.00466889
   0.01985834]
 [ 0.05327798 -0.18294115  0.1219419  ...,  0.12606563  0.26112609
  -0.01409697]
 ..., 
 [ 0.01462913 -0.04759788 -0.17676227 ..., -0.07708416  0.12192099
   0.03059742]
 [ 0.00521671 -0.0708876   0.02577469 ...,  0.0261287   0.04975384 -0.21426   ]
 [-0.01463396  0.25393438  0.00088567 ...,  0.00470247  0.00490993
  -0.07455026]]


In [65]:
# FastICA (default algorithm) of the always-valid P columns.
# Fitted on the labelled rows only.
XPV_ = sklearn.decomposition.FastICA(random_state=888, max_iter=1000, tol=0.000001).fit_transform(XPV)
PVcols_ = ['PV_ICA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_ICA_0', 'PV_ICA_1', 'PV_ICA_2', 'PV_ICA_3', 'PV_ICA_4', 'PV_ICA_5', 'PV_ICA_6', 'PV_ICA_7', 'PV_ICA_8', 'PV_ICA_9', 'PV_ICA_10', 'PV_ICA_11', 'PV_ICA_12', 'PV_ICA_13', 'PV_ICA_14', 'PV_ICA_15', 'PV_ICA_16', 'PV_ICA_17', 'PV_ICA_18', 'PV_ICA_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2513859.80 with PV_ICA_0
	Scored 2523132.15 with PV_ICA_1
	Scored 2523727.81 with PV_ICA_2
	Scored 2513457.83 with PV_ICA_3
	Scored 2529469.77 with PV_ICA_4
	Scored 2495683.14 with PV_ICA_5
	Scored 2552623.37 with PV_ICA_6
	Scored 2524405.21 with PV_ICA_7
	Scored 2513611.08 with PV_ICA_8
	Scored 2498478.53 with PV_ICA_9
	Scored 2499535.77 with PV_ICA_10
	Scored 2490760.11 with PV_ICA_11
	Scored 2517164.37 with PV_ICA_12
	Scored 2524223.07 with PV_ICA_13
	Scored 2484265.94 with PV_ICA_14
	Scored 2496325.53 with PV_ICA_15
	Scored 2492669.85 with PV_ICA_16
	Scored 2525021.39 with PV_ICA_17
	Scored 2523580.43 with PV_ICA_18
	Scored 2504027.15 with PV_ICA_19
	

In [91]:
# FastICA of the always-valid P columns.
# Fitted on all rows (uXPV), then applied to the labelled rows (XPV).
# Note: this cell uses a different seed (889) and a larger max_iter than the other ICA cells.
XPV_ = sklearn.decomposition.FastICA(random_state=889, max_iter=10000, tol=0.000001).fit(uXPV).transform(XPV)
PVcols_ = ['PV_ICA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_ICA_0', 'PV_ICA_1', 'PV_ICA_2', 'PV_ICA_3', 'PV_ICA_4', 'PV_ICA_5', 'PV_ICA_6', 'PV_ICA_7', 'PV_ICA_8', 'PV_ICA_9', 'PV_ICA_10', 'PV_ICA_11', 'PV_ICA_12', 'PV_ICA_13', 'PV_ICA_14', 'PV_ICA_15', 'PV_ICA_16', 'PV_ICA_17', 'PV_ICA_18', 'PV_ICA_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2491837.74 with PV_ICA_0
	Scored 2535955.29 with PV_ICA_1
	Scored 2526304.08 with PV_ICA_2
	Scored 2530963.57 with PV_ICA_3
	Scored 2519724.21 with PV_ICA_4
	Scored 2513804.13 with PV_ICA_5
	Scored 2509584.28 with PV_ICA_6
	Scored 2477028.65 with PV_ICA_7
	Scored 2523296.00 with PV_ICA_8
	Scored 2451557.00 with PV_ICA_9
	Scored 2530396.14 with PV_ICA_10
	Scored 2503745.31 with PV_ICA_11
	Scored 2514098.68 with PV_ICA_12
	Scored 2529776.57 with PV_ICA_13
	Scored 2480412.85 with PV_ICA_14
	Scored 2509492.10 with PV_ICA_15
	Scored 2516991.61 with PV_ICA_16
	Scored 2521564.20 with PV_ICA_17
	Scored 2520423.09 with PV_ICA_18
	Scored 2495685.48 with PV_ICA_19
	

In [66]:
# FastICA of the always-valid P columns using the 'deflation' algorithm.
# Fitted on the labelled rows only.
XPV_ = sklearn.decomposition.FastICA(random_state=888, max_iter=1000, tol=0.000001, algorithm='deflation').fit_transform(XPV)
PVcols_ = ['PV_ICA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_ICA_0', 'PV_ICA_1', 'PV_ICA_2', 'PV_ICA_3', 'PV_ICA_4', 'PV_ICA_5', 'PV_ICA_6', 'PV_ICA_7', 'PV_ICA_8', 'PV_ICA_9', 'PV_ICA_10', 'PV_ICA_11', 'PV_ICA_12', 'PV_ICA_13', 'PV_ICA_14', 'PV_ICA_15', 'PV_ICA_16', 'PV_ICA_17', 'PV_ICA_18', 'PV_ICA_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2513857.04 with PV_ICA_0
	Scored 2486440.58 with PV_ICA_1
	Scored 2497043.72 with PV_ICA_2
	Scored 2500410.79 with PV_ICA_3
	Scored 2525276.30 with PV_ICA_4
	Scored 2558976.17 with PV_ICA_5
	Scored 2522572.66 with PV_ICA_6
	Scored 2477600.62 with PV_ICA_7
	Scored 2523155.05 with PV_ICA_8
	Scored 2479571.12 with PV_ICA_9
	Scored 2523274.37 with PV_ICA_10
	Scored 2485894.30 with PV_ICA_11
	Scored 2535829.83 with PV_ICA_12
	Scored 2513890.02 with PV_ICA_13
	Scored 2504042.38 with PV_ICA_14
	Scored 2522736.66 with PV_ICA_15
	Scored 2506554.77 with PV_ICA_16
	Scored 2513123.61 with PV_ICA_17
	Scored 2537381.20 with PV_ICA_18
	Scored 2525326.44 with PV_ICA_19
	

In [71]:
# Test NMF: the input is shifted by its global minimum so that no entry is negative
# before factorising; the transformed matrix is printed as a quick check.
XPV_ = sklearn.decomposition.NMF(random_state=888, tol=0.000001, max_iter=1000).fit_transform(XPV - np.min(XPV))
print(XPV_)

[[ 0.57568771  0.33860115  0.35263163 ...,  0.26454643  0.23430641
   0.35568096]
 [ 0.4486066   0.02531565 -0.         ...,  0.28192865  0.19241708
   0.23715191]
 [-0.          0.41058876  0.54406539 ...,  0.16026164  0.37787176
   0.03182253]
 ..., 
 [ 0.3138476   0.70163067  0.33452005 ...,  0.33706847  0.01719918
   0.09156618]
 [ 0.93905383  0.36452295  0.04077177 ...,  0.30412426  0.20868522
   0.28186803]
 [ 0.68934924  0.27296394 -0.         ...,  0.18592145  0.30929555
   0.52623606]]


In [93]:
# NMF of the always-valid P columns.
# Fitted on all rows (uXPV), then applied to the labelled rows (XPV).
# Both inputs are shifted by the global minimum of uXPV before factorising.
XPV_ = sklearn.decomposition.NMF(random_state=888, tol=0.000001, max_iter=1000).fit(uXPV - uXPV.min()).transform(XPV - uXPV.min())
PVcols_ = ['PV_NMF_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_NMF_0', 'PV_NMF_1', 'PV_NMF_2', 'PV_NMF_3', 'PV_NMF_4', 'PV_NMF_5', 'PV_NMF_6', 'PV_NMF_7', 'PV_NMF_8', 'PV_NMF_9', 'PV_NMF_10', 'PV_NMF_11', 'PV_NMF_12', 'PV_NMF_13', 'PV_NMF_14', 'PV_NMF_15', 'PV_NMF_16', 'PV_NMF_17', 'PV_NMF_18', 'PV_NMF_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2518445.68 with PV_NMF_0
	Scored 2494830.66 with PV_NMF_1
	Scored 2515661.54 with PV_NMF_2
	Scored 2495176.05 with PV_NMF_3
	Scored 2485745.84 with PV_NMF_4
	Scored 2497565.08 with PV_NMF_5
	Scored 2526566.55 with PV_NMF_6
	Scored 2477678.64 with PV_NMF_7
	Scored 2503158.14 with PV_NMF_8
	Scored 2453237.46 with PV_NMF_9
	Scored 2385725.88 with PV_NMF_10
	Scored 2522570.15 with PV_NMF_11
	Scored 2524665.81 with PV_NMF_12
	Scored 2532133.32 with PV_NMF_13
	Scored 2484879.31 with PV_NMF_14
	Scored 2504678.90 with PV_NMF_15
	Scored 2508540.87 with PV_NMF_16
	Scored 2503015.06 with PV_NMF_17
	Scored 2461689.84 with PV_NMF_18
	Scored 2521339.02 with PV_NMF_19
	

In [25]:
# NMF on all-samples valid-columns. Fit to unlabelled.
# WITHOUT BOXCOX

u = unlabelled_data.as_matrix(PVcols).astype(np.float)
d = data.as_matrix(PVcols).astype(np.float)

XPV_ = sklearn.decomposition.NMF(random_state=888, tol=0.000001, max_iter=1000).fit(u - np.min(u)).transform(d - np.min(u))
PVcols_ = ['PV_NMF_'+str(i) for i in range(XPV_.shape[1])]

X_ = np.concatenate((XPV_, XG, XO), axis=1)
cols_ = PVcols_ + Gcols + Ocols

print(cols_)

clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf, X_, y, col_names=cols_)

['PV_NMF_0', 'PV_NMF_1', 'PV_NMF_2', 'PV_NMF_3', 'PV_NMF_4', 'PV_NMF_5', 'PV_NMF_6', 'PV_NMF_7', 'PV_NMF_8', 'PV_NMF_9', 'PV_NMF_10', 'PV_NMF_11', 'PV_NMF_12', 'PV_NMF_13', 'PV_NMF_14', 'PV_NMF_15', 'PV_NMF_16', 'PV_NMF_17', 'PV_NMF_18', 'PV_NMF_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2514506.37 with PV_NMF_0
	Scored 2520224.62 with PV_NMF_1
	Scored 2509838.59 with PV_NMF_2
	Scored 2514170.07 with PV_NMF_3
	Scored 2510917.94 with PV_NMF_4
	Scored 2520684.02 with PV_NMF_5
	Scored 2519986.97 with PV_NMF_6
	Scored 2509932.69 with PV_NMF_7
	Scored 2517668.80 with PV_NMF_8
	Scored 2514304.72 with PV_NMF_9
	Scored 2404801.38 with PV_NMF_10
	Scored 2460787.22 with PV_NMF_11
	Scored 2515165.72 with PV_NMF_12
	Scored 2500168.66 with PV_NMF_13
	Scored 2509122.55 with PV_NMF_14
	Scored 2499333.54 with PV_NMF_15
	Scored 2494203.46 with PV_NMF_16
	Scored 2431777.15 with PV_NMF_17
	Scored 2500109.65 with PV_NMF_18
	Scored 2515283.69 with PV_NMF_19
	



In [76]:
# Test factor analysis: fit on the labelled rows and print the transformed matrix as a quick check.
# The seed is fixed so the result is repeatable.
XPV_ = sklearn.decomposition.FactorAnalysis(random_state=888, tol=0.000001).fit_transform(XPV)
print(XPV_)

[[ 0.04193591 -0.11373953 -0.00969933 ...,  0.          0.          0.        ]
 [-0.10929131  0.28354121 -1.13489388 ...,  0.          0.          0.        ]
 [-0.69732988  1.14220017  0.72224463 ...,  0.          0.          0.        ]
 ..., 
 [-0.89479306 -0.3184298   1.5353142  ...,  0.          0.          0.        ]
 [-0.11870056 -2.06934791 -0.34258743 ...,  0.          0.          0.        ]
 [-0.39261593 -0.02316254 -0.06038742 ...,  0.          0.          0.        ]]


In [77]:
# Factor Analysis of the always-valid P columns.
# Fitted on the labelled rows only.
XPV_ = sklearn.decomposition.FactorAnalysis(random_state=888, tol=0.000001).fit_transform(XPV)
PVcols_ = ['PV_FA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_FA_0', 'PV_FA_1', 'PV_FA_2', 'PV_FA_3', 'PV_FA_4', 'PV_FA_5', 'PV_FA_6', 'PV_FA_7', 'PV_FA_8', 'PV_FA_9', 'PV_FA_10', 'PV_FA_11', 'PV_FA_12', 'PV_FA_13', 'PV_FA_14', 'PV_FA_15', 'PV_FA_16', 'PV_FA_17', 'PV_FA_18', 'PV_FA_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2517425.54 with PV_FA_0
	Scored 2426287.78 with PV_FA_1
	Scored 2514142.91 with PV_FA_2
	Scored 2513037.11 with PV_FA_3
	Scored 2500729.16 with PV_FA_4
	Scored 2525950.96 with PV_FA_5
	Scored 2524077.61 with PV_FA_6
	Scored 2511685.11 with PV_FA_7
	Scored 2511685.11 with PV_FA_8
	Scored 2511685.11 with PV_FA_9
	Scored 2511685.11 with PV_FA_10
	Scored 2511685.11 with PV_FA_11
	Scored 2511685.11 with PV_FA_12
	Scored 2511685.11 with PV_FA_13
	Scored 2511685.11 with PV_FA_14
	Scored 2511685.11 with PV_FA_15
	Scored 2511685.11 with PV_FA_16
	Scored 2511685.11 with PV_FA_17
	Scored 2511685.11 with PV_FA_18
	Scored 2511685.11 with PV_FA_19
	Scored 2383281.78 with Age
	Scored 24210

Lies! The score saturated at `2277587.2551330333` once the selected feature set reached:

```PV_FA_1, Age, isIstanbul, isIzmir, isIL```

In [97]:
# Factor Analysis of the always-valid P columns.
# Fitted on all rows (uXPV), then applied to the labelled rows (XPV).
XPV_ = sklearn.decomposition.FactorAnalysis(random_state=888, tol=0.000001).fit(uXPV).transform(XPV)
PVcols_ = ['PV_FA_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: reduced P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_FA_0', 'PV_FA_1', 'PV_FA_2', 'PV_FA_3', 'PV_FA_4', 'PV_FA_5', 'PV_FA_6', 'PV_FA_7', 'PV_FA_8', 'PV_FA_9', 'PV_FA_10', 'PV_FA_11', 'PV_FA_12', 'PV_FA_13', 'PV_FA_14', 'PV_FA_15', 'PV_FA_16', 'PV_FA_17', 'PV_FA_18', 'PV_FA_19', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2517112.57 with PV_FA_0
	Scored 2429884.53 with PV_FA_1
	Scored 2503046.32 with PV_FA_2
	Scored 2522935.31 with PV_FA_3
	Scored 2492136.32 with PV_FA_4
	Scored 2531547.48 with PV_FA_5
	Scored 2504655.25 with PV_FA_6
	Scored 2511685.11 with PV_FA_7
	Scored 2511685.11 with PV_FA_8
	Scored 2511685.11 with PV_FA_9
	Scored 2511685.11 with PV_FA_10
	Scored 2511685.11 with PV_FA_11
	Scored 2511685.11 with PV_FA_12
	Scored 2511685.11 with PV_FA_13
	Scored 2511685.11 with PV_FA_14
	Scored 2511685.11 with PV_FA_15
	Scored 2511685.11 with PV_FA_16
	Scored 2511685.11 with PV_FA_17
	Scored 2511685.11 with PV_FA_18
	Scored 2511685.11 with PV_FA_19
	Scored 2383281.78 with Age
	Scored 24210

Lies! The score saturated at `2277545.3774454724` once the selected feature set reached:

```PV_FA_1, PV_FA_6, Age, isIstanbul, isIzmir, isIL```

### Manifold reduction

```
2252669 BoxCox, no dim reduction
2229214 PCA
2214917 ICA
2206676 NMF
2277545 FA
2270933 Isomap (5)
2284557 LLE (4)
2283485 LLE (5)
2234634 LLE (6)
2250879 LLE (7)
2248050 LLE (8)
2276658 LLE (9)
2180671 LLE (10)
2256087 LLE (11)
2280136 LLE (12)
2267316 LLE (15)
2288777 LLE (17)
2238674 LLE (20)
2239926 LLE (30)
2267471 LLE (40)
2228594 LLE (50)
2279204 LLE (100)
2289578 modLLE (12)
2289578 modLLE (15 & 15)
2271523 LTSA (15 & 15)
2263116 Hessian (8 & x)
2266886 MDS
2280313 SE
```

In [99]:
# Isomap (15 components) of the always-valid P columns.
# Fitted on all rows (uXPV), then applied to the labelled rows (XPV).
XPV_ = sklearn.manifold.Isomap(n_components=15).fit(uXPV).transform(XPV)
PVcols_ = ['PV_Iso_%d' % k for k in range(XPV_.shape[1])]

# Design matrix: embedded P columns, then Age, then the boolean flags
X_ = np.hstack([XPV_, XG, XO])
cols_ = PVcols_ + Gcols + Ocols
print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf=clf, X=X_, y=y, col_names=cols_)

['PV_Iso_0', 'PV_Iso_1', 'PV_Iso_2', 'PV_Iso_3', 'PV_Iso_4', 'PV_Iso_5', 'PV_Iso_6', 'PV_Iso_7', 'PV_Iso_8', 'PV_Iso_9', 'PV_Iso_10', 'PV_Iso_11', 'PV_Iso_12', 'PV_Iso_13', 'PV_Iso_14', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2451364.00 with PV_Iso_0
	Scored 2528784.11 with PV_Iso_1
	Scored 2486702.84 with PV_Iso_2
	Scored 2530463.53 with PV_Iso_3
	Scored 2490639.54 with PV_Iso_4
	Scored 2524713.84 with PV_Iso_5
	Scored 2501117.59 with PV_Iso_6
	Scored 2547334.72 with PV_Iso_7
	Scored 2517910.12 with PV_Iso_8
	Scored 2515056.15 with PV_Iso_9
	Scored 2533300.60 with PV_Iso_10
	Scored 2511687.47 with PV_Iso_11
	Scored 2528542.20 with PV_Iso_12
	Scored 2527639.85 with PV_Iso_13
	Scored 2520614.00 with PV_Iso_14
	Scored 2383281.78 with Age
	Scored 2421061.88 with isBig
	Scored 2406686.31 with isIstanbul
	Scored 2524844.04 with isAnkara
	Scored 2531819.78 with isIzmir
	Scored 2509921.33 with isIL
	Scored 2524877.55 with missingSource
Adding feat

In [71]:
# Hessian LocallyLinearEmbedding on all-samples valid-columns.
# Fitted on all rows (uXPV), then applied to the labelled rows (XPV).

n_components = 8
# The Hessian method needs n_neighbors > n_components * (n_components + 3) / 2.
# n * (n + 3) is always even, so the integer division is exact and n_neighbors stays an
# int. The previous np.ceil(...) produced a float, which is not a valid neighbour count.
n_neighbors = n_components * (n_components + 3) // 2 + 1
XPV_ = sklearn.manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors, method='hessian', n_components=n_components, random_state=888).fit(uXPV).transform(XPV)
PVcols_ = ['PV_LLE_'+str(i) for i in range(XPV_.shape[1])]

# Design matrix: embedded P columns, then Age, then the boolean flags
X_ = np.concatenate((XPV_, XG, XO), axis=1)
cols_ = PVcols_ + Gcols + Ocols

print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf, X_, y, col_names=cols_)

  return_distance=return_distance)
  return_distance=return_distance)
  return_distance=return_distance)
  Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float)
  return_distance=return_distance)


['PV_LLE_0', 'PV_LLE_1', 'PV_LLE_2', 'PV_LLE_3', 'PV_LLE_4', 'PV_LLE_5', 'PV_LLE_6', 'PV_LLE_7', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2518430.84 with PV_LLE_0
	Scored 2435163.74 with PV_LLE_1
	Scored 2492186.46 with PV_LLE_2
	Scored 2518494.50 with PV_LLE_3
	Scored 2520639.19 with PV_LLE_4
	Scored 2515481.31 with PV_LLE_5
	Scored 2512871.75 with PV_LLE_6
	Scored 2517983.06 with PV_LLE_7
	Scored 2383281.78 with Age
	Scored 2421061.88 with isBig
	Scored 2406686.31 with isIstanbul
	Scored 2524844.04 with isAnkara
	Scored 2531819.78 with isIzmir
	Scored 2509921.33 with isIL
	Scored 2524877.55 with missingSource
Adding feature Age (scored 2383281.7835882343)
	Scored 2390890.22 with PV_LLE_0, Age
	Scored 2356449.30 with PV_LLE_1, Age
	Scored 2388695.28 with PV_LLE_2, Age
	Scored 2381396.53 with PV_LLE_3, Age
	Scored 2394315.08 with PV_LLE_4, Age
	Scored 2393374.73 with PV_LLE_5, Age
	Scored 2384584.88 with PV_LLE_6, Age
	Scored 2399166.69 with

  return_distance=return_distance)
  return_distance=return_distance)


In [72]:
# MDS on all-samples valid-columns.
# Every row is embedded with fit_transform and the labelled rows are then selected.

# Set explicitly: this used to be read from `XPV_.shape[1]`, i.e. from whatever array the
# previously executed cell happened to leave behind. 8 matches the recorded (166, 8) output.
n_components = 8
uXPV_ = sklearn.manifold.MDS(n_components=n_components, random_state=888).fit_transform(uXPV)
XPV_ = uXPV_[index_is_labelled,:]

print(XPV_.shape)

PVcols_ = ['PV_MDS_'+str(i) for i in range(XPV_.shape[1])]

# Design matrix: embedded P columns, then Age, then the boolean flags
X_ = np.concatenate((XPV_, XG, XO), axis=1)
cols_ = PVcols_ + Gcols + Ocols

print(cols_)

# Greedy forward feature selection, scored with ordinary least squares
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf, X_, y, col_names=cols_)

(166, 8)
['PV_LLE_0', 'PV_LLE_1', 'PV_LLE_2', 'PV_LLE_3', 'PV_LLE_4', 'PV_LLE_5', 'PV_LLE_6', 'PV_LLE_7', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2488050.45 with PV_LLE_0
	Scored 2503301.54 with PV_LLE_1
	Scored 2469528.02 with PV_LLE_2
	Scored 2521824.88 with PV_LLE_3
	Scored 2547669.67 with PV_LLE_4
	Scored 2517005.25 with PV_LLE_5
	Scored 2514124.87 with PV_LLE_6
	Scored 2501607.46 with PV_LLE_7
	Scored 2383281.78 with Age
	Scored 2421061.88 with isBig
	Scored 2406686.31 with isIstanbul
	Scored 2524844.04 with isAnkara
	Scored 2531819.78 with isIzmir
	Scored 2509921.33 with isIL
	Scored 2524877.55 with missingSource
Adding feature Age (scored 2383281.7835882343)
	Scored 2361901.70 with PV_LLE_0, Age
	Scored 2386836.50 with PV_LLE_1, Age
	Scored 2375911.22 with PV_LLE_2, Age
	Scored 2392787.77 with PV_LLE_3, Age
	Scored 2416768.53 with PV_LLE_4, Age
	Scored 2398615.56 with PV_LLE_5, Age
	Scored 2400611.74 with PV_LLE_6, Age
	Scored 237987

In [73]:
# Spectral Embedding on all-samples valid-columns.
# Unlike the MDS cell, the embedding is computed directly on XPV.

# NOTE: XPV_ is read here before this cell reassigns it, so the number of
# embedding dimensions comes from whatever an earlier cell left in XPV_.
n_components = XPV_.shape[1]
XPV_ = sklearn.manifold.SpectralEmbedding(
    n_components=n_components,
    random_state=888,
).fit_transform(XPV)

print(XPV_.shape)

PVcols_ = ['PV_SE_{}'.format(i) for i in range(XPV_.shape[1])]

# Column-wise stack: embedded columns first, then the XG and XO blocks.
X_ = np.hstack((XPV_, XG, XO))
cols_ = PVcols_ + Gcols + Ocols

print(cols_)

# Run rfa with a plain linear regression over the combined feature matrix.
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf, X_, y, col_names=cols_)

(166, 8)
['PV_SE_0', 'PV_SE_1', 'PV_SE_2', 'PV_SE_3', 'PV_SE_4', 'PV_SE_5', 'PV_SE_6', 'PV_SE_7', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 2517940.30 with PV_SE_0
	Scored 2461271.86 with PV_SE_1
	Scored 2529134.28 with PV_SE_2
	Scored 2493422.58 with PV_SE_3
	Scored 2521369.81 with PV_SE_4
	Scored 2511632.22 with PV_SE_5
	Scored 2512891.08 with PV_SE_6
	Scored 2538803.69 with PV_SE_7
	Scored 2383281.78 with Age
	Scored 2421061.88 with isBig
	Scored 2406686.31 with isIstanbul
	Scored 2524844.04 with isAnkara
	Scored 2531819.78 with isIzmir
	Scored 2509921.33 with isIL
	Scored 2524877.55 with missingSource
Adding feature Age (scored 2383281.7835882343)
	Scored 2390531.22 with PV_SE_0, Age
	Scored 2368623.60 with PV_SE_1, Age
	Scored 2398226.50 with PV_SE_2, Age
	Scored 2377296.58 with PV_SE_3, Age
	Scored 2393014.55 with PV_SE_4, Age
	Scored 2386632.63 with PV_SE_5, Age
	Scored 2397233.47 with PV_SE_6, Age
	Scored 2405772.63 with PV_SE_7, Age


## Some samples, all columns

In [82]:
# NMF on the data_nomissing rows of XPM. Fit to unlabelled (uXPM).

u = uXPM
d = XPM[data_nomissing, :]
# Both the fit data and the transformed data are shifted by the minimum of
# the fit data (u) before being handed to NMF.
XPM_ = sklearn.decomposition.NMF(
    random_state=888,
    tol=0.000001,
    max_iter=1000,
).fit(u - np.min(u)).transform(d - np.min(u))
PMcols_ = ['PM_NMF_{}'.format(i) for i in range(XPM_.shape[1])]

# XG, XO and y are restricted to the same data_nomissing rows so they line
# up with the NMF features.
X_ = np.hstack((XPM_, XG[data_nomissing, :], XO[data_nomissing, :]))
cols_ = PMcols_ + Gcols + Ocols

print(cols_)

# Run rfa with a plain linear regression over the combined feature matrix.
clf = sklearn.linear_model.LinearRegression()
support, ranking = rfa(clf, X_, y[data_nomissing], col_names=cols_)

['PV_NMF_0', 'PV_NMF_1', 'PV_NMF_2', 'PV_NMF_3', 'PV_NMF_4', 'PV_NMF_5', 'PV_NMF_6', 'PV_NMF_7', 'PV_NMF_8', 'PV_NMF_9', 'PV_NMF_10', 'PV_NMF_11', 'PV_NMF_12', 'PV_NMF_13', 'PV_NMF_14', 'PV_NMF_15', 'PV_NMF_16', 'Age', 'isBig', 'isIstanbul', 'isAnkara', 'isIzmir', 'isIL', 'missingSource']
	Scored 1875025.89 with PV_NMF_0
	Scored 1900917.36 with PV_NMF_1
	Scored 1921614.97 with PV_NMF_2
	Scored 1761950.20 with PV_NMF_3
	Scored 1903028.42 with PV_NMF_4
	Scored 1913157.78 with PV_NMF_5
	Scored 1900058.82 with PV_NMF_6
	Scored 1887850.66 with PV_NMF_7
	Scored 1893762.37 with PV_NMF_8
	Scored 1847908.21 with PV_NMF_9
	Scored 1880974.87 with PV_NMF_10
	Scored 1906870.19 with PV_NMF_11
	Scored 1904857.86 with PV_NMF_12
	Scored 1924445.47 with PV_NMF_13
	Scored 1880788.89 with PV_NMF_14
	Scored 1913664.69 with PV_NMF_15
	Scored 1888259.35 with PV_NMF_16
	Scored 1892850.75 with Age
	Scored 1902543.06 with isBig
	Scored 1886346.47 with isIstanbul
	Scored 1802409.61 with isAnkara
	Scored 1925163.

In [85]:
# Show the rows of XO selected by data_nomissing.
XO[data_nomissing]

array([[ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  1.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  1.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  1.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  1.,  0.,  1.,  0.],
       [ 1.,  0.,  1.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  0