In [1]:
import pandas as pd
import numpy as np
import random
from itertools import combinations

import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.cluster import FeatureAgglomeration

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet, Ridge

from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
import pickle

# Seed Python's built-in `random` module so any draw made through it is
# repeatable between runs. This does not touch NumPy's or scikit-learn's
# generators; the estimators further down pass their own `random_state`.
SEED = 1729
random.seed(SEED)




In [7]:
# Read the train and test tables from CSV files one directory above the
# notebook; both are parsed with pandas' default settings.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv')
)

In [10]:
# Keep only the training rows whose target `y` is strictly below 170.
# `.copy()` makes `train` an independent frame instead of a slice of the
# loaded data: later cells drop a column and overwrite columns on it, and
# doing that on a slice is what triggers pandas' SettingWithCopyWarning.
train = train.loc[train['y'] < 170].copy()

In [12]:
# Separate the target column from the feature matrix.
y_train = train['y']
# Rebind `train` to a new frame without `y` instead of dropping in place:
# `train` was produced by a row filter, and in-place mutation of such a
# slice can raise SettingWithCopyWarning. After this line `train` holds
# features only, matching the columns of `test`.
train = train.drop('y', axis=1)

In [13]:
# Integer-encode every object-dtype column. Each encoder is fitted on the
# train and test values together, so a label that appears in only one of the
# two tables still receives a code and `transform` cannot fail on it.
object_columns = [col for col in train.columns if train[col].dtype == 'object']

for col in object_columns:
    train_values = list(train[col].values)
    test_values = list(test[col].values)

    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)

    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

In [14]:
# Every reducer below is asked for this many output components.
n_comp = 10


def _fit_train_apply_test(reducer):
    """Fit `reducer` on `train`, then project `train` and `test` with it.

    Returns a `(train_projection, test_projection)` pair. `test` is only
    transformed with the already-fitted reducer; it is never fitted on.
    """
    projected_train = reducer.fit_transform(train)
    projected_test = reducer.transform(test)
    return projected_train, projected_test


# ICA: independent component analysis
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train, ica2_results_test = _fit_train_apply_test(ica)

# GRP: Gaussian random projection
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train, grp_results_test = _fit_train_apply_test(grp)

# SRP: sparse random projection (dense array output requested)
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train, srp_results_test = _fit_train_apply_test(srp)

# NMF: non-negative matrix factorisation
nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
nmf_results_train, nmf_results_test = _fit_train_apply_test(nmf)

# FAG: feature agglomeration with Ward linkage
fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
fag_results_train, fag_results_test = _fit_train_apply_test(fag)

In [15]:
def _components_frame(prefix, components):
    """Wrap the first `n_comp` columns of a 2-D array in a DataFrame.

    Columns are labelled `<prefix>1` ... `<prefix><n_comp>`, in order. The
    values are copied so the frame does not share memory with `components`.
    """
    headers = [prefix + str(k) for k in range(1, n_comp + 1)]
    return pd.DataFrame(components[:, :n_comp], columns=headers, copy=True)


# One train/test frame pair per reduction technique.
train_ica = _components_frame('ica_', ica2_results_train)
test_ica = _components_frame('ica_', ica2_results_test)

train_grp = _components_frame('grp_', grp_results_train)
test_grp = _components_frame('grp_', grp_results_test)

train_srp = _components_frame('srp_', srp_results_train)
test_srp = _components_frame('srp_', srp_results_test)

train_nmf = _components_frame('nmf_', nmf_results_train)
test_nmf = _components_frame('nmf_', nmf_results_test)

train_fag = _components_frame('fag_', fag_results_train)
test_fag = _components_frame('fag_', fag_results_test)

# Registry of (tag, train frame, test frame) triples, in a fixed order.
dim_reds = [
    ('ica', train_ica, test_ica),
    ('grp', train_grp, test_grp),
    ('srp', train_srp, test_srp),
    ('nmf', train_nmf, test_nmf),
    ('fag', train_fag, test_fag),
]

In [28]:
# Write every reduced feature set to ../features/ as
# train_<tag>.csv / test_<tag>.csv, without the index column.
#
# Fix: the train ICA frame used to be written to '../features/test_ica.csv',
# which the very next line overwrote with the test frame, so train_ica.csv
# was never produced. Deriving both paths from the tag keeps each pair
# consistent.
feature_sets = [
    ('ica', train_ica, test_ica),
    ('grp', train_grp, test_grp),
    ('srp', train_srp, test_srp),
    ('nmf', train_nmf, test_nmf),
    ('fag', train_fag, test_fag),
]

for tag, train_features, test_features in feature_sets:
    train_features.to_csv('../features/train_' + tag + '.csv', index=False)
    test_features.to_csv('../features/test_' + tag + '.csv', index=False)
