In [2]:
import os
import datetime as dt
import time
import json

import pandas as pd
import numpy as np
from scipy import stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import PCA, NMF, TruncatedSVD
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn import metrics

import lightgbm as lgb
from lightgbm import LGBMClassifier

from feature_selector import *

  from numpy.core.umath_tests import inner1d


In [3]:
# Notebook display settings: show up to 80 columns and render floats with
# thousands separators and two decimals.
pd.set_option('display.max_columns', 80)
pd.set_option('display.float_format', "{:,.2f}".format)


In [4]:
# Load the cleaned train/test tables; the Id column is read as text rather
# than being parsed as a number.
csv_dtypes = {'Id': str}
train, test = (
    pd.read_csv(csv_path, dtype=csv_dtypes)
    for csv_path in ('clean_data/train.csv', 'clean_data/test.csv')
)

train.shape, test.shape

((15120, 54), (565892, 55))

In [5]:
# Column-role metadata (id / categorical / integer / target column names)
# saved alongside the cleaned data.
with open('clean_data/train_cols.json', 'r') as cols_file:
    train_cols = json.load(cols_file)

In [6]:
# Unpack the column groups stored in the metadata file.
id_cols, cat_cols, int_cols, target_col = (
    train_cols[group] for group in ('id', 'cat_cols', 'int_cols', 'target_col')
)

# Model inputs: the `int_cols` group followed by the `cat_cols` group.
ftr_cols = int_cols + cat_cols

In [7]:
# Training labels, selected from `train` by the target column name(s) in the metadata.
ytrain = train[target_col]

## Agenda

    - Scale Data
    - KMeans and DBSCAN for Cluster Labels as features
    - Clustering On PCA/SVD/NMF
    - NaiveBayes Probabilities as Features
    - Run Through Feature Selector
    - Polynomial Features
    - Feature Selector Again

### Scale Data

In [None]:
# Learn per-column min/max from the training features only, then apply the
# same scaling to both train and test.
mm = MinMaxScaler().fit(train[ftr_cols])
xtrain_scaled = mm.transform(train[ftr_cols])
xtest_scaled = mm.transform(test[ftr_cols])
ytrain = train[target_col]

In [None]:
# Persist the scaled matrices with their original feature names as headers.
for scaled_matrix, out_path in (
    (xtrain_scaled, 'clean_data/xtrain_scaled.csv'),
    (xtest_scaled, 'clean_data/xtest_scaled.csv'),
):
    pd.DataFrame(scaled_matrix, columns=ftr_cols).to_csv(out_path, index=False)

### KMeans

In [None]:
# KMeans cluster ids for k = 3..14 are stored as label features K3..K14.
# Centroid initialisation is random, so the seed is pinned: without it every
# run produces different cluster ids and the CSVs written from these frames
# cannot be regenerated.
KMEANS_SEED = 1111  # same value as the `seed` assigned in the Feature Selector section

km_train_data = {}
km_test_data = {}
for k in range(3,15):
    print(k)
    km = KMeans(n_clusters=k, random_state=KMEANS_SEED)
    km_train = km.fit_predict(xtrain_scaled)  # clusters are learned on train only
    km_test = km.predict(xtest_scaled)        # test rows get the nearest learned centroid

    km_train_data['K'+str(k)] = km_train
    km_test_data['K'+str(k)] = km_test


# make DFS
km_train_df = pd.DataFrame(km_train_data)
km_test_df = pd.DataFrame(km_test_data)

In [None]:
# Write the KMeans label features to disk.
for label_frame, out_path in (
    (km_train_df, 'clean_data/km_train_df.csv'),
    (km_test_df, 'clean_data/km_test_df.csv'),
):
    label_frame.to_csv(out_path, index=False)

### DBSCAN

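DBSCAN has `fit_predict` but no `predict` method, so a fitted model cannot assign cluster labels to new (test) data.

TODO: write a `predict` helper for DBSCAN that labels new points by their distances to the cluster centers (per-cluster averages).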

In [None]:
# db_train_data = {}
# db_test_data = {}
# for eps in [0.1, 0.5, 1, 1.25]:
#     for ms in range(5,50,10):
#         print('EPS:',eps, 'MS:', ms )
#         db = DBSCAN(eps,ms, n_jobs=-1)
#         db_train = db.fit_predict(xtrain_scaled)
#         db_test = db.predict(xtest_scaled)
#         db_train_data['EPS'+str(eps)+'_MS'+str(ms)] = db_train
#         db_test_data['EPS'+str(eps)+'_MS'+str(ms)] = db_test
        
# # make DFS
# db_train_df = pd.DataFrame(db_train_data)
# db_test_df = pd.DataFrame(db_test_df)

In [None]:
# db_train_df.to_csv('clean_data/db_train_df.csv',index=False)
# db_test_df.to_csv('clean_data/db_test_df.csv',index=False)

### SVD  & PCA

In [None]:
# Project the scaled features onto 10 SVD components and 10 principal
# components. Both estimators can use a randomized solver, so the seed is
# pinned to make the saved component files reproducible.
N_COMPONENTS = 10
DECOMP_SEED = 1111  # same value as the `seed` assigned in the Feature Selector section

svd = TruncatedSVD(n_components=N_COMPONENTS, random_state=DECOMP_SEED)

svd_train = svd.fit_transform(xtrain_scaled)  # fit on train only
svd_test = svd.transform(xtest_scaled)

pca = PCA(n_components=N_COMPONENTS, random_state=DECOMP_SEED)

pca_train = pca.fit_transform(xtrain_scaled)  # fit on train only
pca_test = pca.transform(xtest_scaled)


# make dataframes
# Each frame's column names are sized from its own matrix, so the SVD and PCA
# component counts can be changed independently.
svd_cols = ['svd'+str(i) for i in range(svd_train.shape[1])]
pca_cols = ['pca'+str(i) for i in range(pca_train.shape[1])]

svd_train_df = pd.DataFrame(svd_train, columns=svd_cols)
svd_test_df = pd.DataFrame(svd_test, columns=svd_cols)

pca_train_df = pd.DataFrame(pca_train, columns=pca_cols)
pca_test_df = pd.DataFrame(pca_test, columns=pca_cols)

In [None]:
# Write the SVD and PCA component features to disk.
for component_frame, out_path in (
    (svd_train_df, 'clean_data/svd_train_df.csv'),
    (svd_test_df, 'clean_data/svd_test_df.csv'),
    (pca_train_df, 'clean_data/pca_train_df.csv'),
    (pca_test_df, 'clean_data/pca_test_df.csv'),
):
    component_frame.to_csv(out_path, index=False)

### Cluster On Transformed Matrices

In [None]:
def kmeans_label_frames(train_matrix, test_matrix, prefix, ks=range(3, 15), random_state=1111):
    """Build KMeans cluster-id features for several values of k.

    For every k in ``ks`` a KMeans model is fit on ``train_matrix`` only; its
    training labels and the labels predicted for ``test_matrix`` are stored
    under the column name ``prefix + str(k)``.

    Parameters
    ----------
    train_matrix, test_matrix : array-like with the same number of columns
    prefix : str
        Column-name prefix, e.g. ``'SVD_K'``.
    ks : iterable of int
        Cluster counts to try (default 3..14).
    random_state : int
        Seed for centroid initialisation, so that cluster ids are
        reproducible between runs (default matches the notebook's `seed`).

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
    """
    train_labels, test_labels = {}, {}
    for k in ks:
        print(k)
        km = KMeans(n_clusters=k, random_state=random_state)
        train_labels[prefix + str(k)] = km.fit_predict(train_matrix)
        test_labels[prefix + str(k)] = km.predict(test_matrix)
    return pd.DataFrame(train_labels), pd.DataFrame(test_labels)


# Make DFS: cluster the SVD projection and the PCA projection separately.
km_svd_train_df, km_svd_test_df = kmeans_label_frames(svd_train, svd_test, 'SVD_K')
km_pca_train_df, km_pca_test_df = kmeans_label_frames(pca_train, pca_test, 'PCA_K')



In [None]:
# Write the cluster labels computed on the SVD / PCA projections to disk.
for label_frame, out_path in (
    (km_svd_train_df, 'clean_data/km_svd_train_df.csv'),
    (km_svd_test_df, 'clean_data/km_svd_test_df.csv'),
    (km_pca_train_df, 'clean_data/km_pca_train_df.csv'),
    (km_pca_test_df, 'clean_data/km_pca_test_df.csv'),
):
    label_frame.to_csv(out_path, index=False)

In [None]:
# db_svd_train_data = {}
# db_svd_test_data = {}
# for eps in [0.1, 0.5, 1, 1.25]:
#     for ms in range(5,50,10):
#         print('EPS:',eps, 'MS:', ms )
#         db = DBSCAN(eps,ms, n_jobs=-1)
#         db_train = db.fit_predict(svd_train)
#         db_test = db.predict(svd_test)
        
#         db_svd_train_data['SVD_EPS'+str(eps)+'_MS'+str(ms)] = db_train
#         db_svd_test_data['SVD_EPS'+str(eps)+'_MS'+str(ms)] = db_test
        
# db_pca_train_data = {}
# db_pca_test_data = {}
# for eps in [0.1, 0.5, 1, 1.25]:
#     for ms in range(5,50,10):
#         print('EPS:',eps, 'MS:', ms )
#         db = DBSCAN(eps,ms, n_jobs=-1)
#         db_train = db.fit_predict(pca_train)
#         db_test = db.predict(pca_test)
        
#         db_pca_train_data['PCA_EPS'+str(eps)+'_MS'+str(ms)] = db_train
#         db_pca_test_data['PCA_EPS'+str(eps)+'_MS'+str(ms)] = db_test
        

# # Make DFS
# db_svd_train_df = pd.DataFrame(db_svd_train_data)
# db_svd_test_df = pd.DataFrame(db_svd_test_data)

# db_pca_train_df = pd.DataFrame(db_pca_train_data)
# db_pca_test_df = pd.DataFrame(db_pca_test_data)


In [None]:
# db_svd_train_df.to_csv('clean_data/db_svd_train_df.csv',index=False)
# db_svd_test_df.to_csv('clean_data/db_svd_test_df.csv',index=False)
# db_pca_train_df.to_csv('clean_data/db_pca_train_df.csv',index=False)
# db_pca_test_df.to_csv('clean_data/db_pca_test_df.csv',index=False)

### Naive Bayes

In [None]:
class NaiveBayesClf():
    """
    Blend of three scikit-learn Naive Bayes models, one per feature family.

    Columns are routed when ``fit`` is called:
      * exactly two distinct values        -> BernoulliNB
      * integer dtype (and not two-valued) -> MultinomialNB
      * everything else                    -> GaussianNB

    ``predict`` returns class *probabilities* (not labels): the unweighted
    mean of the probability matrices of every sub-model that received at
    least one column.
    """

    def __init__(self, bparams=None, mparams=None, gparams=None):
        """
        Parameters
        ----------
        bparams, mparams, gparams : dict or None
            Keyword arguments forwarded to BernoulliNB, MultinomialNB and
            GaussianNB respectively. ``None`` (the default) means no extra
            arguments; ``None`` is used instead of ``{}`` so that no mutable
            default is shared between instances.
        """
        self.bnb = BernoulliNB(**(bparams or {}))
        self.mnb = MultinomialNB(**(mparams or {}))
        self.gnb = GaussianNB(**(gparams or {}))

    def get_cols(self, data):
        """
        Decide which sub-model each column of ``data`` belongs to.

        Stores the positional column indices as plain lists in
        ``self.bern_locs``, ``self.mult_locs`` and ``self.gaus_locs``.

        Parameters
        ----------
        data : pandas.DataFrame
        """
        is_binary = np.asarray(data.nunique() == 2, dtype=bool)
        # is_integer_dtype matches every integer width (int8..int64, unsigned);
        # comparing dtypes to the builtin `int` only matches the platform
        # default width and would silently send e.g. int32 columns to GaussianNB.
        is_integer = np.array(
            [pd.api.types.is_integer_dtype(dtype) for dtype in data.dtypes], dtype=bool
        )
        is_count = is_integer & ~is_binary
        is_other = ~(is_binary | is_count)

        # Positions are taken from the boolean masks, so the three groups always
        # partition the columns, even when column names are duplicated.
        self.bern_locs = np.flatnonzero(is_binary).tolist()
        self.mult_locs = np.flatnonzero(is_count).tolist()
        self.gaus_locs = np.flatnonzero(is_other).tolist()

    def fit(self, data, target):
        """
        Fit each sub-model on its own group of columns.

        Parameters
        ----------
        data : pandas.DataFrame
            Training features; column dtypes and cardinalities drive the routing.
        target : array-like
            Class labels, one per row of ``data``.

        Returns
        -------
        self
        """
        self.get_cols(data)
        values = data.values
        if self.bern_locs:
            self.bnb.fit(values[:, self.bern_locs], target)
        if self.mult_locs:
            self.mnb.fit(values[:, self.mult_locs], target)
        if self.gaus_locs:
            self.gnb.fit(values[:, self.gaus_locs], target)
        return self

    def predict(self, new_data):
        """
        Average the class probabilities of the fitted sub-models.

        Parameters
        ----------
        new_data : pandas.DataFrame or 2-D array-like
            Rows to score, with columns in the same order as the frame passed
            to ``fit``. A DataFrame is accepted so the same object type works
            for both ``fit`` and ``predict``.

        Returns
        -------
        numpy.ndarray
            One row per input row and one column per class.

        Raises
        ------
        ValueError
            If no sub-model was assigned any column (nothing to average).
        """
        if isinstance(new_data, pd.DataFrame):
            values = new_data.values
        else:
            values = np.asarray(new_data)

        routed = (
            (self.bnb, self.bern_locs),
            (self.mnb, self.mult_locs),
            (self.gnb, self.gaus_locs),
        )
        all_probs = [model.predict_proba(values[:, locs]) for model, locs in routed if locs]
        if not all_probs:
            raise ValueError("NaiveBayesClf has no fitted sub-model: the training data had no columns")

        return np.mean(all_probs, axis=0)


In [None]:
# Wrap the scaled matrices in DataFrames so the original feature names travel with them.
xtrain_scaled_df, xtest_scaled_df = (
    pd.DataFrame(scaled_matrix, columns=ftr_cols)
    for scaled_matrix in (xtrain_scaled, xtest_scaled)
)
xtrain_scaled_df.head()

In [None]:
# Fit the Naive Bayes blend on the scaled training features and use its class
# probabilities as new features (one column per class: nb_prob_0, nb_prob_1, ...).
# NOTE(review): the training-set probabilities are in-sample predictions (the
# model scores the same rows it was fit on), unlike the test-set ones.
nbc = NaiveBayesClf()
nbc.fit(xtrain_scaled_df, train[target_col])
nb_train, nb_test = (
    nbc.predict(scaled_frame.values)
    for scaled_frame in (xtrain_scaled_df, xtest_scaled_df)
)

nb_train_df = pd.DataFrame(nb_train).add_prefix('nb_prob_')
nb_test_df = pd.DataFrame(nb_test).add_prefix('nb_prob_')

In [None]:
# Write the Naive Bayes probability features to disk.
for prob_frame, out_path in (
    (nb_train_df, 'clean_data/nb_train_df.csv'),
    (nb_test_df, 'clean_data/nb_test_df.csv'),
):
    prob_frame.to_csv(out_path, index=False)

In [None]:
# Training-set accuracy of the Naive Bayes blend (in-sample, so optimistic).
# scikit-learn orders probability columns by the sorted unique class labels,
# so map each argmax position back through that ordering instead of assuming
# the labels are exactly 1..K.
nb_train_labels = np.unique(ytrain.values)[nb_train.argmax(1)]
metrics.accuracy_score(ytrain.values, nb_train_labels)

### Combine All New Ftrs

In [None]:
import pandas as pd

In [None]:
# Reload every engineered feature block from disk.
# xtrain_scaled_df / xtest_scaled_df are not reloaded here; they are expected
# to still be in memory.
(km_train_df, svd_train_df, pca_train_df,
 km_svd_train_df, km_pca_train_df, nb_train_df) = map(pd.read_csv, (
    'clean_data/km_train_df.csv',
    'clean_data/svd_train_df.csv',
    'clean_data/pca_train_df.csv',
    'clean_data/km_svd_train_df.csv',
    'clean_data/km_pca_train_df.csv',
    'clean_data/nb_train_df.csv',
))

(km_test_df, svd_test_df, pca_test_df,
 km_svd_test_df, km_pca_test_df, nb_test_df) = map(pd.read_csv, (
    'clean_data/km_test_df.csv',
    'clean_data/svd_test_df.csv',
    'clean_data/pca_test_df.csv',
    'clean_data/km_svd_test_df.csv',
    'clean_data/km_pca_test_df.csv',
    'clean_data/nb_test_df.csv',
))

In [None]:
# Feature blocks to join side by side, listed in the same order for train and test.
# The DBSCAN-based blocks (db_*, db_svd_*, db_pca_*) are left out because the
# cells that would create them are commented out.
all_train_ftrs = [xtrain_scaled_df, km_train_df, svd_train_df, pca_train_df,
                  km_svd_train_df, km_pca_train_df, nb_train_df]
all_test_ftrs = [xtest_scaled_df, km_test_df, svd_test_df, pca_test_df,
                 km_svd_test_df, km_pca_test_df, nb_test_df]

# Column-wise concatenation: rows are matched on the frames' indexes.
all_train_df, all_test_df = (
    pd.concat(blocks, axis=1) for blocks in (all_train_ftrs, all_test_ftrs)
)

all_train_df.shape, all_test_df.shape

In [None]:
# Write the combined feature tables to disk.
for combined_frame, out_path in (
    (all_train_df, 'clean_data/all_train_df.csv'),
    (all_test_df, 'clean_data/all_test_df.csv'),
):
    combined_frame.to_csv(out_path, index=False)

### Feature Selector

In [None]:
# Reload the combined feature tables so this section can start from the saved files.
all_train_df, all_test_df = map(
    pd.read_csv,
    ('clean_data/all_train_df.csv', 'clean_data/all_test_df.csv'),
)

In [None]:
# N: number of features requested from the first selection pass (also used as
# the random-forest `max_features`); seed: random_state shared by the selector models.
N, seed = 35, 1111

In [None]:
# Both models get one estimator fewer than the number of candidate columns.
n_trees = all_train_df.shape[1] - 1
rf_params = dict(n_estimators=n_trees, max_features=N, n_jobs=-1, random_state=seed)
gb_params = dict(n_estimators=n_trees, random_state=seed, max_depth=10)

In [None]:
# First selection pass over all engineered features.
# `run_ftr_selection` comes from the wildcard `feature_selector` import; its
# result is later used via `.index` to pick columns, so it is presumably a
# pandas object indexed by feature name -- TODO confirm against feature_selector.
usecols = run_ftr_selection(all_train_df, ytrain, N, rf_params, gb_params)
usecols

### Polynomial Features

In [None]:
# Keep only the columns named in the index of the selection result.
x_subset = all_train_df[usecols.index]
x_subset.shape

In [None]:
# Expand the selected features into all polynomial terms up to degree 3
# (no constant column).
poly = PolynomialFeatures(3,include_bias=False)
# Fit on the bare array so the generated names stay in the generic
# 'x0', 'x0 x1', 'x0^2', ... form regardless of scikit-learn version.
poly_values = poly.fit_transform(x_subset.values)
# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on versions that predate it.
poly_names = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
train_poly = pd.DataFrame(poly_values, columns=list(poly_names()))

print(train_poly.shape)
train_poly.to_csv('clean_data/train_poly_all.csv',index=False)

In [8]:
# Reload the saved polynomial feature table and show its dimensions.
train_poly = pd.read_csv('clean_data/train_poly_all.csv')
train_poly.shape

(15120, 8435)

In [9]:
# Number of polynomial features requested from the second selection pass.
# (An earlier `N_final = train.shape[1]` assignment was overwritten on the next line and had no effect.)
N_final = 200

In [None]:
# Second selection pass, this time over the polynomial features.
# Relies on `rf_params` from the Feature Selector section (built from
# all_train_df, N and seed), so that cell must have been run first.
# NOTE(review): skip='rfe_gb' presumably disables the gradient-boosting step,
# which is why no gb_params are passed -- confirm against feature_selector.
new_use_cols = run_ftr_selection(train_poly, ytrain, N_final, rf_params, skip='rfe_gb')

In [None]:
# Keep the polynomial columns named in the index of the second selection result and save them.
train_poly_final = train_poly[new_use_cols.index]
train_poly_final.to_csv('clean_data/train_poly_final.csv',index=False)

### Base Test

In [10]:
# Baseline model: LightGBM classifier with all default hyper-parameters.
l = LGBMClassifier()

In [None]:
# Mean cross-validated accuracy of the baseline model on the original
# (unscaled, un-engineered) feature columns, for 3 through 9 folds.
cvs = [
    np.mean(cross_val_score(l, train[ftr_cols], ytrain, cv=n_folds, scoring='accuracy'))
    for n_folds in range(3, 10)
]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Accuracy as a function of the number of CV folds. Title and axis labels are
# set so the figure can be read on its own; the trailing ';' hides the repr.
fig, ax = plt.subplots()
ax.plot(range(3,10), cvs)
ax.set(
    title='Baseline LightGBM: accuracy vs. number of CV folds',
    xlabel='Number of CV folds',
    ylabel='Mean cross-validated accuracy',
);

In [None]:
# Fit the baseline model on the full engineered feature table and predict the test set.
# NOTE(review): the cross-validation above scored `train[ftr_cols]`, while this
# fit uses `all_train_df`, so those CV accuracies do not describe this model.
l.fit(all_train_df, ytrain)
preds = l.predict(all_test_df)

In [None]:
# Submission file: the test ids next to the predicted Cover_Type.
submission = pd.DataFrame({'Id': test[id_cols].values, 'Cover_Type': preds})
submission.to_csv('Submissions/base_lgbm.csv', index=False)