In [1]:
import os
import datetime as dt
import time
import json

import pandas as pd
import numpy as np
from scipy import stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import PCA, NMF, TruncatedSVD
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn import metrics

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [2]:
# Notebook-wide pandas display settings: show up to 80 columns and render
# floats with thousands separators and two decimal places.
pd.set_option('display.max_columns', 80)
pd.set_option('display.float_format', "{:,.2f}".format)


In [3]:
# Load the cleaned train and test splits. `Id` is read as a string so it is
# handled as an identifier rather than as a number.
train, test = (
    pd.read_csv(csv_path, dtype={'Id': str})
    for csv_path in ('clean_data/train.csv', 'clean_data/test.csv')
)

# Show (rows, columns) of both splits as the cell output.
(train.shape, test.shape)

((15120, 54), (565892, 55))

In [4]:
# Read the column-group metadata saved next to the cleaned data.
with open('clean_data/train_cols.json', 'r') as cols_file:
    train_cols = json.loads(cols_file.read())

In [5]:
# Unpack the column groups stored in the metadata JSON.
# (The stray `train_cols.keys()` expression was removed: it sat in the middle
# of the cell, so its value was never displayed.)
id_cols = train_cols['id']
cat_cols = train_cols['cat_cols']
int_cols = train_cols['int_cols']
target_col = train_cols['target_col']

# Model features are the integer columns followed by the categorical columns;
# the id and target columns are deliberately left out.
ftr_cols = int_cols + cat_cols

## Agenda

    - Scale Data
    - KMeans and DBSCAN for Cluster Labels as features
    - Clustering On PCA/SVD/NMF
    - NaiveBayes Probabilities as Features
    - Polynomial Features

### Scale Data

In [107]:
# Min-max scale the features. The scaler learns its per-column ranges from the
# training rows only and the same ranges are then applied to the test rows.
mm = MinMaxScaler().fit(train[ftr_cols])
xtrain_scaled = mm.transform(train[ftr_cols])
xtest_scaled = mm.transform(test[ftr_cols])

# Target values used by the supervised steps further down.
ytrain = train[target_col]

### KMeans

In [7]:
# KMeans cluster labels as features: one label column per k in 3..14.
# Each model is fit on the scaled training matrix; test rows are assigned to
# the nearest learned centroid via `predict`.
km_train_data = {}
km_test_data = {}
for k in range(3, 15):
    print(k)
    # Fixed seed so the cluster labels are reproducible across kernel restarts.
    km = KMeans(n_clusters=k, random_state=0)
    km_train = km.fit_predict(xtrain_scaled)
    km_test = km.predict(xtest_scaled)

    # `k` is an int, so it must be converted before building the column name.
    km_train_data['K' + str(k)] = km_train
    km_test_data['K' + str(k)] = km_test


# make DFS
km_train_df = pd.DataFrame(km_train_data)
# Build the test frame from the test dict (it previously referenced the
# not-yet-defined `km_test_df`).
km_test_df = pd.DataFrame(km_test_data)

3
4
5
6
7
8
9
10
11
12
13
14


In [None]:
# Persist the KMeans label features (row index is not written).
km_train_df.to_csv(path_or_buf='clean_data/km_train_df.csv', index=False)
km_test_df.to_csv(path_or_buf='clean_data/km_test_df.csv', index=False)

### DBSCAN

In [36]:
def dbscan_predict(db, new_points):
    """Assign cluster labels to unseen points using a fitted DBSCAN model.

    DBSCAN has no ``predict`` method, so this applies an out-of-sample
    heuristic: each new point takes the label of its nearest core sample when
    that core sample lies within ``db.eps``; otherwise it is labelled noise
    (-1). Distances use NearestNeighbors' default Euclidean metric, which
    matches DBSCAN's default metric as used in this cell.

    Kept local to this cell so the cell runs on its own.

    Parameters
    ----------
    db : fitted sklearn.cluster.DBSCAN
    new_points : array-like of shape (n_samples, n_features)

    Returns
    -------
    numpy.ndarray of shape (n_samples,) with cluster labels (-1 = noise).
    """
    new_points = np.asarray(new_points)
    labels = np.full(new_points.shape[0], -1, dtype=db.labels_.dtype)
    # No core samples means no clusters were found: everything stays noise.
    if len(db.core_sample_indices_) == 0:
        return labels
    nn = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(db.components_)
    dist, idx = nn.kneighbors(new_points)
    within_eps = dist[:, 0] <= db.eps
    # `components_` rows are ordered like `core_sample_indices_`.
    core_labels = db.labels_[db.core_sample_indices_]
    labels[within_eps] = core_labels[idx[within_eps, 0]]
    return labels


# DBSCAN cluster labels as features: one column per (eps, min_samples) pair.
db_train_data = {}
db_test_data = {}
for eps in [0.1, 0.5, 1, 1.25]:
    for ms in range(5, 50, 10):
        print('EPS:', eps, 'MS:', ms)
        # Keyword arguments: `min_samples` is keyword-only in current scikit-learn.
        db = DBSCAN(eps=eps, min_samples=ms, n_jobs=-1)
        db_train = db.fit_predict(xtrain_scaled)
        db_test = dbscan_predict(db, xtest_scaled)
        # eps / ms are numbers, so convert before building the column name.
        col_name = 'EPS' + str(eps) + '_MS' + str(ms)
        db_train_data[col_name] = db_train
        db_test_data[col_name] = db_test

# make DFS
db_train_df = pd.DataFrame(db_train_data)
# Build the test frame from the test dict (it previously referenced the
# not-yet-defined `db_test_df`).
db_test_df = pd.DataFrame(db_test_data)

EPS: 0.1
NCLASSES: 3
EPS: 0.30000000000000004
NCLASSES: 44
EPS: 0.5000000000000001
NCLASSES: 44
EPS: 0.7000000000000001
NCLASSES: 45
EPS: 0.9000000000000001
NCLASSES: 46


In [None]:
# Persist the DBSCAN label features (row index is not written).
db_train_df.to_csv(path_or_buf='clean_data/db_train_df.csv', index=False)
db_test_df.to_csv(path_or_buf='clean_data/db_test_df.csv', index=False)

### SVD & PCA

In [48]:
# Project the scaled features onto 10 components with both TruncatedSVD and
# PCA. Each decomposition is fit on the training matrix only and then applied
# to the test matrix. Seeds are fixed so the components are reproducible.
svd = TruncatedSVD(n_components=10, random_state=0)

svd_train = svd.fit_transform(xtrain_scaled)
svd_test = svd.transform(xtest_scaled)

pca = PCA(n_components=10, random_state=0)

pca_train = pca.fit_transform(xtrain_scaled)
pca_test = pca.transform(xtest_scaled)


# make dataframes
# Column names are sized from each decomposition's own output so the two stay
# correct even if their component counts are changed independently.
svd_cols = ['svd' + str(i) for i in range(svd_train.shape[1])]
pca_cols = ['pca' + str(i) for i in range(pca_train.shape[1])]

svd_train_df = pd.DataFrame(svd_train, columns=svd_cols)
svd_test_df = pd.DataFrame(svd_test, columns=svd_cols)

pca_train_df = pd.DataFrame(pca_train, columns=pca_cols)
pca_test_df = pd.DataFrame(pca_test, columns=pca_cols)

array([[0.05662173, 0.        , 0.03061152, ..., 0.        , 0.        ,
        0.04504292],
       [0.05749436, 0.        , 0.03165892, ..., 0.        , 0.        ,
        0.04294279],
       [0.03677829, 0.        , 0.16277515, ..., 0.01108842, 0.00330992,
        0.07532656],
       ...,
       [0.02187222, 0.00259417, 0.00087198, ..., 0.0083134 , 0.00683534,
        0.00086697],
       [0.03059828, 0.00143   , 0.        , ..., 0.00277686, 0.00414615,
        0.        ],
       [0.03644856, 0.        , 0.        , ..., 0.        , 0.        ,
        0.00073602]])

In [None]:
# Persist the SVD and PCA component features (row index is not written).
svd_train_df.to_csv(path_or_buf='clean_data/svd_train_df.csv', index=False)
svd_test_df.to_csv(path_or_buf='clean_data/svd_test_df.csv', index=False)
pca_train_df.to_csv(path_or_buf='clean_data/pca_train_df.csv', index=False)
pca_test_df.to_csv(path_or_buf='clean_data/pca_test_df.csv', index=False)

### Cluster On Transformed Matrices

In [None]:
def kmeans_label_columns(train_matrix, test_matrix, prefix, ks=range(3, 15), random_state=0):
    """Fit KMeans for each k and collect train/test cluster labels.

    Parameters
    ----------
    train_matrix : array-like used to fit each KMeans model.
    test_matrix : array-like whose rows are assigned to the fitted centroids.
    prefix : str prepended to k to form the column name (e.g. 'SVD_K').
    ks : iterable of int cluster counts to try.
    random_state : seed passed to KMeans so labels are reproducible.

    Returns
    -------
    (train_labels, test_labels) : two dicts mapping column name -> label array.
    """
    train_labels, test_labels = {}, {}
    for k in ks:
        print(k)
        km = KMeans(n_clusters=k, random_state=random_state)
        # `k` is an int, so it must be converted before concatenation.
        col_name = prefix + str(k)
        train_labels[col_name] = km.fit_predict(train_matrix)
        test_labels[col_name] = km.predict(test_matrix)
    return train_labels, test_labels


# KMeans labels computed on the SVD components and on the PCA components.
km_svd_train_data, km_svd_test_data = kmeans_label_columns(svd_train, svd_test, 'SVD_K')
km_pca_train_data, km_pca_test_data = kmeans_label_columns(pca_train, pca_test, 'PCA_K')


# Make DFS
km_svd_train_df = pd.DataFrame(km_svd_train_data)
km_svd_test_df = pd.DataFrame(km_svd_test_data)

km_pca_train_df = pd.DataFrame(km_pca_train_data)
km_pca_test_df = pd.DataFrame(km_pca_test_data)



In [None]:
# Persist the KMeans-on-SVD and KMeans-on-PCA label features (no row index).
km_svd_train_df.to_csv(path_or_buf='clean_data/km_svd_train_df.csv', index=False)
km_svd_test_df.to_csv(path_or_buf='clean_data/km_svd_test_df.csv', index=False)
km_pca_train_df.to_csv(path_or_buf='clean_data/km_pca_train_df.csv', index=False)
km_pca_test_df.to_csv(path_or_buf='clean_data/km_pca_test_df.csv', index=False)

In [None]:
def dbscan_predict(db, new_points):
    """Assign cluster labels to unseen points using a fitted DBSCAN model.

    DBSCAN has no ``predict`` method, so this applies an out-of-sample
    heuristic: each new point takes the label of its nearest core sample when
    that core sample lies within ``db.eps``; otherwise it is labelled noise
    (-1). Distances use NearestNeighbors' default Euclidean metric, which
    matches DBSCAN's default metric as used in this cell.

    Kept local to this cell so the cell runs on its own.
    """
    new_points = np.asarray(new_points)
    labels = np.full(new_points.shape[0], -1, dtype=db.labels_.dtype)
    # No core samples means no clusters were found: everything stays noise.
    if len(db.core_sample_indices_) == 0:
        return labels
    nn = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(db.components_)
    dist, idx = nn.kneighbors(new_points)
    within_eps = dist[:, 0] <= db.eps
    # `components_` rows are ordered like `core_sample_indices_`.
    core_labels = db.labels_[db.core_sample_indices_]
    labels[within_eps] = core_labels[idx[within_eps, 0]]
    return labels


def dbscan_label_columns(train_matrix, test_matrix, prefix,
                         eps_values=(0.1, 0.5, 1, 1.25),
                         min_samples_values=range(5, 50, 10)):
    """Fit DBSCAN over an (eps, min_samples) grid and collect cluster labels.

    Parameters
    ----------
    train_matrix : array-like clustered by each DBSCAN model.
    test_matrix : array-like labelled through ``dbscan_predict``.
    prefix : str prepended to the column name (e.g. 'SVD_EPS').
    eps_values, min_samples_values : grids of DBSCAN hyper-parameters.

    Returns
    -------
    (train_labels, test_labels) : two dicts mapping column name -> label array.
    """
    train_labels, test_labels = {}, {}
    for eps in eps_values:
        for ms in min_samples_values:
            print('EPS:', eps, 'MS:', ms)
            # Keyword arguments: `min_samples` is keyword-only in current scikit-learn.
            db = DBSCAN(eps=eps, min_samples=ms, n_jobs=-1)
            # eps / ms are numbers, so convert before building the column name.
            col_name = prefix + str(eps) + '_MS' + str(ms)
            train_labels[col_name] = db.fit_predict(train_matrix)
            test_labels[col_name] = dbscan_predict(db, test_matrix)
    return train_labels, test_labels


# DBSCAN labels computed on the SVD components and on the PCA components.
db_svd_train_data, db_svd_test_data = dbscan_label_columns(svd_train, svd_test, 'SVD_EPS')
db_pca_train_data, db_pca_test_data = dbscan_label_columns(pca_train, pca_test, 'PCA_EPS')


# Make DFS
db_svd_train_df = pd.DataFrame(db_svd_train_data)
db_svd_test_df = pd.DataFrame(db_svd_test_data)

db_pca_train_df = pd.DataFrame(db_pca_train_data)
db_pca_test_df = pd.DataFrame(db_pca_test_data)


In [None]:
# Persist the DBSCAN-on-SVD and DBSCAN-on-PCA label features (no row index).
db_svd_train_df.to_csv(path_or_buf='clean_data/db_svd_train_df.csv', index=False)
db_svd_test_df.to_csv(path_or_buf='clean_data/db_svd_test_df.csv', index=False)
db_pca_train_df.to_csv(path_or_buf='clean_data/db_pca_train_df.csv', index=False)
db_pca_test_df.to_csv(path_or_buf='clean_data/db_pca_test_df.csv', index=False)

### Naive Bayes

In [189]:
class NaiveBayesClf():
    """
    Blend of three Naive Bayes models, one per feature type.

    Uses Bernoulli NB for binary features (exactly two distinct values),
    Multinomial NB for the remaining integer-typed features,
    Gaussian NB for all other features.

    Final probabilities are the unweighted average of the predicted
    probabilities of whichever of the three models had columns to fit.

    Parameters
    ----------
    bparams, mparams, gparams : dict or None
        Keyword arguments forwarded to BernoulliNB, MultinomialNB and
        GaussianNB respectively. ``None`` means library defaults.
    """

    def __init__(self, bparams=None, mparams=None, gparams=None):
        # `None` defaults avoid sharing one mutable dict between instances.
        self.bnb = BernoulliNB(**(bparams or {}))
        self.mnb = MultinomialNB(**(mparams or {}))
        self.gnb = GaussianNB(**(gparams or {}))
        # Column positions per model; filled in by `get_cols` during `fit`.
        self.bern_locs = []
        self.mult_locs = []
        self.gaus_locs = []

    def get_cols(self, data):
        """Split the columns of ``data`` (a DataFrame) into three groups.

        Stores positional column indices in ``bern_locs``, ``mult_locs`` and
        ``gaus_locs``. Every column lands in exactly one group. Working by
        position keeps this correct even when column labels repeat.
        """
        bern_locs, mult_locs, gaus_locs = [], [], []
        for loc in range(data.shape[1]):
            col = data.iloc[:, loc]
            if col.nunique() == 2:
                bern_locs.append(loc)
            elif pd.api.types.is_integer_dtype(col.dtype):
                mult_locs.append(loc)
            else:
                gaus_locs.append(loc)

        self.bern_locs = bern_locs
        self.mult_locs = mult_locs
        self.gaus_locs = gaus_locs

    def _active_models(self):
        """Return (model, column positions) pairs for groups that have columns."""
        candidates = (
            (self.bnb, self.bern_locs),
            (self.mnb, self.mult_locs),
            (self.gnb, self.gaus_locs),
        )
        return [(model, locs) for model, locs in candidates if locs]

    def fit(self, data, target):
        """Fit each sub-model on its own column group of DataFrame ``data``.

        Returns ``self`` so calls can be chained.
        """
        self.get_cols(data)
        values = data.values
        for model, locs in self._active_models():
            model.fit(values[:, locs], target)
        return self

    def predict(self, new_data):
        """Return averaged class probabilities for ``new_data``.

        Despite the name, this returns a probability matrix of shape
        (n_samples, n_classes), not hard labels. ``new_data`` may be a NumPy
        array or a DataFrame with the same column order used in ``fit``.

        Raises
        ------
        RuntimeError
            If no sub-model has any columns (e.g. ``fit`` was never called).
        """
        values = np.asarray(new_data)
        all_probs = [
            model.predict_proba(values[:, locs])
            for model, locs in self._active_models()
        ]
        if not all_probs:
            raise RuntimeError("NaiveBayesClf has no fitted sub-models; call fit() first.")
        # Stack to (n_models, n_samples, n_classes) and average over models.
        final_probs = np.asarray(all_probs).mean(0)

        return final_probs


In [212]:
# Wrap the scaled matrices back into DataFrames labelled with the feature
# column names, then preview the first training rows.
xtrain_scaled_df, xtest_scaled_df = (
    pd.DataFrame(scaled_matrix, columns=ftr_cols)
    for scaled_matrix in (xtrain_scaled, xtest_scaled)
)
xtrain_scaled_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,0.37,0.14,0.06,0.19,0.21,0.07,0.87,0.86,0.6,0.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.37,0.16,0.04,0.16,0.2,0.06,0.87,0.88,0.61,0.89,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.47,0.39,0.17,0.2,0.3,0.46,0.92,0.9,0.54,0.88,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.46,0.43,0.35,0.18,0.38,0.45,0.94,0.9,0.49,0.89,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.37,0.12,0.04,0.11,0.21,0.06,0.87,0.87,0.6,0.88,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [208]:
# Naive Bayes class probabilities as features.
nbc = NaiveBayesClf()
nbc.fit(xtrain_scaled_df, train[target_col])
# NOTE(review): the training probabilities come from a model fit on these same
# rows, so they are in-sample and likely optimistic compared with the test
# probabilities — consider out-of-fold predictions.
nb_train = nbc.predict(xtrain_scaled_df.values)
nb_test = nbc.predict(xtest_scaled_df.values)

# One column per class probability; sized from the prediction matrix itself
# (the previous code referenced an undefined `nb_probs`).
nb_prob_cols = ['nb_prob_' + str(i) for i in range(nb_train.shape[1])]
nb_train_df = pd.DataFrame(nb_train, columns=nb_prob_cols)
nb_test_df = pd.DataFrame(nb_test, columns=nb_prob_cols)

In [196]:
# In-sample accuracy of the blended Naive Bayes model. Probability columns are
# mapped back to labels through the sorted unique target values (the order
# scikit-learn uses for `classes_`) rather than assuming labels are 1..K.
nb_classes = np.unique(ytrain.values)
metrics.accuracy_score(ytrain.values, nb_classes[nb_train.argmax(1)])

0.6579365079365079

### Combine All New Features

In [None]:
# Feature blocks to concatenate column-wise. The train and test lists must
# stay in the same order so both frames end up with identical columns.
all_train_ftrs = [
    xtrain_scaled_df,
    km_train_df,
    db_train_df,
    svd_train_df,
    pca_train_df,
    km_svd_train_df,
    km_pca_train_df,
    db_svd_train_df,
    db_pca_train_df,
    nb_train_df,
]

all_test_ftrs = [
    xtest_scaled_df,  # the comma here was missing, which made the cell a SyntaxError
    km_test_df,
    db_test_df,
    svd_test_df,
    pca_test_df,
    km_svd_test_df,
    km_pca_test_df,
    db_svd_test_df,
    db_pca_test_df,
    nb_test_df,
]

all_train_df = pd.concat(all_train_ftrs, axis=1)
all_test_df = pd.concat(all_test_ftrs, axis=1)

# Sanity checks: concatenation must not add rows (which would signal index
# misalignment) and both splits must expose the same feature columns.
assert len(all_train_df) == len(xtrain_scaled_df), "train rows changed during concat"
assert len(all_test_df) == len(xtest_scaled_df), "test rows changed during concat"
assert list(all_train_df.columns) == list(all_test_df.columns), "train/test columns differ"

all_train_df.shape, all_test_df.shape

In [None]:
# Persist the combined feature frames to the working directory (no row index).
all_train_df.to_csv(path_or_buf='all_train_df.csv', index=False)
all_test_df.to_csv(path_or_buf='all_test_df.csv', index=False)