In [1]:
import pandas as pd
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import keras
from sklearn.dummy import DummyClassifier
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from numpy import array
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from pandas.plotting import lag_plot
from sklearn.feature_selection import SelectKBest, chi2

Using TensorFlow backend.


In [2]:
#Function for loading data!

def load_data():
    
    df = pickle.load(open('df_down_sampled.p','rb'))
    df = df.drop(['resp_1', 'resp_2','resp_3','resp_4'], axis = 1)
    df = df.sort_values(by = 'ts_id')
    df['y'] = 0
    mask = df.resp > 0
    df.loc[mask,'y'] = 1    
    
    return df

In [3]:
#Function for performing pca-transformation!

def perform_pca_transformation(X_train, X_test, number_of_components):
    
    col_list = []

    for x in range(number_of_components):
        
        col_list.append('pca_feature_' + str(x))
    
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    pca = PCA(n_components = number_of_components)
    pca.fit(X_train_scaled)
    
    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    
    X_train_pca = pd.DataFrame(data = X_train_pca, columns = col_list)
    X_test_pca = pd.DataFrame(data = X_test_pca, columns = col_list)
    
    return (X_train_pca, X_test_pca)

In [4]:
#Function for splitting data into train/test set!

def train_test_split(test_share, data):
    
    #Split data into initial train/test
    
    train_share = 1 - test_share    
    train_size = int(len(data) * train_share)
    train_set = data[0:train_size]
    test_set = data[train_size:len(data)]    
    
    
    return (train_set, test_set)

In [5]:
def reduce_memory_usage(df):
    
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        
        col_type = df[col].dtype
        
        if col_type != 'object':
           
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:                    
                    df[col] = df[col].astype(np.int8)
                
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            
            else:
                
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [6]:
#Load data!

df = load_data()


In [12]:
df.date.value_co

Unnamed: 0,date,weight,resp,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id,y
532070,87,0.271879,-0.039667,-1,1.137953,-1.099149,-1.205795,-0.401901,2.094101,0.697165,...,1.193421,-2.735373,0.940361,-4.081068,1.759097,-2.717352,1.576622,-2.414959,532070,0
532071,87,7.002739,-0.005065,-1,-0.653419,1.294904,0.405524,1.048848,0.005951,0.066053,...,-1.144605,1.898965,-1.084516,1.987009,-1.718251,1.591238,-1.292307,1.942232,532071,0
532072,87,0.171424,0.007406,1,-3.172026,-3.093182,-1.356817,-1.252079,4.313544,4.437772,...,1.476225,2.239128,1.025673,2.485322,1.700121,2.175138,2.038591,2.655379,532072,1
532073,87,4.019514,-0.005318,-1,-1.659606,-0.386442,0.426953,1.098562,0.019159,0.099820,...,-0.669236,3.561805,-1.076729,2.028143,-1.661257,1.724619,-1.105069,2.446877,532073,0
532074,87,0.598352,-0.016445,-1,-0.787562,-0.277811,-2.767844,-3.852492,3.324745,4.690789,...,0.173087,1.673256,0.360858,3.211130,0.308219,2.072384,0.192604,1.636265,532074,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390486,499,0.000000,0.015396,1,-1.649365,-1.169996,-0.889129,-1.256179,-0.265419,-0.383478,...,-1.260055,1.947725,-1.994399,-1.685163,-2.866165,-0.216130,-1.892048,0.901585,2390486,1
2390487,499,0.000000,-0.004718,1,2.432943,5.284504,-0.337469,-0.494263,-0.442409,-0.739016,...,1.064936,3.119762,-0.419796,-0.208975,-0.146749,0.730166,0.648452,2.068737,2390487,0
2390488,499,0.000000,0.016591,1,-0.622475,-0.963682,0.532835,0.392287,0.977046,0.819693,...,-0.640334,-2.279663,-0.950259,-4.388417,-1.669922,-3.288939,-1.336142,-2.814239,2390488,1
2390489,499,0.283405,-0.002004,-1,-1.463757,-1.107228,-2.286985,-3.156451,-1.690676,-2.348199,...,-1.780962,0.881246,-2.202140,-1.912601,-3.341684,-0.571188,-2.185795,0.627452,2390489,0


In [7]:
#Split into train/test!

train_set, test_set = train_test_split(test_share = 0.9, data = df)

In [8]:
train_set = reduce_memory_usage(train_set)

Memory usage of dataframe is 192.8292236328125 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Memory usage of dataframe after reduction 49.27070236206055 MB
Reduced by 74.44852941176471 % 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
train_set.shape

(185842, 135)

In [9]:
X_train = train_set.drop(['date','weight','y','ts_id', 'resp'], axis = 1)
X_test = test_set.drop(['date','weight','y','ts_id', 'resp'], axis = 1)

y_train = train_set.y
y_test = test_set.y

In [185]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [186]:
y_pred_train = model.predict_proba(X_train)[:,1]

In [187]:
train_roc = roc_auc_score(y_train, y_pred_train)

In [188]:
y_pred_test = model.predict_proba(X_test)[:,1]

In [189]:
test_roc = roc_auc_score(y_test, y_pred_test)

In [191]:
print("Train roc: {}, test roc: {}".format(train_roc, test_roc))

Train roc: 0.6514900351256979, test roc: 0.5187198773140467


In [132]:
(X_train_pca, X_test_pca) = perform_pca_transformation(X_train, X_test, 5)

  return self.partial_fit(X, y)
  del sys.path[0]
  


In [152]:
model = XGBClassifier()
model.fit(X_train_pca, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [153]:
y_pred_train = model.predict_proba(X_train_pca)[:,1]

In [154]:
y_pred_test = model.predict_proba(X_test_pca)[:,1]

In [155]:
train_roc = roc_auc_score(y_train, y_pred_train)

In [156]:
test_roc = roc_auc_score(y_test, y_pred_test)

In [158]:
print("The train roc is: {}, the test roc is: {}".format(train_roc, test_roc))

The train roc is: 0.6520538782115943, the test roc is: 0.5088760029429306


In [17]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [18]:
y_pred_train = model.predict_proba(X_train)[:,1]

In [19]:
train_roc = roc_auc_score(y_test, y_pred_test)

In [20]:
y_pred_test = model.predict_proba(X_test)[:,1]

In [21]:
test_roc = roc_auc_score(y_test, y_pred_test)

In [22]:
print("The train roc is: {}, the test roc is: {}".format(train_roc, test_roc))

The train roc is: 0.513151927437448, the test roc is: 0.513748571503123
