In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import( accuracy_score,
                           confusion_matrix,
                           classification_report,
                          roc_auc_score,
                          roc_curve,
                          auc,
                          plot_confusion_matrix,
                          plot_roc_curve)
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


from tensorflow.keras.metrics import AUC
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

pd.set_option ('display.float','{:0.2f}'.format)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',50)

In [19]:
train=pd.read_csv('train.csv')
train.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME,TYPE_OF_VEHICLE
0,816393,40-64,female,20-29y,university,middle class,0.64,0,after 2015,0,0,37379,11000,0,0,0,0,Sports Car
1,251762,26-39,male,20-29y,high school,middle class,0.48,1,before 2015,1,0,10238,9000,0,0,0,1,HatchBack
2,481952,40-64,male,20-29y,none,middle class,0.84,1,before 2015,1,1,10238,12000,0,0,0,1,Sedan
3,3506,40-64,male,20-29y,high school,upper class,0.68,1,before 2015,0,1,92099,6000,1,0,0,1,Sedan
4,498013,40-64,female,20-29y,none,working class,0.57,1,after 2015,1,1,32122,15000,0,0,1,0,Sedan


In [20]:
test=pd.read_csv('test.csv')
test.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,TYPE_OF_VEHICLE
0,303713,16-25,male,10-19y,high school,middle class,0.42,1,before 2015,1,0,10238,13000,0,0,0,Sports Car
1,141107,16-25,male,20-29y,high school,working class,0.5,0,before 2015,1,1,10238,14000,0,0,0,SUV
2,447316,26-39,female,20-29y,high school,upper class,0.25,1,before 2015,1,0,10238,10000,0,0,0,HatchBack
3,196066,16-25,female,20-29y,university,middle class,0.46,1,before 2015,1,1,10238,7000,0,1,0,SUV
4,179947,16-25,male,20-29y,high school,working class,0.45,1,after 2015,0,1,10238,13000,1,0,0,SUV


## Data Pre Processing

In [21]:
train['US_STATE'] = train['POSTAL_CODE'].apply(
    lambda x: 
    'AL' if x>=35004 and x<=36925 else
    'AK' if x>=99501 and x<=99950 else
    'AZ' if x>=85001 and x<=86556 else
    'AR' if x>=71601 and x<=72959 else
    'CA' if x>=90001 and x<=96162 else
    'CO' if x>=80001 and x<=81658 else
    'CT' if x>=6001  and x<=6928  else
    'DE' if x>=19701 and x<=19980 else
    'FL' if x>=32003 and x<=34997 else
    'GA' if x>=30002 and x<=39901 else
    'HI' if x>=96701 and x<=96898 else
    'ID' if x>=83201 and x<=83877 else
    'IL' if x>=60001 and x<=62999 else
    'IN' if x>=46001 and x<=47997 else
    'IA' if x>=50001 and x<=52809 else
    'KS' if x>=66002 and x<=67954 else
    'KY' if x>=40003 and x<=42788 else
    'LA' if x>=70001 and x<=71497 else
    'ME' if x>=3901  and x<=4992  else
    'MD' if x>=20588 and x<=21930 else
    'MA' if x>=1001  and x<=5544  else
    'MI' if x>=48001 and x<=49971 else
    'MN' if x>=55001 and x<=56763 else
    'MS' if x>=38601 and x<=39776 else
    'MO' if x>=63001 and x<=65899 else
    'MT' if x>=59001 and x<=59937 else
    'NE' if x>=68001 and x<=69367 else
    'NV' if x>=88901 and x<=89883 else
    'NH' if x>=3031  and x<=3897  else
    'NJ' if x>=7001  and x<=8989  else
    'NM' if x>=87001 and x<=88439 else
    'NY' if x>=501   and x<=14925 else
    'NC' if x>=27006 and x<=28909 else
    'ND' if x>=58001 and x<=58856 else
    'OH' if x>=43001 and x<=45999 else
    'OK' if x>=73001 and x<=74966 else
    'OR' if x>=97001 and x<=97920 else
    'PA' if x>=15001 and x<=19640 else
    'RI' if x>=2801  and x<=2940  else
    'SC' if x>=29001 and x<=29945 else
    'SD' if x>=57001 and x<=57799 else
    'TN' if x>=37010 and x<=38589 else
    'TX' if x>=73301 and x<=88595 else
    'UT' if x>=84001 and x<=84791 else
    'VT' if x>=5001  and x<=5907  else
    'VA' if x>=20101 and x<=24658 else
    'WA' if x>=98001 and x<=99403 else
    'WV' if x>=24701 and x<=26886 else
    'WI' if x>=53001 and x<=54990 else
    'WY' if x>=82001 and x<=83414 else
    'Not Available'
)

In [22]:
test['US_STATE'] = test['POSTAL_CODE'].apply(
    lambda x: 
    'AL' if x>=35004 and x<=36925 else
    'AK' if x>=99501 and x<=99950 else
    'AZ' if x>=85001 and x<=86556 else
    'AR' if x>=71601 and x<=72959 else
    'CA' if x>=90001 and x<=96162 else
    'CO' if x>=80001 and x<=81658 else
    'CT' if x>=6001  and x<=6928  else
    'DE' if x>=19701 and x<=19980 else
    'FL' if x>=32003 and x<=34997 else
    'GA' if x>=30002 and x<=39901 else
    'HI' if x>=96701 and x<=96898 else
    'ID' if x>=83201 and x<=83877 else
    'IL' if x>=60001 and x<=62999 else
    'IN' if x>=46001 and x<=47997 else
    'IA' if x>=50001 and x<=52809 else
    'KS' if x>=66002 and x<=67954 else
    'KY' if x>=40003 and x<=42788 else
    'LA' if x>=70001 and x<=71497 else
    'ME' if x>=3901  and x<=4992  else
    'MD' if x>=20588 and x<=21930 else
    'MA' if x>=1001  and x<=5544  else
    'MI' if x>=48001 and x<=49971 else
    'MN' if x>=55001 and x<=56763 else
    'MS' if x>=38601 and x<=39776 else
    'MO' if x>=63001 and x<=65899 else
    'MT' if x>=59001 and x<=59937 else
    'NE' if x>=68001 and x<=69367 else
    'NV' if x>=88901 and x<=89883 else
    'NH' if x>=3031  and x<=3897  else
    'NJ' if x>=7001  and x<=8989  else
    'NM' if x>=87001 and x<=88439 else
    'NY' if x>=501   and x<=14925 else
    'NC' if x>=27006 and x<=28909 else
    'ND' if x>=58001 and x<=58856 else
    'OH' if x>=43001 and x<=45999 else
    'OK' if x>=73001 and x<=74966 else
    'OR' if x>=97001 and x<=97920 else
    'PA' if x>=15001 and x<=19640 else
    'RI' if x>=2801  and x<=2940  else
    'SC' if x>=29001 and x<=29945 else
    'SD' if x>=57001 and x<=57799 else
    'TN' if x>=37010 and x<=38589 else
    'TX' if x>=73301 and x<=88595 else
    'UT' if x>=84001 and x<=84791 else
    'VT' if x>=5001  and x<=5907  else
    'VA' if x>=20101 and x<=24658 else
    'WA' if x>=98001 and x<=99403 else
    'WV' if x>=24701 and x<=26886 else
    'WI' if x>=53001 and x<=54990 else
    'WY' if x>=82001 and x<=83414 else
    'Not Available'
)

In [23]:
def preprocess_data(df):
    df['AGE'] = df['AGE'].map({
        '16-25': 0, 
        '26-39': 1, 
        '40-64': 2, 
        '65+': 3
    })
    
    df['GENDER'] = df['GENDER'].map({
        'male': 0, 
        'female': 1
    })
    
    df['DRIVING_EXPERIENCE'] = df['DRIVING_EXPERIENCE'].map({
        '0-9y': 0, 
        '10-19y': 1, 
        '20-29y': 2,
        '30y+': 3
    })
    
    df['EDUCATION'] = df['EDUCATION'].map({
        'none': 0, 
        'high school': 1, 
        'university': 2
    })
    
    df['INCOME'] = df['INCOME'].map({
        'poverty': 0, 
        'working class': 1, 
        'middle class': 2, 
        'upper class': 3
    })
    
    df['VEHICLE_YEAR'] = df['VEHICLE_YEAR'].map({
        'before 2015': 0, 
        'after 2015': 1
    })
    
    df['POSTAL_CODE0'] = df['POSTAL_CODE'].apply(lambda x: int(str(x)[0]))
    df['POSTAL_CODE1'] = df['POSTAL_CODE'].apply(lambda x: int(str(x)[1]))
    df['POSTAL_CODE2'] = df['POSTAL_CODE'].apply(lambda x: int(str(x)[2]))
    df['POSTAL_CODE3'] = df['POSTAL_CODE'].apply(lambda x: int(str(x)[3]))
    df['POSTAL_CODE4'] = df['POSTAL_CODE'].apply(lambda x: int(str(x)[4]))
    
    df['TYPE_OF_VEHICLE'] = df['TYPE_OF_VEHICLE'].map({
        'SUV': 0, 
        'Sedan': 1, 
        'HatchBack': 2, 
        'Sports Car': 3
    })
    
    df['US_STATE'] = df['US_STATE'].map({
        'AL': 1,
        'AK': 2,
        'AZ': 3,
        'AR': 4,
        'CA': 5,
        'CO': 6,
        'CT': 7,
        'DE': 8,
        'FL': 9,
        'GA': 10,
        'HI': 11,
        'ID': 12,
        'IL': 13,
        'IN': 14,
        'IA': 15,
        'KS': 16,
        'KY': 17,
        'LA': 18,
        'ME': 19,
        'MD': 20,
        'MA': 21,
        'MI': 22,
        'MN': 23,
        'MS': 24,
        'MO': 25,
        'MT': 26,
        'NE': 27,
        'NV': 28,
        'NH': 29,
        'NJ': 30,
        'NM': 31,
        'NY': 32,
        'NC': 33,
        'ND': 34,
        'OH': 35,
        'OK': 36,
        'OR': 37,
        'PA': 38,
        'RI': 39,
        'SC': 40,
        'SD': 41,
        'TN': 42,
        'TX': 43,
        'UT': 44,
        'VT': 45,
        'VA': 46,
        'WA': 47,
        'WV': 48,
        'WI': 49,
        'WY': 50,
        'Not Available': 0
    })
    
    df.drop(['POSTAL_CODE'], 
            axis=1, inplace=True)
    
    return df

In [24]:
train = preprocess_data(train)
test = preprocess_data(test)

# Feature engineering

In [25]:
def fet_engg(df):
    df['AGE+GENDER'] = df.apply(lambda x: int(x['AGE']+x['GENDER']), axis=1)
    df['AGE+DRIVING_EXPERIENCE'] = df.apply(lambda x: int(x['AGE']+x['DRIVING_EXPERIENCE']), axis=1)
    df['AGE+EDUCATION'] = df.apply(lambda x: int(x['AGE']+x['EDUCATION']), axis=1)
    df['AGE+INCOME'] = df.apply(lambda x: int(x['AGE']+x['INCOME']), axis=1)
    df['GENDER+DRIVING_EXPERIENCE'] = df.apply(lambda x: int(x['GENDER']+x['DRIVING_EXPERIENCE']), axis=1)
    df['AGE+GENDER+EDUCATION'] = df.apply(lambda x: int(x['AGE']+x['GENDER']+x['EDUCATION']), axis=1)
    df['AGE+GENDER+SPEEDING_VIOLATIONS'] = df.apply(lambda x: int(x['AGE']+x['GENDER']+x['SPEEDING_VIOLATIONS']), axis=1)
    df['AGE+GENDER+DUIS'] = df.apply(lambda x: int(x['AGE']+x['GENDER']+x['DUIS']), axis=1)
    df['AGE+GENDER+PAST_ACCIDENTS'] = df.apply(lambda x: int(x['AGE']+x['GENDER']+x['PAST_ACCIDENTS']), axis=1)
    df['VEHICLE_YEAR+TYPE_OF_VEHICLE'] = df.apply(lambda x: int(x['VEHICLE_YEAR']+x['TYPE_OF_VEHICLE']), axis=1)
    df['GENDER+MARRIED+CHILDREN'] = df.apply(lambda x: int(x['GENDER']+x['MARRIED']+x['CHILDREN']), axis=1)
    df['SPEEDING_VIOLATIONS+DUIS+PAST_ACCIDENTS'] = df.apply(lambda x: int(x['SPEEDING_VIOLATIONS']+x['DUIS']+x['PAST_ACCIDENTS']), axis=1)
    df['VEHICLE_OWNERSHIP+PAST_ACCIDENTS'] = df.apply(lambda x: int(x['VEHICLE_OWNERSHIP']+x['PAST_ACCIDENTS']), axis=1)
    df['CREDIT_SCORE_x_ANNUAL_MILEAGE'] = df['CREDIT_SCORE'] * df['ANNUAL_MILEAGE']
    df['CREDIT_SCORE_x_ID'] = df['CREDIT_SCORE'] * df['ID']
    df['ANNUAL_MILEAGE_x_ID'] = df['ANNUAL_MILEAGE'] * df['ID']
    return df

In [26]:
train = fet_engg(train)
test = fet_engg(test)

In [27]:
train.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME,TYPE_OF_VEHICLE,US_STATE,POSTAL_CODE0,POSTAL_CODE1,POSTAL_CODE2,POSTAL_CODE3,POSTAL_CODE4,AGE+GENDER,AGE+DRIVING_EXPERIENCE,AGE+EDUCATION,AGE+INCOME,GENDER+DRIVING_EXPERIENCE,AGE+GENDER+EDUCATION,AGE+GENDER+SPEEDING_VIOLATIONS,AGE+GENDER+DUIS,AGE+GENDER+PAST_ACCIDENTS,VEHICLE_YEAR+TYPE_OF_VEHICLE,GENDER+MARRIED+CHILDREN,SPEEDING_VIOLATIONS+DUIS+PAST_ACCIDENTS,VEHICLE_OWNERSHIP+PAST_ACCIDENTS,CREDIT_SCORE_x_ANNUAL_MILEAGE,CREDIT_SCORE_x_ID,ANNUAL_MILEAGE_x_ID
0,816393,2,1,2,2,2,0.64,0,1,0,0,11000,0,0,0,0,3,10,3,7,3,7,9,3,4,4,4,3,5,3,3,3,4,1,0,0,7018.55,520899.28,8980323000
1,251762,1,0,2,1,2,0.48,1,0,1,0,9000,0,0,0,1,2,32,1,0,2,3,8,1,3,2,3,2,2,1,1,1,2,1,0,1,4281.66,119773.38,2265858000
2,481952,2,0,2,0,2,0.84,1,0,1,1,12000,0,0,0,1,1,32,1,0,2,3,8,2,4,2,4,2,2,2,2,2,1,2,0,1,10077.8,404751.33,5783424000
3,3506,2,0,2,1,3,0.68,1,0,0,1,6000,1,0,0,1,1,5,9,2,0,9,9,2,4,3,5,2,3,3,2,2,1,1,1,1,4095.16,2392.94,21036000
4,498013,2,1,2,0,1,0.57,1,1,1,1,15000,0,0,1,0,1,9,3,2,1,2,2,3,4,2,3,3,3,3,3,4,2,3,1,2,8582.76,284955.17,7470195000


In [28]:
test.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,TYPE_OF_VEHICLE,US_STATE,POSTAL_CODE0,POSTAL_CODE1,POSTAL_CODE2,POSTAL_CODE3,POSTAL_CODE4,AGE+GENDER,AGE+DRIVING_EXPERIENCE,AGE+EDUCATION,AGE+INCOME,GENDER+DRIVING_EXPERIENCE,AGE+GENDER+EDUCATION,AGE+GENDER+SPEEDING_VIOLATIONS,AGE+GENDER+DUIS,AGE+GENDER+PAST_ACCIDENTS,VEHICLE_YEAR+TYPE_OF_VEHICLE,GENDER+MARRIED+CHILDREN,SPEEDING_VIOLATIONS+DUIS+PAST_ACCIDENTS,VEHICLE_OWNERSHIP+PAST_ACCIDENTS,CREDIT_SCORE_x_ANNUAL_MILEAGE,CREDIT_SCORE_x_ID,ANNUAL_MILEAGE_x_ID
0,303713,0,0,1,1,2,0.42,1,0,1,0,13000,0,0,0,3,32,1,0,2,3,8,0,1,1,2,1,1,0,0,0,3,1,0,1,5524.46,129065.4,3948269000
1,141107,0,0,2,1,1,0.5,0,0,1,1,14000,0,0,0,0,32,1,0,2,3,8,0,2,1,1,2,1,0,0,0,0,2,0,0,7048.69,71044.22,1975498000
2,447316,1,1,2,1,3,0.25,1,0,1,0,10000,0,0,0,2,32,1,0,2,3,8,2,3,2,4,3,3,2,2,2,2,2,0,1,2522.46,112833.5,4473160000
3,196066,0,1,2,2,2,0.46,1,0,1,1,7000,0,1,0,0,32,1,0,2,3,8,1,2,2,2,3,3,1,2,1,0,3,1,1,3253.24,91121.39,1372462000
4,179947,0,0,2,1,1,0.45,1,1,0,1,13000,1,0,0,0,32,1,0,2,3,8,0,2,1,1,2,1,1,0,0,1,1,1,1,5888.59,81510.3,2339311000


In [29]:
X_train=train.drop('OUTCOME',axis=1)
X_test=test
y_train=train['OUTCOME']

# Scaling data

In [30]:
scaler= MinMaxScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)

# Model Building

In [31]:
def print_score(true, pred, train=True):
    if train:
        clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

In [32]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
y_train_pred = xgb_clf.predict(X_train)

print_score(y_train, y_train_pred, train=True)

Train Result:
Accuracy Score: 64.28%
_______________________________________________
CLASSIFICATION REPORT:
                 0        1  accuracy  macro avg  weighted avg
precision     0.62     0.84      0.64       0.73          0.71
recall        0.97     0.19      0.64       0.58          0.64
f1-score      0.76     0.31      0.64       0.54          0.57
support   50367.00 36918.00      0.64   87285.00      87285.00
_______________________________________________
Confusion Matrix: 
 [[48976  1391]
 [29789  7129]]



In [33]:
rf_clf = RandomForestClassifier(n_estimators=100)

rf_clf.fit(X_train, y_train)

y_train_pred = rf_clf.predict(X_train)

print_score(y_train, y_train_pred, train=True)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0        1  accuracy  macro avg  weighted avg
precision     1.00     1.00      1.00       1.00          1.00
recall        1.00     1.00      1.00       1.00          1.00
f1-score      1.00     1.00      1.00       1.00          1.00
support   50367.00 36918.00      1.00   87285.00      87285.00
_______________________________________________
Confusion Matrix: 
 [[50367     0]
 [    0 36918]]



In [34]:
y_test_pred = rf_clf.predict(X_test)

In [44]:
# make dataframe
df_result=pd.DataFrame(data=y_test_pred,
                      index=test.index,
                      columns=['OUTCOME'])

df_result=pd.concat([test.ID,df_result],axis=1)
df_result.head()

df_result.to_csv('result1.csv', index=False)

In [45]:
y_test_pred = xgb_clf.predict(X_test)

In [46]:
# make dataframe
df_result=pd.DataFrame(data=y_test_pred,
                      index=test.index,
                      columns=['OUTCOME'])

df_result=pd.concat([test.ID,df_result],axis=1)
df_result.head()

df_result.to_csv('result2.csv', index=False)

In [47]:
import gc
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import callbacks
from tensorflow.keras import optimizers

np.random.seed(42)
tf.random.set_seed(42)
plt.style.use('fivethirtyeight')

In [48]:
def neural_nets(cat_cols, num_cols):
    inputs = []
    outputs = []
    
    for c in cat_cols:
        num_unique_values = int(train[c].max()) + 1
        embed_dim = int(min(np.ceil((num_unique_values)/2), 20))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c.replace('+','_'))(inp)
        out = layers.SpatialDropout1D(0.2)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    
    x_input = layers.Input(shape=(len(num_cols),))
    x = layers.Dense(units=16, activation='selu')(x_input)
    outputs.append(x)
    
    x1 = layers.Concatenate()(outputs)
    x1 = layers.BatchNormalization()(x1)
    
    x2 = layers.Dense(units=384, activation='selu')(x1)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Dropout(rate=0.4)(x2)
    
    x2 = layers.Dense(units=192, activation='selu')(x2)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Dropout(rate=0.3)(x2)
    
    x3 = layers.Dense(units=192, activation='selu')(x1)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Dropout(rate=0.4)(x3)
    
    x3 = layers.Dense(units=96, activation='selu')(x3)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Dropout(rate=0.3)(x3)
    
    x = layers.Concatenate()([x2, x3])
    
    x = layers.Dense(units=64, activation='selu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(rate=0.2)(x)
    
    x_output = layers.Dense(units=1, activation='sigmoid')(x)

    model = models.Model(inputs=[inputs, x_input], 
                         outputs=x_output, 
                         name='Analytics_Olympiad_Model')
    return model

In [None]:
features = test.columns.tolist()
len(features)

In [49]:
cat_cols = ['AGE','GENDER','DRIVING_EXPERIENCE','EDUCATION','INCOME',
            'VEHICLE_OWNERSHIP','VEHICLE_YEAR','MARRIED','CHILDREN',
            'DUIS','TYPE_OF_VEHICLE','POSTAL_CODE0','POSTAL_CODE1',
            'POSTAL_CODE2','POSTAL_CODE3','POSTAL_CODE4','US_STATE',
            'AGE+GENDER','AGE+DRIVING_EXPERIENCE','AGE+EDUCATION',
            'AGE+INCOME','GENDER+DRIVING_EXPERIENCE','AGE+GENDER+EDUCATION',
            'VEHICLE_YEAR+TYPE_OF_VEHICLE','GENDER+MARRIED+CHILDREN',
            'AGE+GENDER+SPEEDING_VIOLATIONS','AGE+GENDER+PAST_ACCIDENTS',
            'AGE+GENDER+DUIS','VEHICLE_OWNERSHIP+PAST_ACCIDENTS']

num_cols = [col for col in features if col not in cat_cols]
len(cat_cols), len(num_cols)

NameError: name 'features' is not defined