In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
pd.set_option('display.max_column', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_colwidth', 500)
pd.set_option('expand_frame_repr', True)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Reading the data

df_train = pd.read_csv("/Users/siddarthjha/Downloads/ml_data/training.csv", low_memory = False)
df_test = pd.read_csv("/Users/siddarthjha/Downloads/ml_data/public_test_features.csv", low_memory = False)
df_meta_data = pd.read_excel("/Users/siddarthjha/Downloads/ml_data/metadata-dataset.xlsx")

In [3]:
# The shape of Train and Test
df_train.shape, df_test.shape

((36803, 228), (15774, 227))

In [4]:
y_train = df_train['label']

In [5]:
# Dropping ID column
df_train.drop(['ID', 'label'], axis = 1, inplace = True)
df_test.drop(['ID'], axis = 1, inplace = True)
# calculating category & integer type columns 
cat_vars=list(df_train.select_dtypes(include=['object']).columns)
num_vars=list(df_train.select_dtypes(exclude=['object']).columns)
len(cat_vars), len(num_vars)

(126, 100)

In [6]:
# for i in df_train.columns:
#     if i in cat_vars:
#         print("Total length of cat - {0} is {1}".format(i, len(df_train[i].value_counts())))

# From this selecting the categorical features which has only 20 categories

drop_cat_vars = []
for i in cat_vars:
    if len(df_train[i].value_counts()) > 20:
        drop_cat_vars.append(i)
selected_cat_vars = list(set(cat_vars) - set(drop_cat_vars))
len(selected_cat_vars)

80

In [7]:
# Handling the null values with 0
df_train.fillna(0, inplace = True)
df_test.fillna(0, inplace = True)

In [8]:
train = pd.get_dummies(data = df_train, columns = selected_cat_vars)
test = pd.get_dummies(data = df_test, columns = selected_cat_vars)

In [9]:
train.shape, test.shape

((36803, 435), (15774, 417))

In [10]:
x_train, x_test = train.align(test, join='left', axis=1)
x_train.shape, x_test.shape

((36803, 435), (15774, 435))

In [11]:
cat_vars_new=list(x_train.select_dtypes(include=['object']).columns)
# Dropping off the unnecassary categorical features
x_train.drop(cat_vars_new, inplace = True, axis = 1)
x_test.drop(cat_vars_new, inplace = True, axis = 1)
x_train.shape, x_test.shape

((36803, 389), (15774, 389))

In [12]:
model = RandomForestClassifier(n_estimators = 500,
                            criterion = 'entropy',
                            class_weight = 'balanced',
                            min_samples_split = 1000,
                            random_state = 1,
                            n_jobs = -1) 

In [None]:
feature_cols = ['key_Region Id',
 'key_MarketPlace Id',
 'key_country_of_origin',
 'key_discontinued_date',
 'key_fedas_id',
 'key_isbn',
 'key_manufacturer_sku',
 'key_monthly_recurring_charge',
 'key_number_of_licenses',
 'key_number_of_pages',
 'key_number_of_points',
 'key_preferred_vendor',
 'key_publisher',
 'key_recall_external_identifier',
 'key_recall_notice_expiration_date',
 'key_recall_notice_publication_date',
 'key_recall_notice_receive_date',
 'cand_Region Id',
 'cand_MarketPlace Id',
 'cand_country_of_origin',
 'cand_discontinued_date',
 'cand_esrb_descriptors',
 'cand_fedas_id',
 'cand_manufacturer_sku',
 'cand_monthly_recurring_charge',
 'cand_number_of_pages',
 'cand_number_of_points',
 'cand_preferred_vendor',
 'cand_publisher',
 'cand_recall_external_identifier',
 'cand_recall_notice_expiration_date',
 'cand_recall_notice_publication_date',
 'cand_recall_notice_receive_date',
 'cand_currency_code_USD',
 'cand_item_display_volume_uom_cubic_centimeters',
 'cand_item_display_volume_uom_cubic_meters',
 'cand_item_display_volume_uom_cups',
 'cand_item_display_volume_uom_gallons',
 'cand_item_display_volume_uom_grams',
 'cand_item_display_volume_uom_ounces',
 'cand_item_display_volume_uom_pints',
 'cand_item_display_volume_uom_quarts',
 'key_currency_code_USD',
 'cand_video_game_region_description_0',
 'cand_video_game_region_description_ntsc_uc',
 'cand_video_game_region_description_region_free',
 'cand_is_discontinued_N',
 'cand_is_manufacture_on_demand_N',
 'key_language_code_en_US',
 'key_inner_package_type_0',
 'key_inner_package_type_41_deluxe_bangle_hoops',
 'cand_wireless_provider_code_cingular',
 'cand_wireless_provider_code_sprintpcs',
 'cand_wireless_provider_code_t_mobile',
 'cand_wireless_provider_code_virgin_mobile',
 'key_esrb_age_rating_everyone_10_plus',
 'key_mfg_series_number_GENESIS 330',
 'key_mfg_series_number_PIXMA',
 'key_mfg_series_number_SoundSport',
 'cand_classification_description_0',
 'cand_classification_description_Collection Parent',
 'cand_customer_return_method_free_return',
 'cand_customer_return_method_mail',
 'cand_customer_return_method_unknown',
 'cand_classification_code_0',
 'cand_classification_code_collection_parent',
 'cand_program_member_Advantage',
 'cand_program_member_Caterpillar',
 'cand_program_member_Create With Amazon',
 'cand_program_member_Invite Model',
 'cand_is_super_saver_shipping_excl_Y',
 'cand_is_advantage_N',
 'cand_is_advantage_Y',
 'key_is_advantage_N',
 'cand_item_display_weight_uom_milligrams',
 'cand_release_date_embargo_level_low',
 'cand_release_date_embargo_level_none',
 'cand_release_date_embargo_level_strict',
 'cand_inner_package_type_38_deluxe_med',
 'cand_inner_package_type_39_deluxe_bracelet',
 'cand_inner_package_type_41_deluxe_bangle_hoops',
 'cand_inner_package_type_42_classic_sml',
 'cand_inner_package_type_43_classic_med',
 'cand_inner_package_type_44_oversized_med',
 'key_has_online_play_N',
 'cand_item_display_length_uom_feet',
 'key_is_discontinued_N',
 'key_is_super_saver_shipping_excl_Y',
 'cand_language_code_en_US',
 'key_is_manufacture_on_demand_N',
 'cand_wireless_provider_Sprint',
 'cand_wireless_provider_T-Mobile',
 'cand_wireless_provider_Virgin Mobile',
 'cand_is_phone_upgradeable_N',
 'cand_has_online_play_N',
 'cand_has_online_play_Y',
 'cand_program_member_code_advantage',
 'cand_program_member_code_caterpillar',
 'cand_program_member_code_create_with_amazon',
 'cand_program_member_code_invite_model',
 'cand_esrb_age_rating_0',
 'cand_esrb_age_rating_everyone_10_plus',
 'cand_esrb_age_rating_kids_to_adults',
 'cand_esrb_age_rating_mature',
 'cand_esrb_age_rating_rating_pending',
 'cand_esrb_age_rating_teen',
 'cand_fma_override_0',
 'cand_fma_override_bypass_buybox_overpriced_filter',
 'cand_cpsia_cautionary_statement_choking_hazard_contains_a_marble',
 'cand_cpsia_cautionary_statement_choking_hazard_contains_small_ball',
 'cand_cpsia_cautionary_statement_choking_hazard_is_a_marble',
 'cand_cpsia_cautionary_statement_choking_hazard_is_a_small_ball',
 'cand_cpsia_cautionary_statement_contains_small_magnets',
 'key_ordering_channel_jewelry_ordering_channel',
 'key_ordering_channel_software_ordering_channel']

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train1, y_test = train_test_split(x_train[feature_cols], y_train, test_size = 0.30)


In [22]:
%%time
model.fit(X_train, y_train1)

CPU times: user 26 s, sys: 195 ms, total: 26.2 s
Wall time: 4.59 s


RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       min_samples_split=1000, n_estimators=500, n_jobs=-1,
                       random_state=1)

In [23]:
y_pred = model.predict(X_test)

In [24]:
metrics.confusion_matrix(y_test, y_pred)

array([[3659, 1859],
       [2327, 3196]])

In [25]:
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("Recall OF THE MODEL: ", metrics.recall_score(y_test, y_pred))


ACCURACY OF THE MODEL:  0.6208676750294357
Recall OF THE MODEL:  0.5786710121310882


In [18]:
feature_columns = x_train.columns

In [19]:
importances = list(100*model.feature_importances_)# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_columns, importances)]# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];


Variable: key_variation_theme_id Importance: 4.72
Variable: cand_fma_qualified_price_max Importance: 4.44
Variable: key_product_type_id  Importance: 4.08
Variable: cand_pkg_width       Importance: 3.66
Variable: key_fma_qualified_price_max Importance: 3.29
Variable: cand_pkg_weight      Importance: 3.2
Variable: key_Product Group Code Importance: 3.08
Variable: key_pkg_length       Importance: 2.97
Variable: key_pkg_weight       Importance: 2.91
Variable: cand_pkg_height      Importance: 2.89
Variable: cand_version         Importance: 2.82
Variable: cand_product_type_id Importance: 2.69
Variable: key_pkg_width        Importance: 2.64
Variable: key_pkg_height       Importance: 2.52
Variable: cand_Product Group Code Importance: 2.43
Variable: cand_pkg_length      Importance: 2.07
Variable: cand_variation_theme_id Importance: 1.87
Variable: key_version          Importance: 1.77
Variable: key_upc              Importance: 1.44
Variable: key_item_weight      Importance: 1.32
Variable: key_it

In [20]:
feature_cols = []
for p in feature_importances:
    if p[1] > 0.09:
        feature_cols.append(p[0])
feature_cols

['key_variation_theme_id',
 'cand_fma_qualified_price_max',
 'key_product_type_id',
 'cand_pkg_width',
 'key_fma_qualified_price_max',
 'cand_pkg_weight',
 'key_Product Group Code',
 'key_pkg_length',
 'key_pkg_weight',
 'cand_pkg_height',
 'cand_version',
 'cand_product_type_id',
 'key_pkg_width',
 'key_pkg_height',
 'cand_Product Group Code',
 'cand_pkg_length',
 'cand_variation_theme_id',
 'key_version',
 'key_upc',
 'key_item_weight',
 'key_item_length',
 'key_item_width',
 'cand_case_pack_quantity',
 'cand_item_weight',
 'cand_upc',
 'key_item_height',
 'cand_Is Sortable_N',
 'cand_Is Sortable_Y',
 'cand_item_width',
 'cand_customer_return_policy_0',
 'cand_item_height',
 'cand_number_of_items',
 'key_ordering_channel_watch_ordering_channel',
 'key_ean',
 'key_item_display_weight',
 'key_excluded_direct_browse_node_id',
 'cand_item_length',
 'cand_excluded_direct_browse_node_id',
 'key_Is Sortable_Y',
 'cand_ean',
 'key_Is Sortable_N',
 'cand_item_display_weight',
 'key_case_pack_

In [30]:
y_pred_proba = (model.predict_proba(X_test)[:,1] >= 0.4).astype(int)
print("Accuracy and recall : {0}, {1}".format(metrics.accuracy_score(y_test, y_pred_proba), metrics.recall_score(y_test, y_pred_proba)))

Accuracy and recall : 0.5254958789964677, 0.9808075321383306


In [32]:
cm_proba = metrics.confusion_matrix(y_test, y_pred_proba)
cm_proba

array([[ 385, 5133],
       [ 106, 5417]])