In [270]:
# Import packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor
from pandas.api.types import CategoricalDtype

# Set options
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 4000

# Import data
train = pd.read_csv("../01-data/train.csv", low_memory = False)
test = pd.read_csv("../01-data/test.csv", low_memory = False)

In [271]:
# Join train and test for processing - otherwise they end up with a different # of columns
train_labels = train['satisfied']
del train['satisfied']
train['data'] = 1
test['data'] = 0

In [272]:
all_data = train.append(test)

In [273]:
# 0,1 or 1/2 - just need to treat NA
binary = ['v6','v7','v8','v9','v10','v11','v12','v14','v15','v16','v18', 'v21','v22','v23','v24','v26','v27',
         'v28','v29','v30','v31','v32','v33','v34','v36','v37','v38','v39','v40','v41','v42','v43','v44','v45',
         'v46','v47','v48','v49','v50','v51','v52','v53','v54','v55','v77','v85','v90','v91','v92','v93','v94',
         'v106','v107','v108','v123','v152','v157','v162','v165','v166','v171','v172','v173','v175','v176','v187',
         'v188','v217','v218','v221','v241','v242','v243','v244','v245','v246','v247','v254','v256','v257']

# No order - create "other" col and treat NA
nominal = ['v4','v5','v17','v20','v25','v57','v59','v61','v63','v70','v71','v72','v73','v78','v102','v103','v150',
          'v151','v154','v155','v158','v159','v160','v161','v163','v164','v167','v169','v170','v174','v190','v191',
          'v196','v197','v198','v199','v208','v209','v210','v211','v216','v231','v248','v255','cntry']

# Just treat NA
numeric = ['v3','v64','v69','v100','v124','v125','v126','v127','v128','v129','v130','v131','v132','v134','v168',
           'v228','v229','v230','v250','v251','v252','v133']

# Has an order - create "other" col and treat NA
ordinal = ['v1','v2','v13','v19', 'v35','v56','v58','v60','v62','v65','v66','v67','v68','v74','v75','v79',
          'v80','v81','v82','v83','v84','v99','v101','v104','v105','v110','v111','v112','v113','v114','v115',
          'v116','v117','v118','v119','v122','v135','v136','v137','v138','v139','v140','v141','v142','v143',
          'v144','v145','v146','v147','v148','v149','v177','v181','v182','v185','v186','v219','v220',
          'v222','v223','v224','v225','v226','v227','v232','v233','v234','v235','v236','v237','v238','v239',
          'v240','v249','v253','v258','v263','v264','v265','v266','v76','v98','v109','v120','v121','v153','v156',
          'v178','v179','v180','v184','v189','v183']

# Drop - 5th+ member of household vars
to_drop = ['v86','v87','v88','v89','v95','v96','v97','v192','v193','v194','v195','v200','v201','v202','v203',
          'v204','v205','v206','v207','v212','v213','v214','v215','v259','v260','v261','v262','v267','v268','v269',
           'v270']

# Drop unwanted vars
all_data = all_data.drop(to_drop, axis = 1)

## process:

In [274]:
#### Binary vars - 0,1 or 1,2

# Replace NaN, ".c", ".d" with ".b" - corresponds with "No Answer"
all_data[binary] = all_data[binary].fillna(".b")
all_data[binary] = all_data[binary].replace(".c", ".b")
all_data[binary] = all_data[binary].replace(".d", ".b")

# One-hot encode them 
all_data_encoded_1 = pd.get_dummies(all_data, prefix=binary, columns=binary)

#### Nominal (no order) - create "other" column and treat NA

# Replace NaN, ".c", ".d" with ".b" - corresponds with "No Answer"
all_data_encoded_1[nominal] = all_data_encoded_1[nominal].fillna(".b")
all_data_encoded_1[nominal] = all_data_encoded_1[nominal].replace(".", ".b")
all_data_encoded_1[nominal] = all_data_encoded_1[nominal].replace(".c", ".b")
all_data_encoded_1[nominal] = all_data_encoded_1[nominal].replace(".d", ".b")

# One-hot encode them 
all_data_encoded_2 = pd.get_dummies(all_data_encoded_1, prefix=nominal, columns=nominal)

#### Numeric - Just treat NA

# v69,v168,v250,v251,252: .a corresponds to NA, i.e. 0
all_data_encoded_2['v69'] = all_data_encoded_2['v69'].replace(".a", 0)
all_data_encoded_2['v168'] = all_data_encoded_2['v168'].replace(".a", 0)
all_data_encoded_2['v250'] = all_data_encoded_2['v250'].replace(".a", 0)
all_data_encoded_2['v251'] = all_data_encoded_2['v251'].replace(".a", 0)
all_data_encoded_2['v252'] = all_data_encoded_2['v252'].replace(".a", 0)

# Now get rid of the rest of NA
all_data_encoded_2[numeric] = all_data_encoded_2[numeric].replace([".a",".b",".c",".d","."], [np.nan,np.nan,np.nan,np.nan,np.nan])

# Impute rest of the NA with mean
for col in numeric:
    all_data_encoded_2[col] = pd.to_numeric(all_data_encoded_2[col], errors = "coerce")
    colmean = np.nanmean(all_data_encoded_2[col])
    all_data_encoded_2[col] = all_data_encoded_2[col].fillna(colmean)

## TAKES REALLY LONG TO RUN
# # Now smart impute the rest

# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer

# features = list(set(all_data_encoded_2.columns.tolist()) - set(binary) - set(nominal) - set(ordinal))

# imp = IterativeImputer(ExtraTreesRegressor(n_estimators=10, random_state=0),verbose=2)
# imp.fit(all_data_encoded_2[features])
# imputed_df = imp.transform(all_data_encoded_2[features])
# imputed_df = pd.DataFrame(imputed_df, columns=all_data_encoded_2[features].columns)

#### Ordinal - create "other" column, treat NA, set as category type

# Replace NaN, ".c", ".d" with ".b" - corresponds with "No Answer"
all_data_encoded_2[ordinal] = all_data_encoded_2[ordinal].fillna(".b")
all_data_encoded_2[ordinal] = all_data_encoded_2[ordinal].replace(".c", ".b")
all_data_encoded_2[ordinal] = all_data_encoded_2[ordinal].replace(".d", ".b")

# Impute all .a with 0
all_data_encoded_2[ordinal] = all_data_encoded_2[ordinal].replace(".a", 0)

# Now get rid of the rest of NA
all_data_encoded_2[ordinal] = all_data_encoded_2[ordinal].replace([".b",".c",".d","."], [np.nan,np.nan,np.nan,np.nan])

# Impute all rest NA with mean
for col in ordinal:
    all_data_encoded_2[col] = pd.to_numeric(all_data_encoded_2[col], errors = "coerce")
    colmean = np.nanmean(all_data_encoded_2[col])
    all_data_encoded_2[col] = all_data_encoded_2[col].fillna(round(colmean))

# Convert to category, preserving order
cat_type = CategoricalDtype(categories=range(0,3000),ordered=True)
all_data_encoded_2[ordinal] = all_data_encoded_2[ordinal].astype(cat_type)

## Checking if any missing/nan remaining, and shape
nan_values = all_data_encoded_2.isna()
nan_columns = nan_values.any()

columns_with_nan = all_data_encoded_2.columns[nan_columns].tolist()
print(columns_with_nan)
all_data_encoded_2.shape


[]


(39325, 3469)

## split:

In [275]:
test = all_data_encoded_2[all_data_encoded_2['data']==0]
train = all_data_encoded_2[all_data_encoded_2['data']==1]
train['satisfied'] = train_labels

del test['data']
del train['data']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [277]:
test.to_csv("test_min_processing.csv")
train.to_csv("train_min_processing.csv")