In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
import os
# Show which files are available in ../input (train.csv and test.csv are read from there below).
print(os.listdir("../input"))

## Loading Data

In [None]:
# Load the raw training and test tables from the input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

print("Shape of training set: {}".format(train.shape))
# Label fixed: this line reports the test frame, not the training frame.
print("Shape of test set: {}".format(test.shape))

train.head()

## Feature Engineering

In [None]:
# Recover `dependency` from its squared column with a vectorised square root,
# applied identically to the training and test frames.
train['dependency'] = np.sqrt(train['SQBdependency'])
test['dependency'] = np.sqrt(test['SQBdependency'])

# Only the last expression of a cell is rendered, so the two summaries are
# combined into one table instead of calling describe() twice.
pd.concat(
    [train['dependency'].describe(), test['dependency'].describe()],
    axis=1,
    keys=['train', 'test'],
)

In [None]:
## Filter out for ONLY heads of household in TRAINING DATA
# NOTE: the filter below is commented out, so `train` is NOT restricted to
# rows with parentesco1 == 1 and the first count is simply the number of rows.
#train = train[train['parentesco1']==1]

# Compare the row count with the number of head rows and of distinct households
print("Rows in training set: {}".format(train.shape[0]))
print("Heads of household: {}".format(int((train['parentesco1'] == 1).sum())))
print("Number of unique house IDs: {}".format(len(train['idhogar'].unique())))

In [None]:
# Drop the squared columns (every SQB* column and agesq) that we won't use.
keep_cols = [col for col in train.columns
             if not col.startswith('SQB') and col != 'agesq']

# .copy() makes the reduced frames independent objects, so the column
# assignments in later cells do not write into a slice of the raw data
# (which triggers pandas' SettingWithCopyWarning).
train = train[keep_cols].copy()
# `Target` is the label used for training below and is not selected for `test`;
# exclude it by name rather than assuming it is the last entry of keep_cols.
test = test[[col for col in keep_cols if col != 'Target']].copy()

print("Columns in Training set: {}".format(train.shape[1]))
print("Columns in Test set: {}".format(test.shape[1]))


### Create some new columns & Change others

In [None]:
# Derive the same two ratio features on the training and the test frame.
for frame in (train, test):
    # children per adult
    frame['child_per_adult'] = frame['hogar_nin'].div(frame['hogar_adul'])
    # rooms per person
    frame['room_per_person'] = frame['rooms'].div(frame['tamviv'])



In [None]:
# Build the combined 0/1 indicator columns on both frames. Each flag is the
# conjunction of two conditions, stored as float (1.0 where both hold, else 0.0).
for frame in (train, test):
    # walls and roof bad
    walls_roof_bad = (frame['epared1'] == 1) & (frame['etecho1'] == 1)
    frame['bad_walls_roof'] = walls_roof_bad.astype(float)

    # bad walls, roof and floor
    all_three_bad = (frame['bad_walls_roof'] == 1) & (frame['eviv1'] == 1)
    frame['bad_walls_roof_floor'] = all_three_bad.astype(float)

    # no electricity or water inside
    lacks_utilities = (frame['abastaguadentro'] != 1) & (frame['noelec'] == 1)
    frame['no_elec_or_water'] = lacks_utilities.astype(float)

    # has bathroom and fridge
    bath_fridge = (frame['v14a'] == 1) & (frame['refrig'] == 1)
    frame['bath_and_fridge'] = bath_fridge.astype(float)

    # has computer and TV
    pc_tv = (frame['computer'] == 1) & (frame['television'] == 1)
    frame['pc_and_tv'] = pc_tv.astype(float)


In [None]:
### Reworked my original handling of this feature based on https://www.kaggle.com/skooch/xgboost
for frame in (train, test):
    for edu_col in ('edjefa', 'edjefe'):
        # fill "no"s for education with 0s
        frame.loc[frame[edu_col] == "no", edu_col] = 0

        # if education is "yes" and person is head of household, fill with escolari
        head_says_yes = (frame[edu_col] == "yes") & (frame['parentesco1'] == 1)
        frame.loc[head_says_yes, edu_col] = frame.loc[head_says_yes, "escolari"]

        # this field is supposed to be interaction between gender and escolari,
        # but it isn't clear what "yes" means, so every remaining "yes" becomes 4
        frame.loc[frame[edu_col] == "yes", edu_col] = 4

        # convert to int for our models
        frame[edu_col] = frame[edu_col].astype("int")

    # create feature with max education of either head of household
    frame['HoH_EduMax'] = frame[['edjefa', 'edjefe']].max(axis=1)

    # fill some NAs: -1 for v2a1, 0 for the two count columns
    frame['v2a1'] = frame['v2a1'].fillna(-1)
    frame['qmobilephone'] = frame['qmobilephone'].fillna(0)
    frame['v18q1'] = frame['v18q1'].fillna(0)

In [None]:
# keep only the training rows that have a meaneduc value (this drops rows, not columns)
train = train[train['meaneduc'].notnull()]

In [None]:
# Shape of the training frame after the rows with a null meaneduc were removed.
train.shape

In [None]:
### ONLY want to scale numerical columns
# These are the columns handed to the StandardScaler later on; every other
# column bypasses scaling.
NUMERICAL_COLUMNS = [
    'v2a1', 'hacdor', 'hacapo', 'rooms', 'bedrooms', 'overcrowding',
    'dependency', 'meaneduc', 'v18q1',
    'r4h1', 'r4h2', 'r4h3',
    'r4m1', 'r4m2', 'r4m3',
    'r4t1', 'r4t2', 'r4t3',
    'hogar_adul', 'hogar_nin', 'hogar_mayor', 'hogar_total',
    'room_per_person', 'HoH_EduMax', 'age', 'child_per_adult', 'qmobilephone',
]

In [None]:
# Number of columns listed as numerical (the ones that get scaled later).
len(NUMERICAL_COLUMNS)

#### Reverse the one-hot encoding (OHE) into categorical variables


In [None]:
## define the categories
# Each list names the one-hot columns of one categorical variable. The position
# of a column inside its list becomes the integer code of that category when
# the group is collapsed by OHE_to_cat.

WALL = ['paredblolad','paredzocalo','paredpreb','pareddes','paredmad','paredzinc','paredfibras','paredother']

FLOOR = ['pisomoscer','pisocemento','pisoother','pisonatur','pisonotiene','pisomadera']

ROOF = ['techozinc','techoentrepiso','techocane','techootro']

WATER =['abastaguadentro','abastaguafuera','abastaguano']

ELEC = ['public','planpri','noelec','coopele']

TOILET = ['sanitario1','sanitario2','sanitario3','sanitario5','sanitario6']

COOK = ['energcocinar1','energcocinar2','energcocinar3','energcocinar4']

RUBBISH = ['elimbasu1','elimbasu2','elimbasu3','elimbasu4','elimbasu5','elimbasu6']

# Fixed: 'epared1' was listed twice and 'epared2' was missing, so rows flagged
# only by epared2 fell back to code 0 and the epared2 column was never dropped.
WALL_QLTY = ['epared1', 'epared2','epared3']

ROOF_QLTY = ['etecho1','etecho2','etecho3']

FLOOR_QLTY = ['eviv1','eviv2','eviv3']

SEX = ['male', 'female']

CIVIL = ['estadocivil1','estadocivil2','estadocivil3','estadocivil4','estadocivil5','estadocivil6','estadocivil7']

H_OWNER = ['tipovivi1','tipovivi2','tipovivi3','tipovivi4','tipovivi5']

REGION = ['lugar1','lugar2','lugar3','lugar4','lugar5','lugar6']

AREA = ['area1', 'area2']


# ALL_LISTS and LIST_NAMES are parallel: entry i of LIST_NAMES is the name of
# the categorical column built from entry i of ALL_LISTS.
ALL_LISTS = [AREA, REGION, H_OWNER, CIVIL, SEX, FLOOR_QLTY, ROOF_QLTY, WALL_QLTY, RUBBISH, COOK, TOILET, ELEC, WATER, ROOF, FLOOR, WALL]
LIST_NAMES = ['AREA', 'REGION', 'H_OWNER', 'CIVIL', 'SEX', 'FLOOR_QLTY', 'ROOF_QLTY', 'WALL_QLTY', 'RUBBISH', 'COOK', 'TOILET', 'ELEC', 'WATER', 'ROOF', 'FLOOR', 'WALL']

# Guard against copy-paste slips in the definitions above.
assert len(ALL_LISTS) == len(LIST_NAMES), "ALL_LISTS and LIST_NAMES must be parallel"
assert all(len(set(cols)) == len(cols) for cols in ALL_LISTS), "duplicate column inside a one-hot group"


In [None]:
## function to change to cat variables and delete OHE columns
def OHE_to_cat(df, lists_of_cols, names):
    """Collapse groups of one-hot columns into single integer-coded columns.

    For every (cols, name) pair a new column `name` is added whose value is
    the position, within `cols`, of the first column equal to 1 in that row.
    The one-hot columns of the group are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in `lists_of_cols`. It is not
        modified; the work is done on a copy.
    lists_of_cols : sequence of list of str
        One list of one-hot column names per categorical variable.
    names : sequence of str
        Name of the output column for each group, parallel to `lists_of_cols`.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the one-hot groups replaced by the coded columns.

    Raises
    ------
    ValueError
        If `lists_of_cols` and `names` differ in length (zip would otherwise
        silently ignore the surplus entries).

    Notes
    -----
    A row in which no column of a group equals 1 receives code 0, i.e. it is
    indistinguishable from a row flagged by the first column of the group.
    """
    lists_of_cols = list(lists_of_cols)
    names = list(names)
    if len(lists_of_cols) != len(names):
        raise ValueError(
            "lists_of_cols and names must have the same length, got {} and {}".format(
                len(lists_of_cols), len(names)))

    # Work on a copy so the caller's frame does not gain columns as a side effect.
    out = df.copy()
    for cols, name in zip(lists_of_cols, names):
        cols = list(cols)
        # argmax over the boolean flags returns the position of the first True
        # in each row (0 when the row has none), which is the category code.
        flags = (out[cols] == 1).values
        out[name] = flags.argmax(axis=1)
        out = out.drop(columns=cols)
    return out

In [None]:
## Collapse the one-hot groups in the train and test sets with the same group definitions
train, test = (OHE_to_cat(frame, ALL_LISTS, LIST_NAMES) for frame in (train, test))

In [None]:
# Columns removed from both frames before modelling.
REPEAT_COLS = (
    ['v18q', 'tamhog', 'tamviv', 'rez_esc', 'hhsize']
    + ['parentesco{}'.format(i) for i in range(1, 13)]   # parentesco1 .. parentesco12
    + ['idhogar', 'mobilephone', 'edjefa', 'edjefe', 'escolari']
    + ['instlevel{}'.format(i) for i in range(1, 10)]    # instlevel1 .. instlevel9
)

In [None]:
# Remove the REPEAT_COLS from both frames. `Id` is removed from train only;
# the TEST set keeps it for now and handles it differently later.
train = train.drop(columns=REPEAT_COLS + ['Id'])
test = test.drop(columns=REPEAT_COLS)

In [None]:
# Shape of the training frame after REPEAT_COLS and Id were dropped.
train.shape

## Balancing

In [None]:
# Pull out the label column and show how many rows each class has.
y_train = train.Target
y_train.value_counts()

In [None]:
# Preview the first rows of the training frame before it is re-sampled.
train.head()

In [None]:
"""
Adapted this strategy to undersample ONLY the class-4
"""
# Look the class sizes up by label. Unpacking value_counts() positionally would
# only be correct while the classes happen to rank 4, 2, 3, 1 by frequency.
class_counts = train['Target'].value_counts()
count_class_1, count_class_2, count_class_3, count_class_4 = (
    int(class_counts.loc[label]) for label in (1, 2, 3, 4)
)

# Divide by class
train_class_1 = train[train['Target'] == 1]
train_class_2 = train[train['Target'] == 2]
train_class_3 = train[train['Target'] == 3]
train_class_4 = train[train['Target'] == 4]

# Classes 1-3 are sampled at their full size (every row kept, order shuffled);
# only class 4 is cut down, to a quarter of its rows.
train_class_1_under = train_class_1.sample(count_class_1, random_state=99)
train_class_2_under = train_class_2.sample(count_class_2, random_state=99)
train_class_3_under = train_class_3.sample(count_class_3, random_state=99)
train_class_4_under = train_class_4.sample(round(count_class_4*.25), random_state=99)


train = pd.concat([train_class_1_under, train_class_2_under, train_class_3_under, train_class_4_under], axis=0)
train.Target.value_counts()

In [None]:
# Re-derive the labels: `train` was rebuilt by the re-sampling above, so the
# y_train taken earlier no longer lines up with its rows.
y_train = train['Target']

In [None]:
train_cols = train.columns
sm = SMOTE(random_state=2)
# `fit_resample` is the supported method name; the older `fit_sample` alias has
# been removed from imbalanced-learn. The labels are passed as a plain 1-D array.
# `train` still contains the Target column at this point, which is why the
# resampled frame below can report the new class balance directly.
# NOTE(review): oversampling happens before the train/dev split further down,
# so synthetic rows in one split may be interpolated from real rows in the
# other - the dev metrics are probably optimistic. Confirm and consider
# resampling the training split only.
train_res, y_train_res = sm.fit_resample(train, y_train.values)
train_res = pd.DataFrame(train_res, columns=train_cols)
train_res.Target.value_counts()

In [None]:
# Features are every resampled column except the label; the labels are the
# target vector returned by the resampler.
X_train = train_res.drop(columns=['Target'])
y_train = y_train_res

In [None]:
# Report the shapes of the training features and of the (still unscaled) test frame.
for template, frame in (("Final Train Shape: {}", X_train),
                        ("Final TEST Shape: {}", test)):
    print(template.format(frame.shape))
print("Extra column is Id which will be handled later")

## Train, Dev split

In [None]:
# Hold out 20% of the resampled rows as a dev set; random_state fixes the split.
X_train, X_dev, y_train, y_dev = train_test_split(X_train,y_train,test_size = 0.2,random_state = 0)

## Scaling

In [None]:
# Scaler instance: fitted on the training split below, then reused unchanged
# for the dev split and for the test set.
sc_X = StandardScaler()

In [None]:
## Only Scaling the Numerical Columns not Binary
# The non-numerical columns are set aside first, because X_train / X_dev are
# narrowed to the numerical columns immediately afterwards.
X_train_bin, X_dev_bin = (part.drop(columns=NUMERICAL_COLUMNS) for part in (X_train, X_dev))
X_train, X_dev = X_train[NUMERICAL_COLUMNS], X_dev[NUMERICAL_COLUMNS]

In [None]:
### Fit the scaler on the training split only, then apply that same fit to the dev split
# Both results are wrapped in fresh DataFrames, which carry default integer
# row and column labels until the original labels are restored.
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_dev2= pd.DataFrame(sc_X.transform(X_dev))

In [None]:
# The scaler returns a bare numpy array, so the scaled frames lost their column
# names and index; copy both back from the unscaled frames they came from.
for scaled, source in ((X_train2, X_train), (X_dev2, X_dev)):
    scaled.columns = source.columns.values
    scaled.index = source.index.values

# combine the scaled numerical columns with the untouched remaining columns
X_train = pd.concat([X_train2, X_train_bin], axis=1, sort=False)
X_dev = pd.concat([X_dev2, X_dev_bin], axis=1, sort=False)

In [None]:
# check shape of the feature and label parts of both splits
for part in (X_train, X_dev, y_train, y_dev):
    print(part.shape)

In [None]:
# Inspect the first 10 rows of the recombined (scaled + unscaled) training features.
X_train.head(10)

## MODEL DEVELOPMENT

In [None]:
# A fixed random_state makes the fitted forest (and therefore every metric and
# the submission below) reproducible across kernel restarts.
random_forest = RandomForestClassifier(n_estimators=400, random_state=0)

# np.ravel hands scikit-learn the 1-D label vector it expects without
# rebinding y_train to a DataFrame, so re-running this cell is idempotent.
random_forest.fit(X_train, np.ravel(y_train))

In [None]:
### Get metrics for Dev set
y_pred = random_forest.predict(X_dev)

print("Accuracy:")
print(round(random_forest.score(X_dev, y_dev), 3))

# Macro-averaged scores, printed in this order:
#   precision - of predicted +ve, how many correct
#   recall    - of all actual +ve how many did we get
#   F1        - combines the two
for heading, metric in (("Precision score:", precision_score),
                        ("Recall score:", recall_score),
                        ("Global F1 score:", f1_score)):
    print(heading)
    print(round(metric(y_dev, y_pred, average='macro'), 3))

In [None]:
# Confusion matrix of the dev-set predictions.
class_labels = [1, 2, 3, 4]
# Passing `labels` pins the row/column order of the matrix to the axis labels
# used below and keeps it 4x4 even if a class is absent from the dev split.
# Predictions are already class labels, so no rounding is needed.
cm = confusion_matrix(y_dev, y_pred, labels=class_labels)
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
plt.figure(figsize = (10,7))
sns.set(font_scale=1.4)
sns.heatmap(df_cm, annot = True, fmt='g')
plt.ylabel('True label')
plt.xlabel('Predicted label')
# y_dev / y_pred belong to the dev split, so the message says "Dev", not "Test".
print('Dev Data Accuracy: %0.4f' % accuracy_score(y_dev, y_pred))

## Improve model

## Test

In [None]:
# Print the test shape next to the training-feature shape, then preview the test frame.
for frame in (test, X_train):
    print(frame.shape)
test.head()

In [None]:
# Keep the Id column for the submission file and remove it from the features.
test_id, test = test['Id'], test.drop('Id', axis=1)

In [None]:
# would be important to check that these are records where it's not HoH; not that we can do much..
# and 31 on 29k isn't bad
test['meaneduc'] = test['meaneduc'].fillna(-1)

test_bin = test.drop(NUMERICAL_COLUMNS, axis = 1)
# .copy() makes the numerical part an independent frame, so the assignment
# below does not write into a slice of the old frame (SettingWithCopyWarning).
test = test[NUMERICAL_COLUMNS].copy()

# some inf and nan on child_per_adult: turn both into the -1 sentinel
test['child_per_adult'] = (
    test['child_per_adult']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(-1)
)

# Summary placed last so it is actually rendered, and reflects the cleaned values.
test.describe()

In [None]:
# Scale with the scaler that was already fitted on the training split.
# The scaler returns a bare numpy array, so the original column names and index
# are supplied again while the DataFrame is built.
test2 = pd.DataFrame(
    sc_X.transform(test),
    columns=test.columns.values,
    index=test.index.values,
)

# combine the scaled numerical columns with the untouched remaining columns
test = pd.concat([test2, test_bin], axis=1, sort=False)
print(test.shape)
test.head()

In [None]:
# predict values
# The predictions get the same index as the ids, so the concat below pairs each
# Id with its own prediction even if the test index is not a plain 0..n-1 range
# (concat aligns on index labels and would otherwise yield NaN rows).
test_pred = pd.DataFrame(random_forest.predict(test), index=test_id.index)

my_preds = pd.concat([test_id, test_pred],axis=1, sort=False)
my_preds.columns = ['Id', 'Target']
my_preds.tail()

In [None]:
# Write the Id/Target predictions to the submission CSV, leaving out the DataFrame index.
my_preds.to_csv('190128_g_submission.csv', index=False)