In [69]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb

pd.options.display.max_columns = 100

In [70]:
train_features = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')
train_labels = pd.read_csv('train_labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')

train_features.shape, test_features.shape, train_labels.shape, sample_submission.shape

((59400, 40), (14358, 40), (59400, 2), (14358, 2))

In [71]:
train = train_features.copy()

In [72]:
year_mean = train[train['construction_year']>0]['construction_year'].mean()
year_mean = round(year_mean)

test_year_mean = test_features[test_features['construction_year']>0]['construction_year'].mean()
test_year_mean = round(test_year_mean)


train.loc[train['construction_year']==0, 'construction_year'] = int(year_mean)
test_features.loc[test_features['construction_year']==0,'construction_year'] = int(test_year_mean)

In [107]:
import random

def random_std(year):

    return year - random.randint(-10,10)

def random_tsh(amount):

    return amount + random.uniform(-1062.35, 1957.82)

In [109]:
random_tsh(200)

56.4787020157396

In [93]:
train_tsh_mean = train[train['amount_tsh']>0]['amount_tsh'].mean()
test_tsh_mean = test_features[test_features['amount_tsh']>0]['amount_tsh'].mean()

train.loc[train['amount_tsh']==0,'amount_tsh']=float(train_tsh_mean)
test_features.loc[test_features['amount_tsh']==0,'amount_tsh']=float(test_tsh_mean)

In [110]:
# train.loc[train['construction_year']==1997,'construction_year'].apply(random_std(),axis=1)
# test_features.loc[test_features['construction_year']==1997,'construction_year'].apply(random_std(),axis=1)
# train.loc[train['amount_tsh']==train_tsh_mean, 'amount_tsh'].apply(random_tsh(),axis=1)
# test_features.loc[test_features['amount_tsh']==test_tsh_mean, 'amount_tsh'].apply(random_tsh(),axis=1)

# train.shape, test_features.shape

In [120]:
train.loc[train['construction_year']==1997,'construction_year'].apply(random_std)
test_features.loc[test_features['construction_year']==1997,'construction_year'].transform(random_std)
train.loc[train['amount_tsh']==train_tsh_mean, 'amount_tsh'].transform(random_tsh)
test_features.loc[test_features['amount_tsh']==test_tsh_mean, 'amount_tsh'].transform(random_tsh)

train.shape, test_features.shape

((59400, 40), (14358, 40))

In [79]:
train.describe(exclude=np.number)

Unnamed: 0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
count,59400,55765,55745,59400,59400,59029,59400,59400,59400,56066,59400,55523,31234,56344,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400,59400
unique,356,1897,2145,37400,9,19287,21,125,2092,2,1,12,2696,2,18,13,7,12,5,7,7,8,6,5,5,10,7,3,7,6
top,2011-03-15,Government Of Tanzania,DWE,none,Lake Victoria,Madukani,Iringa,Njombe,Igosi,True,GeoData Consultants Ltd,VWC,K,True,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
freq,572,9084,17402,3563,10248,508,5294,2503,307,51011,59400,36793,682,38852,26780,26780,26780,40507,52490,25348,25348,50818,50818,33186,33186,17021,17021,45794,28522,34625


In [122]:
train['construction_year'].value_counts()

1997    21353
2010     2645
2008     2613
2009     2533
2000     2091
2007     1587
2006     1471
2003     1286
2011     1256
2004     1123
2012     1084
2002     1075
1978     1037
1995     1014
2005     1011
1999      979
1998      966
1990      954
1985      945
1980      811
1996      811
1984      779
1982      744
1994      738
1972      708
1974      676
1992      640
1993      608
2001      540
1988      521
1983      488
1975      437
1986      434
1976      414
1970      411
1991      324
1989      316
1987      302
1981      238
1977      202
1979      192
1973      184
2013      176
1971      145
1960      102
1967       88
1963       85
1968       77
1969       59
1964       40
1962       30
1961       21
1965       19
1966       17
Name: construction_year, dtype: int64

In [24]:
train['funder'].fillna('?',inplace=True)
train['installer'].fillna('?',inplace=True)
train['subvillage'].fillna('?',inplace=True)
train['public_meeting'].fillna('?',inplace=True)
train['scheme_management'].fillna('?',inplace=True)
train['scheme_name'].fillna('?',inplace=True)
train['permit'].fillna('?',inplace=True)
test_features['funder'].fillna('?',inplace=True)
test_features['installer'].fillna('?',inplace=True)
test_features['subvillage'].fillna('?',inplace=True)
test_features['public_meeting'].fillna('?',inplace=True)
test_features['scheme_management'].fillna('?',inplace=True)
test_features['scheme_name'].fillna('?',inplace=True)
test_features['permit'].fillna('?',inplace=True)

In [25]:
train['age'] = (2019 - train['construction_year']).astype(int)
test_features['age'] = (2019 - test_features['construction_year']).astype(int)

In [26]:
train_days_since = np.array(train['date_recorded'].values, dtype='datetime64')
test_days_since = np.array(test_features['date_recorded'].values,dtype='datetime64')

train_birth = train['construction_year'].astype(str)
test_birth = test_features['construction_year'].astype(str)

In [27]:
train_years_since = np.datetime_as_string(train_days_since, unit='Y')
test_years_since = np.datetime_as_string(test_days_since, unit='Y')


In [28]:
train_years_since = train_years_since.astype(int)
test_years_since = test_years_since.astype(int)
train_birth = train_birth.astype(int)
test_birth = test_birth.astype(int)

in_train_years = []
in_test_years = []

for i in range(0,len(train_years_since)):
    x = train_years_since[i] - train_birth[i]
    in_train_years.append(x)
    

for i in range(0,len(test_years_since)):
    x = test_years_since[i] - test_birth[i]
    in_test_years.append(x)

In [29]:
train['years_until_record'] = in_train_years
test_features['years_until_record'] = in_test_years

In [30]:
drop_these=[
    'date_recorded',
    'wpt_name',
    'recorded_by',
    'lga',
    'ward',
    'scheme_name', 
    'scheme_management',
    'funder',
    'installer',
    'num_private',
    'subvillage',
    'basin',
    'longitude',
    'latitude',
    'waterpoint_type_group',
    'extraction_type_group',
    'extraction_type_class',
    'management_group', 
]
train.drop(columns=drop_these,inplace=True)
test_features.drop(columns=drop_these,inplace=True)

train.shape, test_features.shape

((59400, 23), (14358, 23))

In [37]:
str_cols = train.select_dtypes(include=[object])

In [40]:
hashing = ce.HashingEncoder(
    cols = list(str_cols.columns),
    verbose=1,
    drop_invariant=True,
    return_df=True,
)

train_features = hashing.fit_transform(train)
test_features = hashing.fit_transform(test_features)

train_features.shape, test_features.shape

((59400, 17), (14358, 17))

In [41]:
train_features

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,id,amount_tsh,gps_height,region_code,district_code,population,construction_year,age,years_until_record
0,3,0,0,2,3,0,4,2,69572,6000.000000,1390,11,5,109,1999,20,12
1,4,0,2,1,3,2,2,0,8776,1062.351942,1399,20,2,280,2010,9,3
2,6,1,0,0,3,1,3,0,34310,25.000000,686,21,4,250,2009,10,4
3,7,0,0,2,4,0,1,0,67743,1062.351942,263,90,63,58,1986,33,27
4,4,0,2,3,3,1,0,1,19728,1062.351942,0,18,1,0,1997,22,14
5,3,1,1,2,2,0,3,2,9944,20.000000,0,4,8,1,2009,10,2
6,4,0,0,3,4,0,3,0,19816,1062.351942,0,17,3,0,1997,22,15
7,0,0,2,2,3,2,3,2,54551,1062.351942,0,17,3,0,1997,22,15
8,3,0,0,7,3,0,0,1,53934,1062.351942,0,14,6,0,1997,22,15
9,4,0,0,2,3,0,3,2,46144,1062.351942,0,18,1,0,1997,22,14
