# I. Import libraries

In [None]:
#Import libraries
import pandas as pd 
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
# Plotting and display configuration
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
#pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Raise pandas' display limits to 100 rows and 200 columns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

# II. Import dataset

In [None]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` in place to save memory.

    Every numeric column is inspected in turn:

    * Missing values are replaced by ``min - 1``, a sentinel strictly below
      every observed value, because integer dtypes cannot hold NaN.
    * A column whose values are all whole numbers is cast to the narrowest
      unsigned/signed integer dtype that holds its range (sentinel included).
    * Any other numeric column is cast to ``float32``.

    Non-numeric columns are left untouched. Progress for each processed
    column and the before/after memory footprint are printed.

    Args:
        props: pandas DataFrame to shrink. It is modified in place.

    Returns:
        A tuple ``(props, NAlist)``: the same DataFrame object, and the list
        of column names in which non-finite values (NaN or inf) were found.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Columns in which non-finite values were found.
    for col in props.columns:
        # Only numeric columns can be downcast; strings, dates, categoricals
        # and other non-numeric dtypes are skipped.
        if not pd.api.types.is_numeric_dtype(props[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",props[col].dtype)

        mx = props[col].max()
        mn = props[col].min()

        # Integer dtypes do not support NA, therefore NA needs to be filled.
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            if props[col].isna().any():
                sentinel = mn - 1
                # Assign back instead of a chained ``inplace`` fill, which is
                # not guaranteed to write through to ``props``.
                props[col] = props[col].fillna(sentinel)
                # The sentinel is now the smallest value in the column; the
                # dtype choice below must account for it (e.g. min 0 -> -1
                # must not end up in an unsigned dtype).
                mn = sentinel

        # Test whether the column holds only whole numbers. The int64 probe
        # is skipped when infinities are present, since they cannot be cast.
        IsInt = False
        filled = props[col].fillna(0)
        if np.isfinite(filled).all():
            asint = filled.astype(np.int64)
            # Sum absolute differences so that positive and negative
            # fractional parts cannot cancel each other out.
            IsInt = (props[col] - asint).abs().sum() < 0.01

        # Make Integer/unsigned Integer datatypes
        if IsInt:
            if mn >= 0:
                if mx < 255:
                    props[col] = props[col].astype(np.uint8)
                elif mx < 65535:
                    props[col] = props[col].astype(np.uint16)
                elif mx < 4294967295:
                    props[col] = props[col].astype(np.uint32)
                else:
                    props[col] = props[col].astype(np.uint64)
            else:
                if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                    props[col] = props[col].astype(np.int8)
                elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                    props[col] = props[col].astype(np.int16)
                elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                    props[col] = props[col].astype(np.int32)
                elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                    props[col] = props[col].astype(np.int64)

        # Make float datatypes 32 bit
        else:
            props[col] = props[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist




# Load the 2016 and 2017 transaction files and property tables.
print('Loading data...')

# 2016: transactions first, then the property table, downcast right after reading.
train_2016 = pd.read_csv(
    '../input/zillow-prize-1/train_2016_v2.csv',
    low_memory=False,
)
properties_2016, NAlist_2016 = reduce_mem_usage(
    pd.read_csv('../input/zillow-prize-1/properties_2016.csv', low_memory=False)
)

# 2017: same two steps.
train_2017 = pd.read_csv(
    '../input/zillow-prize-1/train_2017.csv',
    low_memory=False,
)
properties_2017, NAlist_2017 = reduce_mem_usage(
    pd.read_csv('../input/zillow-prize-1/properties_2017.csv', low_memory=False)
)



# III. Exploratory data analysis

## III.1 Merging the data

In [None]:
# Preview the first rows of the downcast 2016 properties table.
properties_2016.head()

In [None]:
# Attach each transaction's property attributes for exploratory analysis.
print('Merging the data...')

# Left joins keep every transaction row, even one whose parcel has no
# matching property record.
df_train_2016 = train_2016.merge(properties_2016, how='left', on='parcelid')
df_train_2017 = train_2017.merge(properties_2017, how='left', on='parcelid')

# Stack both years. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; otherwise each yearly frame contributes its own 0-based
# labels and the combined index contains duplicates.
full_df = pd.concat([df_train_2017, df_train_2016], ignore_index=True)
# Report the size of the combined training frame.
print('Our dataset contains {} rows and {} columns.'.format(full_df.shape[0], full_df.shape[1]))

## III.2 Data analysis

### III.2.1 Data type check

In [None]:
# Pie chart showing how many columns use each dtype (informational only).
sns.set_theme(style="whitegrid")
plt.title('Data types repartition')
dtype_counts = full_df.dtypes.value_counts()
dtype_counts.plot.pie()
print('We can see that most of our data are numerical values')

### III.2.2 Dataset Columns

In [None]:
# List the column labels of the merged training frame.
full_df.columns

Renaming the columns for a better understanding of our features, as the original names are a bit confusing at first.

In [None]:
"""
    Assign better names to all feature columns of 'properties' table
"""
def rename_columns(df):
    """Rename the raw 'properties' feature columns of ``df`` to clearer names.

    The rename is applied in place; columns that are not listed in the
    mapping keep their current name.

    Args:
        df: pandas DataFrame whose columns use the raw property names.

    Returns:
        None. ``df`` itself is modified.
    """
    df.rename(columns={
        'parcelid': 'parcelid',  # Unique identifier of parcels
        'airconditioningtypeid': 'cooling_id',  # type of cooling system (if any), 1~13
        'architecturalstyletypeid': 'architecture_style_id',  # Architectural style of the home, 1~27
        'basementsqft': 'basement_sqft',  # Size of the basement
        'bathroomcnt': 'bathroom_cnt',  # Number of bathrooms (including fractional bathrooms)
        'bedroomcnt': 'bedroom_cnt',  # Number of bedrooms
        'buildingclasstypeid': 'framing_id',  # The building framing type, 1~5
        'buildingqualitytypeid': 'quality_id',  # building condition from best (lowest) to worst (highest)
        'calculatedbathnbr': 'bathroom_cnt_calc',  # Same meaning as 'bathroom_cnt'?
        'decktypeid': 'deck_id',  # Type of deck (if any)
        'finishedfloor1squarefeet': 'floor1_sqft',  # Size of finished living area on first floor
        'calculatedfinishedsquarefeet': 'finished_area_sqft_calc',  # calculated total finished living area
        'finishedsquarefeet12': 'finished_area_sqft',  # Same meaning as 'finished_area_sqft_calc'?
        'finishedsquarefeet13': 'perimeter_area',  # Perimeter living area
        'finishedsquarefeet15': 'total_area',  # Total area
        'finishedsquarefeet50': 'floor1_sqft_unk',  # Same meaning as 'floor1_sqft'?
        'finishedsquarefeet6': 'base_total_area',  # Base unfinished and finished area
        'fips': 'fips',  # Federal Information Processing Standard code
        'fireplacecnt': 'fireplace_cnt',  # Number of fireplaces in the home (if any)
        'fullbathcnt': 'bathroom_full_cnt',  # Number of full bathrooms
        'garagecarcnt': 'garage_cnt',  # Total number of garages
        'garagetotalsqft': 'garage_sqft',  # Total size of the garages
        'hashottuborspa': 'spa_flag',  # Whether the home has a hot tub or spa
        'heatingorsystemtypeid': 'heating_id',  # type of heating system, 1~25
        'latitude': 'latitude',  # latitude of the middle of the parcel multiplied by 1e6
        'longitude': 'longitude',  # longitude of the middle of the parcel multiplied by 1e6
        'lotsizesquarefeet': 'lot_sqft',  # Area of the lot in sqft
        'poolcnt': 'pool_cnt',  # Number of pools in the lot (if any)
        'poolsizesum': 'pool_total_size',  # Total size of the pools
        'pooltypeid10': 'pool_unk_1',
        'pooltypeid2': 'pool_unk_2',
        'pooltypeid7': 'pool_unk_3',
        'propertycountylandusecode': 'county_landuse_code',
        'propertylandusetypeid': 'landuse_type_id',  # Type of land use the property is zoned for, 25 categories
        'propertyzoningdesc': 'zoning_description',  # Allowed land uses (zoning) for that property
        'rawcensustractandblock': 'census_1',
        'regionidcity': 'city_id',  # City in which the property is located (if any)
        'regionidcounty': 'county_id',  # County in which the property is located
        'regionidneighborhood': 'neighborhood_id',  # Neighborhood in which the property is located
        'regionidzip': 'region_zip',
        'roomcnt': 'room_cnt',  # Total number of rooms in the principal residence
        'storytypeid': 'story_id',  # Type of floors in a multi-story house, 1~35
        'threequarterbathnbr': 'bathroom_small_cnt',  # Number of 3/4 bathrooms
        'typeconstructiontypeid': 'construction_id',  # Type of construction material, 1~18
        'unitcnt': 'unit_cnt',  # Number of units the structure is built into (2=duplex, 3=triplex, etc)
        'yardbuildingsqft17': 'patio_sqft',  # Patio in yard
        'yardbuildingsqft26': 'storage_sqft',  # Storage shed/building in yard
        'yearbuilt': 'year_built',  # The year the principal residence was built
        'numberofstories': 'story_cnt',  # Number of stories or levels the home has
        'fireplaceflag': 'fireplace_flag',  # Whether the home has a fireplace
        'structuretaxvaluedollarcnt': 'tax_structure',
        'taxvaluedollarcnt': 'tax_parcel',
        'assessmentyear': 'tax_year',  # The year of the property tax assessment (2015 for 2016 data)
        'landtaxvaluedollarcnt': 'tax_land',
        'taxamount': 'tax_property',
        'taxdelinquencyflag': 'tax_overdue_flag',  # Property taxes are past due as of 2015
        'taxdelinquencyyear': 'tax_overdue_year',  # Year for which the unpaid property taxes were due
        'censustractandblock': 'census_2',
    }, inplace=True)
        
# Rename the merged training frame's columns in place (the function returns nothing).
rename_columns(full_df)

In [None]:
#full_df = full_df.drop('parcelid', axis=1)

In [None]:
# Preview the first rows of the merged frame after the column rename.
full_df.head()

### III.2.3 Missing Values check

In [None]:
# Horizontal bar chart of the share of missing values in each column,
# sorted from the most complete column to the emptiest.
plt.figure(figsize=(13, 40))
plt.rcParams['axes.facecolor'] = '#eee'
plt.rc('grid', color='#fff')
missing_pct = full_df.isnull().mean(axis=0) * 100
missing_pct.sort_values().plot.barh(color="Lightblue")
plt.xlim(xmax=100)
plt.title("Missing values rate", fontsize=18)
plt.xlabel("percentage", fontsize=14)

We notice there are a lot of columns with more than 90 % missing values.
Let's check if our target variable has missing values.

In [None]:
# Count the missing entries in the target column and report them.
target_missing = full_df['logerror'].isnull().sum()
print('Our target variable "logerror" has {} missing value(s)'.format(target_missing))

# IV. Splitting the dataset into the train set and the test set

In [None]:
# Target vector.
y = full_df['logerror']
# Feature matrix: everything except the target and the columns listed below.
excluded_cols = ['logerror', 'transactiondate', 'county_landuse_code', 'zoning_description']
X = full_df.drop(columns=excluded_cols)

#print(f"X shape: {X.shape}")
#print(f"y shape: {y.shape}")

For preprocessing purposes, I identify the numerical columns and the object (categorical) columns.

In [None]:
# Numeric features: select every numeric dtype, not only float64/int64.
# reduce_mem_usage downcasts the property tables to float32 and narrow
# integer dtypes, so a float64/int64-only filter would leave most numeric
# columns out of numeric_cols.
numeric_cols = X.select_dtypes(include="number").columns
# Categorical features: the columns stored with object dtype.
category_cols = X.select_dtypes(include="object").columns

In [None]:
# For each categorical column, show its distinct values and then how many
# distinct non-missing values it has.
for col in category_cols:
    print("Unique values of the column {} : {}".format(col, X[col].unique()))
    # The second line reports a count, so it is labelled as such.
    print("Number of unique values of the column {} : {}".format(col, X[col].nunique(dropna=True)))

Since spa_flag, fireplace_flag and tax_overdue_flag have only one unique value and it's either True or 'Y', we could replace the missing values with False when the unique value is True and 'N' when the unique value is 'Y'.

In [None]:
# Flags recorded only as True: a missing entry is filled with False.
for flag_col in ('spa_flag', 'fireplace_flag'):
    X[flag_col] = X[flag_col].fillna(False)
# Flag recorded only as 'Y': a missing entry is filled with 'N'.
X['tax_overdue_flag'] = X['tax_overdue_flag'].fillna('N')

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

# Report the shape of each part.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Preview the first rows of the training features.
X_train.head()

# V. XGBoost model

## V.1 Imputing missing values for numerical and categorical variables

In [None]:
# Preprocessing for numerical data: fill missing values with a constant
# (no fill_value is given, so SimpleImputer's default constant is used).
num_imp = SimpleImputer(missing_values=np.nan, strategy="constant")
# Preprocessing for categorical data: fill missing values with the most
# frequent one, then one-hot encode; categories unseen at fit time are ignored.
cat_imp = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Send each column group through its own preprocessing step.
col_trans = ColumnTransformer(
    transformers=[
        ('numerical', num_imp, numeric_cols),
        ('category', cat_imp, category_cols)
    ]
)

# XGBoost regressor (fixed seed for reproducibility)
model_xgboost = XGBRegressor(random_state=0)

# Full pipeline: preprocessing followed by the model.
pipe = Pipeline(steps=[
    ('preprocessor', col_trans),
    ('model', model_xgboost)
])

# Hyper-parameter grid; the 'model__' prefix targets the pipeline's 'model' step.
parameters = {
    'model__n_estimators': [100, 120, 150, 200],
    'model__learning_rate': [0.02, 0.05, 0.07]
}

# 3-fold cross-validated search over the grid.
search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=3
)

# Fit the model
search.fit(X_train, y_train)

print('-----')
print(f'Best parameters {search.best_params_}')
# No `scoring` is passed to GridSearchCV, so candidates are ranked with the
# estimator's own default score; for a regressor that is R^2, not an accuracy.
print(
    'Mean cross-validated R^2 score of the best_estimator: '
    f'{search.best_score_:.3f}'
)
print('-----')

In [None]:
# Score of the fitted search on the training data itself.
# NOTE(review): no `scoring` was passed to GridSearchCV, so this is presumably
# the estimator's default score — confirm before comparing it with the MAE below.
print(search.score(X_train, y_train))

In [None]:
# Predict the held-out rows and wrap the result in a DataFrame for display.
y_pred = pd.DataFrame(search.predict(X_test))
y_pred.head()

In [None]:
# Mean absolute error between the held-out targets and the predictions.
test_mae = mean_absolute_error(y_test, y_pred)
print('Mean absolute error: ', test_mae)

In [None]:
# Read the submission template and add a lowercase copy of its id column
# to use as the join key.
sample_sub = pd.read_csv('../input/zillow-prize-1/sample_submission.csv')
sample_sub['parcelid'] = sample_sub['ParcelId']

# X_valid is a second name for properties_2016 (no copy is made here).
X_valid = properties_2016

# Left-join the property attributes onto the template rows.
sub = pd.merge(sample_sub, X_valid, how='left', on='parcelid')

In [None]:
# Preview the template rows joined with the property attributes.
sub.head()

In [None]:
# Rename the property columns to the names used for training.
# X_valid is the same object as properties_2016 and the rename is done in
# place, so properties_2016 carries the new column names afterwards too.
rename_columns(X_valid)

In [None]:
# Show floats with four decimals from here on.
pd.set_option('display.float_format', lambda value: '%.4f' % value)
# Drop the two text columns that were also removed from the training features.
X_valid = X_valid.drop(columns=['county_landuse_code', 'zoning_description'])
# Predict for every row of the property table and preview the result.
predictions = search.predict(X_valid)
pd.DataFrame(predictions).head()

In [None]:
# Preview the held-out features (for comparison with X_valid).
X_test.head()

In [None]:
# Preview the features the submission predictions were computed from.
X_valid.head()

In [None]:
# Align the predictions with the submission rows by parcel id rather than by
# row position: `predictions` follows the row order of X_valid, while `sub`
# follows the row order of the sample submission, and nothing guarantees
# that the two orders are the same.
# NOTE(review): assumes parcel ids are unique in X_valid — confirm.
pred_by_parcel = pd.Series(
    predictions,
    index=X_valid['parcelid'].to_numpy(dtype='int64'),
)
aligned_predictions = sub['ParcelId'].map(pred_by_parcel).to_numpy()

# The same prediction is written to every month column of the submission.
for month_col in ('201610', '201611', '201612', '201710', '201711', '201712'):
    sub[month_col] = aligned_predictions

In [None]:
# Preview the submission frame with the month columns filled in.
sub.head()

In [None]:
# Keep only the id column and the six month columns.
submission_columns = ['ParcelId', '201610', '201611', '201612', '201710', '201711', '201712']
sub = sub[submission_columns]
sub.head()

In [None]:
# Write the submission file without the index, floats with four decimals.
print('Writing csv ...')
sub.to_csv(
    'xgboost_model.csv',
    index=False,
    float_format='%.4f',  # Thanks to @inversion
)