In [1]:
# Importing all the needed libraries for this project
import numpy as np
import pandas as pd
import xgboost as xgb


In [2]:
# Read the training features, the training labels and the test features
# into pandas DataFrames (in that order), which makes exploring the data easy.
training_X, training_y, test_X = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/traindata.csv',
        'data/traindata_label.csv',
        'data/testdata.csv',
    )
)

# Preprocessing the data

We can see that the following columns contain categorical features: type and region. 
Also "Date" is not in a form that is good for machine learning.

We should consider encoding these features into a different form.

For features 'type' and 'region' our method is straightforward: we perform dummy encoding on them.

For the feature 'Date', I hypothesize that seasonal data would provide better results when it comes to fruit prices (fruits can be in or out of season). So we map each date to the season of the year it falls in, and replace the date data with this newly created 'season' feature. This seasonal data is then dummy encoded.

In [3]:
# For categorical values, a good way to include them in the analysis is either
# dummy or one-hot encoding.
# pandas provides a function for this: "get_dummies".

# Turns dates into the season in which they take place.
# The month is read from characters 5-6 of each date string (a 'YYYY-MM-DD' layout).
# For example: '2018-01-05' => 'Winter'
def dateToSeason(dates):
    """Map every date string to the name of the season it falls in.

    The month is taken from characters 5-6 of each string (``day[5:7]``),
    which matches a 'YYYY-MM-DD' layout, e.g. '2018-01-05' => 'Winter'.

    Parameters
    ----------
    dates : iterable of str
        Date strings to classify.

    Returns
    -------
    list of str
        One of 'Winter', 'Spring', 'Summer' or 'Fall' per input date, in
        input order. Any month value that is not a winter, spring or summer
        month (including text that is not a month at all) yields 'Fall'.
    """
    season_by_month = {
        '12': 'Winter', '01': 'Winter', '02': 'Winter',
        '03': 'Spring', '04': 'Spring', '05': 'Spring',
        '06': 'Summer', '07': 'Summer', '08': 'Summer',
    }
    # 'Fall' is the catch-all, exactly like the final else branch of an if-chain.
    return [season_by_month.get(day[5:7], 'Fall') for day in dates]


            
# Preprocessing for our features.
# Mainly this function handles encoding categorical features into numerical data.
def preprocessing(X):
    """Encode the categorical features of ``X`` as numerical dummy columns.

    Steps, in order:
      1. 'type'   -> dummy columns prefixed 'type_'   (first category dropped)
      2. 'region' -> dummy columns prefixed 'region_' (first category dropped)
      3. 'Date'   -> converted to a season via ``dateToSeason`` and replaced
                     by dummy columns prefixed 'season_' (all categories kept)

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns 'type', 'region' and 'Date'.

    Returns
    -------
    pandas.DataFrame
        A new frame: the remaining original columns followed by the type,
        region and season dummy columns. The frame passed in is not modified.
    """
    def _replace_with_dummies(frame, column, drop_first):
        # Swap one categorical column for its dummy encoding, appended at the end.
        encoded = pd.get_dummies(frame[column], prefix=column, drop_first=drop_first)
        return pd.concat([frame.drop([column], axis=1), encoded], axis=1)

    features = _replace_with_dummies(X, 'type', True)
    features = _replace_with_dummies(features, 'region', True)

    # Derive the season of every row from its date, then discard the raw date.
    season_labels = dateToSeason(features['Date'].values.tolist())
    features = features.drop(['Date'], axis=1)
    features['season'] = season_labels

    # Seasons keep all of their dummy columns (drop_first=False).
    return _replace_with_dummies(features, 'season', False)


In [4]:
# Here the preprocessing steps described in the above functions are run on our data.
training_X = preprocessing(training_X)
# The columns are sorted so that the feature order is deterministic.
training_X = training_X.reindex(sorted(training_X.columns), axis=1)

test_X = preprocessing(test_X)

# Align the test features with the training features instead of hard-coding
# the missing columns: any dummy column the test data lacks (e.g. season_Fall
# and season_Summer) is added filled with 0, any column unknown to the
# training data is dropped, and the column order matches training_X exactly.
# NOTE(review): preprocessing() uses drop_first=True separately on each frame,
# so if the first 'type'/'region' category in the test data differs from the
# training data, the dropped baseline category differs too -- verify on the data.
test_X = test_X.reindex(columns=training_X.columns, fill_value=0)


# Training the model

After processing our data, the machine learning model is trained. For this project I decided to use gradient boosting, namely the XGBoost library.

Gradient boosting is a valid choice because it handles high-dimensional data well, which matters here: dummy encoding our region data greatly increases the number of features.

In [5]:
# Gradient-boosted regressor with 1000 estimators and a learning rate of 0.1.
model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1)

# fit() returns the fitted estimator, so the prediction is chained onto it.
result = model.fit(training_X, training_y).predict(test_X)




In [6]:
# Creating the submission file, code from the example kernel.
# The predictions are written as a single "AveragePrice" column, with the
# row index exported under the "ID" header.
result_df = pd.DataFrame(result)
result_df.to_csv(
    path_or_buf="ml_comp_submission.csv",
    index_label="ID",
    header=["AveragePrice"],
)