In [None]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

# Load the competition data straight from the zipped CSV files. The archives
# and the member streams are opened in `with` blocks so every file handle is
# released as soon as the frame has been parsed instead of staying open for
# the lifetime of the kernel.
with zipfile.ZipFile('data//train.csv.zip') as train_zf:
    with train_zf.open('train.csv') as train_csv:
        train = pd.read_csv(train_csv)

with zipfile.ZipFile('data//test.csv.zip') as test_zf:
    with test_zf.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)

# Flag the origin of every row (1 = training data, 0 = test data) so the two
# frames can be concatenated and told apart again later.
train['train'] = 1
test['train'] = 0


In [None]:
# Summary statistics for the columns of the training frame.
print(train.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
train.info()


In [None]:
# Pairwise correlations between the columns of the training data.
# The 'train' flag is the constant 1 for every row of this frame; a
# zero-variance column has no defined correlation and would only add an
# all-NaN row and column to the matrix, so it is excluded.
corr = train.drop(columns=['train']).corr()
print(corr)

In [None]:
def _style_axes(ax, title, x_label, y_label, y_max):
    """Apply the shared look used by every subplot of this figure.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes that has already been drawn on.
    title : str
        Text shown above the subplot.
    x_label, y_label : str
        Axis labels.
    y_max : float
        Upper bound of the y axis; the lower bound is fixed at 0.
    """
    # `True` is passed positionally: the old `b=` keyword of grid() was
    # deprecated in Matplotlib 3.5 and is rejected by current releases.
    ax.grid(True, axis='y')
    ax.set_title(title, size=18, y=1.03, fontname='Calibri',
                 fontweight='bold', color='#444444')
    ax.set(ylim=(0, y_max))
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
        tick_label.set_fontname('Calibri')
        tick_label.set_fontsize(12)
    # Keep only a thick baseline; hide the other three spines.
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)
    ax.spines['bottom'].set_linewidth(2)


# Named so that the builtin `vars` is not shadowed.
yield_driver_vars = ['fruitset', 'fruitmass', 'seeds']

fig = plt.figure(figsize=(12, 8))
for column, var in enumerate(yield_driver_vars, start=1):
    # Top row: feature against the target, coloured by clone size.
    scatter_ax = fig.add_subplot(2, 3, column)
    sns.scatterplot(data=train, x=var, y='yield', hue="clonesize",
                    alpha=0.5, ax=scatter_ax)
    _style_axes(scatter_ax, f"{var} vs yield", var, 'Yield', 10000)

    # Bottom row: histogram of the same feature.
    hist_ax = fig.add_subplot(2, 3, column + 3)
    sns.histplot(data=train, x=var, color='#DB97D5', ax=hist_ax)
    _style_axes(hist_ax, f"{var} distribution", var, 'Count', 1000)

# One layout pass after all six subplots exist is sufficient.
fig.tight_layout(pad=3)
plt.show()

In [None]:
# Scatter plots of the remaining candidate features against the target.
# Named so that the builtin `vars` is not shadowed.
pollinator_weather_vars = ['clonesize', 'honeybee', 'bumbles', 'andrena',
                           'osmia', 'RainingDays', 'AverageRainingDays']

fig = plt.figure(figsize=(12, 12))
for position, var in enumerate(pollinator_weather_vars, start=1):
    # Only the first seven cells of the 4x4 grid are populated.
    ax = fig.add_subplot(4, 4, position)
    sns.scatterplot(data=train, x=var, y='yield', alpha=0.5,
                    color='#DB97D5', ax=ax)

    # `True` is passed positionally: the old `b=` keyword of grid() was
    # deprecated in Matplotlib 3.5 and is rejected by current releases.
    ax.grid(True, axis='y')
    ax.set_title(f"{var} vs yield", size=18, y=1.03, fontname='Calibri',
                 fontweight='bold', color='#444444')
    ax.set(ylim=(0, 10000))
    ax.set_ylabel('Yield')
    ax.set_xlabel(var)
    for tick_label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
        tick_label.set_fontname('Calibri')
        tick_label.set_fontsize(12)

    # Keep only a thick baseline; hide the other three spines.
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)
    ax.spines['bottom'].set_linewidth(2)

# One layout pass after all subplots exist is sufficient.
fig.tight_layout(pad=3)
plt.show()

Temperatures showed low correlations with yield (see correlation matrix). I came to the conclusion that none of these features can be removed from the data set. Some of the variables were binned using k-means clustering. Outliers were filtered out.

In [None]:

# --- Outlier filtering (training rows only) --------------------------------
# A row is kept only when all three bee-density conditions hold; the
# thresholds are the ones chosen by the author.
is_inlier = (
    (train['honeybee'] < 4)
    & (train['bumbles'] < 0.5)
    & (train['andrena'] > 0.2)
)
train = train[is_inlier]

# --- Shared feature engineering on train + test ----------------------------
combined = pd.concat([train, test])

# NOTE: despite the column name this is the *sum* of the three densities,
# not their mean. The name is kept so the feature set stays unchanged.
combined['avg_bees'] = combined['bumbles'] + combined['andrena'] + combined['osmia']

# Drop every column whose name contains 'TRange', plus 'RainingDays'.
dropped_cols = combined.columns[combined.columns.str.contains('TRange')].tolist()
dropped_cols.append('RainingDays')
combined = combined.drop(columns=dropped_cols)

# --- K-means binning --------------------------------------------------------
# Number of bins (clusters) used for each feature that gets discretised.
n_bins_by_feature = {'clonesize': 4,
                     'honeybee': 3,
                     'bumbles': 3,
                     'andrena': 5,
                     'osmia': 6,
                     'AverageRainingDays': 4}

# Fixed seed: without it KMeans starts from different random centres on every
# run, so the bin boundaries (and every downstream score) are not reproducible.
KMEANS_SEED = 43

for feature, n_bins in n_bins_by_feature.items():
    # Fit on the filtered training rows only, then label train and test alike.
    km = KMeans(n_clusters=n_bins, n_init=10,
                random_state=KMEANS_SEED).fit(train[[feature]])

    # KMeans numbers its clusters arbitrarily. Rank the clusters by centre so
    # that label 0 is the lowest-valued bin and n_bins - 1 the highest.
    clusters_by_centre = np.argsort(km.cluster_centers_.ravel())
    # Inverse permutation: rank_of_cluster[c] is the ordinal rank of cluster c.
    rank_of_cluster = np.argsort(clusters_by_centre)

    combined[feature + "_labels"] = rank_of_cluster[km.predict(combined[[feature]])]
    # The raw feature is replaced by its ordinal bin label.
    combined = combined.drop(columns=[feature])

# --- Model inputs -----------------------------------------------------------
combined = combined.drop(columns=['id'])

# drop() returns a new frame, so popping the target below only affects X and
# leaves `combined` intact.
X = combined[combined['train'] == 1].drop(columns=['train'])
Y = X.pop('yield')

Hyperparameter tuning

In [None]:
# Disabled hyperparameter grid search, kept for reference only; nothing in
# this cell is executed.
#
# What the commented-out code would do if re-enabled:
#   1. build a LightGBM estimator from the base `params`,
#   2. run a 5-fold GridSearchCV over `grid_params` on (X, Y),
#   3. merge the winning grid values back into `params` as `best_params`.
#
# NOTE(review): the estimator is `lgb.LGBMClassifier`, but `params` asks for
# the 'regression' objective with the 'mae' metric. `lgb.LGBMRegressor` is
# presumably what was intended, and 'num_class' / 'scale_pos_weight' look
# like classification-only settings. Confirm before re-enabling.

# params = {'boosting_type': 'gbdt', 'max_depth': -1, 'objective': 'regression', 
#               'num_leaves': 64, 'learning_rate': 0.05, 'max_bin': 512, 
#               'subsample_for_bin': 200, 'subsample': 1, 'subsample_freq': 1,
#               'colsample_bytree': 0.8, 'reg_alpha': 5, 'reg_lambda': 10, 
#               'min_split_gain': 0.5, 'min_child_weight': 1, 
#               'min_child_samples': 5, 'scale_pos_weight': 1, 'num_class': 1, 
#               'metric': 'mae'}

# grid_params = {'learning_rate': [0.01], 'n_estimators': [8, 24],
#                 'num_leaves': [6, 8, 12, 16], 'boosting_type': ['gbdt'], 
#                 'objective': ['regression'], 'seed': [500],
#                 'colsample_bytree': [0.65, 0.75, 0.8], 
#                 'subsample': [0.7, 0.75], 'reg_alpha': [1, 2, 6],
#                 'reg_lambda': [1, 2, 6]}


# mod = lgb.LGBMClassifier(**params)
# print(mod.get_params().keys())
# grid = GridSearchCV(mod, param_grid=grid_params, verbose=1, cv=5, n_jobs=-1)
# grid.fit(X, Y)

# best_params = {k: grid.best_params_.get(k, v) for k, v in params.items()}
# best_params['verbosity'] = -1

Optimal model fitting

In [None]:
# 5-fold cross-validation of the LightGBM regressor, scored by mean absolute
# error (MAE) on each held-out fold.
folds = 5
kf = KFold(n_splits=folds, shuffle=True, random_state=43)

# The booster configuration is identical for every fold, so it is built once
# here instead of being re-created inside the loop.
best_params = {
    'objective': 'mae',
    # With max_depth=7 a tree has at most 2**7 = 128 leaves, so the depth
    # limit rather than num_leaves is the binding constraint.
    'num_leaves': 400,
    'min_child_weight': 14,
    'max_depth': 7,
    'learning_rate': 0.05,
    'force_col_wise': True,
    'colsample_bytree': 0.9,
    'verbose': 0
}

fold_maes = []
for train_index, valid_index in kf.split(X):
    x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = Y.iloc[train_index], Y.iloc[valid_index]

    lgb_train = lgb.Dataset(x_train, y_train)
    # `reference` ties the validation set to the training Dataset.
    lgb_valid = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

    # NOTE: the held-out fold drives early stopping and is also the fold that
    # is scored below, so the reported MAE is likely somewhat optimistic.
    gbm = lgb.train(best_params,
                    lgb_train,
                    num_boost_round=100,
                    valid_sets=[lgb_valid],
                    callbacks=[lgb.early_stopping(stopping_rounds=5)])

    # Predict with the iteration early stopping selected as best.
    y_pred = gbm.predict(x_valid, num_iteration=gbm.best_iteration)
    fold_maes.append(mean_absolute_error(y_valid, y_pred))

# Mean MAE across folds; `fold_maes` keeps the individual scores for inspection.
mae = sum(fold_maes)
print(mae/folds)