In [1]:
import pandas as pd
import numpy as np

# READ THE CSV INTO DATAFRAME

# Load the experiment records; the path is relative to the notebook's working directory.
df = pd.read_csv('Syngenta/Syngenta_2017/Experiment_dataset.csv')

# The same table as a plain NumPy array.
np_ar = df.values
# df2 = pd.read_csv('Syngenta/Syngenta_2017/Region_dataset.csv')

In [None]:
# List the available columns, then summarise the temperature feature.
print(df.columns)
print(df['Temperature'].describe())

In [None]:
# CURRENTLY NECESSARY IF: USING 174 ADDITIONAL VARIETY COLUMNS METHOD

# THIS IS A DIFFERENT APPROACH TO THE ABOVE FOUR CELLS, WHERE WE HAVE 174 ADDITIONAL FEATURE COLUMNS
# EACH WITH A 0 (IF IT IS NOT OF THAT VARIETY) OR A 1 (IF IT IS OF THAT VARIETY)

# print(df)
# Build one indicator column per distinct variety.
variety_dummies = pd.get_dummies(df['Variety'])
# Append the indicator columns to the right of the existing columns.
df = pd.concat((df, variety_dummies), axis=1)


In [None]:
# OPTIONAL VARIETY DISTRIBUTION ANALYSIS

# Column sums of the indicator frame give the number of rows per variety.
variety_counts = variety_dummies.sum()

print(variety_counts.describe())
print(np.sort(variety_counts))
# Print each variety name next to its row count.
for variety_name, row_count in zip(variety_dummies.columns, variety_counts):
    print(variety_name, row_count)

In [None]:
# GOAL OF THIS MODULE:
# Encode the planting date as a season

# Remove the rows whose planting date is the "." placeholder. Missing dates are
# treated the same way (na=True) so the boolean mask never contains NaN.
# The pattern is a raw string so "\." is a regex escape rather than an invalid
# Python string escape.
missing_date = df['Planting date'].str.match(r"\.", na=True).astype(bool)
df = df[~missing_date]

# Parse the whole column once, vectorised, instead of converting it row by row twice.
planting_month = pd.to_datetime(df['Planting date']).dt.month

# Month -> season code: Dec-Feb = 1, Mar-May = 2, Jun-Aug = 3, Sep-Nov = 4.
plant_date = ((planting_month % 12 + 3) // 3).rename("Season")

# assign() replaces an existing "Season" column, so re-running this cell does not
# append a second copy of it the way concat(axis=1) would.
df = df.assign(Season=plant_date)

# plant_date = pd.to_datetime(df['Planting date'], infer_datetime_format=True)
# df = df['Planting date'].apply(lambda dt: (dt.month%12 + 3)//3)
# pd.get_dummies(df['Planting date'])


In [None]:
# Number of rows that fall into each season code.
season_indicators = pd.get_dummies(df['Season'])
season_indicators.sum()

In [None]:
# Column inventory, followed by the count of missing values in every column.
print(df.columns)
print(df.isnull().sum(axis=0))

In [2]:
# LATITUDE AND LONGITUDE CLUSTERING INTO FEATURES

from sklearn.cluster import KMeans

# Cluster the rows on their coordinates.
latlong = df.loc[:, ['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=4, random_state=0).fit(latlong)

# kmeans.labels_ is a bare array with one label per row of `latlong`. Attach
# latlong's index before one-hot encoding: otherwise the dummies get a fresh
# 0..n-1 index, and concat(axis=1) - which aligns on index labels - pairs them
# with the wrong rows and adds NaN-filled rows whenever rows were filtered out
# of df earlier.
cluster_labels = pd.Series(kmeans.labels_, index=latlong.index)
lat_long_dummies = pd.get_dummies(cluster_labels).rename(
    columns={0: "Loc Clust 0", 1: "Loc Clust 1", 2: "Loc Clust 2", 3: "Loc Clust 3"})
df = pd.concat([df, lat_long_dummies], axis=1)

In [None]:
# Names of the one-hot variety columns: "V" followed by six digits. The pattern is a
# raw string so \d reaches the regex engine instead of being an invalid string escape.
np.asarray(df.columns[df.columns.str.match(r'V\d{6}')])

In [3]:
#THIS IS A VISUALIZATION FOR LATITUDE AND LONGITUDE CLUSTERING

# Pull the fitted centres and per-row labels out of the KMeans model.
cent = kmeans.cluster_centers_
clust_labels = kmeans.labels_
means = pd.DataFrame(clust_labels)

import matplotlib.pyplot as plt

# One scatter point per row, coloured by its cluster label.
fig, ax = plt.subplots()
scatter = ax.scatter(df['Latitude'], df['Longitude'], c=means[0], s=50)
ax.set_title('K-Means Clustering')
ax.set_xlabel('Latitude')
ax.set_ylabel('Longitude')
plt.colorbar(scatter)

<matplotlib.colorbar.Colorbar at 0x7fa3e52131d0>

In [None]:
#REMOVE ANY NAN VALUES

print(df.columns)
# Keep only the rows that have a value in both of these columns.
for required_column in ('Silt', 'Loc Clust 1'):
    df = df[df[required_column].notnull()]

In [None]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
# fit() returns the binarizer itself, so `binarized` is the fitted `lb`.
binarized = lb.fit(df.Variety)
print(binarized)
# With more than two classes transform() returns a 2-D indicator matrix, which
# pd.Series() rejects and which cannot be stored in the single Variety column.
# Keep it in its own variable and leave df.Variety untouched; the previous
# version overwrote the column and then tried to transform it a second time.
variety_onehot = binarized.transform(df.Variety)
print(variety_onehot.reshape(1,-1))

In [None]:
# Show the type of the first value in every column. Use positional .iloc[0]:
# df[col][0] looks up the row *labelled* 0 and raises KeyError once an earlier
# filtering step has removed that row.
for col in df.columns:
    print(col, type(df[col].iloc[0]))

In [4]:
# DROP ALL THE COLUMNS THAT ARE NOT USABLE, SUCH AS THE ONES THAT HOLD STRINGS OR DATES

# set if want to drop some columns specifically
# set if want to drop some columns specifically
should_drop = 1
columns_to_drop = ['Experiment', 'Location',
                   'Check Yield', 'Yield difference', 'Latitude',
                   'Longitude', 'Variety', 'PI', 'Planting date']

# set if want to keep some columns specifically
should_keep = 0
# columns_to_keep = ['Loc Clust 0', 'Loc Clust 1', 'Loc Clust 2', 'Loc Clust 3']
columns_to_keep_top = ['Silt', 'Precipitation', 'Temperature']
# One-hot variety columns are named "V" + six digits. Raw string so that \d is a
# regex escape and not an invalid Python string escape.
columns_VARIETIES_ONLY = np.asarray(df.columns[df.columns.str.match(r'V\d{6}')])

#set the below variable to whatever columns you want to keep
columns_to_keep = columns_VARIETIES_ONLY

# Columns that survive the "keep" filter no matter what (the regression target).
MUST_HAVE_COLUMNS = ['Yield']

if should_drop:
    df = df.drop(columns_to_drop, axis=1)
if should_keep:
    df = df.loc[:, np.concatenate((columns_to_keep, MUST_HAVE_COLUMNS))]

# qcut assigns labels from the lowest-valued bin to the highest-valued bin, so the
# labels must be listed in ascending order. The previous ["high", "medium", "low"]
# tagged the lowest-yield third as "high" and the highest-yield third as "low".
df['YieldBucket'] = pd.qcut(df.Yield, q=3, labels=["low", "medium", "high"])
print(df.columns)

Index(['Yield', 'Year', 'Temperature', 'Precipitation', 'Solar Radiation',
       'Soil class', 'CEC', 'Organic matter', 'pH', 'Clay', 'Silt', 'Sand',
       'Area', 'Loc Clust 0', 'Loc Clust 1', 'Loc Clust 2', 'Loc Clust 3',
       'YieldBucket'],
      dtype='object')


In [None]:
# Number of rows in each yield bucket.
bucket_indicators = pd.get_dummies(df['YieldBucket'])
print(bucket_indicators.sum())

In [5]:
# LET US ALSO MAKE SURE THERE ARE NO NAN IN THE DATA
# Per-column count of missing values, computed once and reused below.
null_counts = df.isnull().sum()
print("We expect to be %s nan values and there actually are %s nan values\n" % (0, np.sum(null_counts)))
print(null_counts)
# AFTER COLUMNS, MAKE SURE NO SKETCHY ONES
# Positional .iloc[0] instead of df[col][0]: the latter is a label lookup and raises
# KeyError when the row labelled 0 has been filtered out by an earlier cell.
for col in df.columns:
    print(col, type(df[col].iloc[0]))

We expect to be 0 nan values and there actually are 0 nan values

Yield              0
Year               0
Temperature        0
Precipitation      0
Solar Radiation    0
Soil class         0
CEC                0
Organic matter     0
pH                 0
Clay               0
Silt               0
Sand               0
Area               0
Loc Clust 0        0
Loc Clust 1        0
Loc Clust 2        0
Loc Clust 3        0
YieldBucket        0
dtype: int64
Yield <class 'numpy.float64'>
Year <class 'numpy.int64'>
Temperature <class 'numpy.float64'>
Precipitation <class 'numpy.float64'>
Solar Radiation <class 'numpy.int64'>
Soil class <class 'numpy.int64'>
CEC <class 'numpy.float64'>
Organic matter <class 'numpy.float64'>
pH <class 'numpy.float64'>
Clay <class 'numpy.float64'>
Silt <class 'numpy.float64'>
Sand <class 'numpy.float64'>
Area <class 'numpy.float64'>
Loc Clust 0 <class 'numpy.uint8'>
Loc Clust 1 <class 'numpy.uint8'>
Loc Clust 2 <class 'numpy.uint8'>
Loc Clust 3 <class 'numpy.uin

In [6]:
# TRAIN AND TEST SPLIT# TRAIN AND TEST SPLIT# TRAIN AND TEST SPLIT
# TRAIN AND TEST SPLIT# TRAIN AND TEST SPLIT# TRAIN AND TEST SPLIT
# TRAIN AND TEST SPLIT# TRAIN AND TEST SPLIT# TRAIN AND TEST SPLIT
# TRAIN AND TEST SPLIT# TRAIN AND TEST SPLIT# TRAIN AND TEST SPLIT

# Features are every column except the target and its bucketed form.
X = df.drop(labels=['Yield', 'YieldBucket'], axis=1)

print(X.columns)

# Regression target.
y = df['Yield']

from sklearn.model_selection import train_test_split

# 10% of the rows go to training and 5% to testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    train_size=0.1,
    random_state=42,
)

Index(['Year', 'Temperature', 'Precipitation', 'Solar Radiation', 'Soil class',
       'CEC', 'Organic matter', 'pH', 'Clay', 'Silt', 'Sand', 'Area',
       'Loc Clust 0', 'Loc Clust 1', 'Loc Clust 2', 'Loc Clust 3'],
      dtype='object')


In [None]:
# Report the dimensions of the training features and the training target.
train_shape_report = ("X_train shape:", X_train.shape, "\ny_train shape:", y_train.shape)
print(*train_shape_report)

In [9]:
# This function will evaluate the errors based on RMSE (from the challenge spec)
# also will evaluate based on average error

from sklearn.metrics import mean_squared_error
def evaluate_errors(prediction, actual):
    """Print the RMSE and absolute-percentage-error statistics of a prediction.

    Parameters
    ----------
    prediction : array-like
        Predicted values, one per sample.
    actual : array-like or pandas.Series
        True values, in the same order as ``prediction``.

    Returns
    -------
    pandas.Series
        Absolute percentage error of every sample,
        ``|prediction - actual| / |actual| * 100``. When ``actual`` is a Series
        the result carries its index and name.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    predicted_values = np.asarray(prediction, dtype=float).ravel()
    actual_values = np.asarray(actual, dtype=float).ravel()
    if predicted_values.shape != actual_values.shape:
        raise ValueError(
            "prediction and actual must have the same length, got %d and %d"
            % (predicted_values.size, actual_values.size)
        )

    residuals = predicted_values - actual_values
    # Root of the mean squared residual, i.e. sqrt(mean_squared_error) for 1-D targets.
    print("RMSE Error: ", np.sqrt(np.mean(residuals ** 2)))

    # Work from the arguments. The previous version read the notebook globals
    # `preds` and `y_test` here, so the percentages described whatever those
    # globals happened to hold rather than the data passed in.
    actual_is_series = isinstance(actual, pd.Series)
    avg_error_vector = pd.Series(
        np.absolute(residuals / actual_values * 100),
        index=actual.index if actual_is_series else None,
        name=actual.name if actual_is_series else None,
    )
    print("Average Error details:\n", avg_error_vector.describe())
    return avg_error_vector

In [11]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 20-tree forest with depth capped at 13 on the training split (fit returns the estimator).
regr = RandomForestRegressor(
    n_estimators=20,
    max_depth=13,
    random_state=0,
    verbose=1,
).fit(X_train, y_train)

# Predict the held-out rows and report the error statistics.
preds = regr.predict(X_test)
evaluate_errors(preds, y_test)


RMSE Error:  7.286046932916138
Average Error details:
 count    4102.000000
mean       10.214530
std        10.612184
min         0.000967
25%         3.562631
50%         7.515530
75%        13.377225
max       171.625417
Name: Yield, dtype: float64


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.0s finished


In [None]:
# Count rows per season code. No cell creates a 'Plant date' column, so that lookup
# raised KeyError; the season-encoding cell stores its result under 'Season'.
pd.get_dummies(df['Season']).sum()

In [13]:
# GET OUTPUT OF FEATURE IMPORTANCE
def get_feature_importances(regr, feature_names=None):
    """Print every feature with its importance, least important first.

    Parameters
    ----------
    regr : fitted estimator
        Any object exposing a ``feature_importances_`` attribute.
    feature_names : sequence of str, optional
        Names matching ``regr.feature_importances_`` position by position.
        Defaults to the columns of the notebook-level ``X_train``, which is
        what this function always used before the parameter existed.

    Raises
    ------
    AttributeError
        If ``regr`` has no ``feature_importances_`` attribute.
    """
    if feature_names is None:
        # Backward-compatible default: fall back to the notebook global.
        feature_names = X_train.columns

    feature_importance_df = pd.DataFrame({
        'feature': list(feature_names),
        'feature_importance': np.asarray(regr.feature_importances_),
    })
    feature_importance_df = feature_importance_df.sort_values(by=['feature_importance'])

    for feature, importance in zip(feature_importance_df['feature'],
                                   feature_importance_df['feature_importance']):
        print(feature, 'has importance: ', importance)
# Print the importances of whatever fitted model `regr` currently refers to.
get_feature_importances(regr)

Loc Clust 1 has importance:  0.0019782225783457987
Loc Clust 0 has importance:  0.003120506992478384
Loc Clust 2 has importance:  0.005820109936884353
Loc Clust 3 has importance:  0.0058934581272339125
Soil class has importance:  0.01933195562288706
pH has importance:  0.040783306266072836
Sand has importance:  0.046619161650392996
CEC has importance:  0.06044121342806226
Clay has importance:  0.06190371098238797
Area has importance:  0.07691924104295542
Organic matter has importance:  0.08169355801487979
Solar Radiation has importance:  0.09554505237484413
Precipitation has importance:  0.1043630222426392
Year has importance:  0.10889841062908041
Temperature has importance:  0.11636072046801742
Silt has importance:  0.1703283496428381


In [None]:
# CLASSIFICATION ON THE YIELD BUCKETS
# A classifier needs discrete class labels, so train and score on the YieldBucket of
# each row instead of the continuous y_train / y_test. The labels are looked up by
# row index so they line up with the existing train/test split.

from sklearn.ensemble import RandomForestClassifier

y_train_bucket = df.loc[X_train.index, 'YieldBucket']
y_test_bucket = df.loc[X_test.index, 'YieldBucket']

regr = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=0, verbose=1)
regr.fit(X_train, y_train_bucket)
preds = regr.predict(X_test)

from sklearn.metrics import accuracy_score

print(accuracy_score(y_test_bucket, preds))

In [7]:
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPRegressor

from sklearn.feature_selection import RFECV

# Small two-hidden-layer network, built separately to keep the list below readable.
small_mlp = MLPRegressor(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)

# Regression models compared by the evaluation loop, in this order.
# linear_model.ARDRegression() is intentionally left out.
classifiers = [
    svm.SVR(),
    small_mlp,
    linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression(),
]




# estimator = svm.SVR(kernel="linear")

# selector = RFECV(estimator, step=1, cv=5, verbose=1)
# selector = selector.fit(X_train, y_train)
# selector.support_ 
# # array([ True,  True,  True,  True,  True,
# #         False, False, False, False, False], dtype=bool)
# selector.ranking_
# # array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])


#     print(np.sum(preds - y_test))
#     print(clf.predict(X_test),'\n')
#     print(y_test)
#     print('accuracy score:', accuracy_score(y_test, clf.predict(X_test)), '\n')


In [18]:
# Fit every candidate model on the training split and report its test-set errors.
for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    errors = evaluate_errors(preds, y_test)
    try:
        get_feature_importances(clf)
    except AttributeError:
        # Only models without a feature_importances_ attribute are skipped here.
        # A bare `except:` also hid genuine failures and KeyboardInterrupt.
        print("NO FEATURE IMPORTANCE METRIC")
    print(errors)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
RMSE Error:  8.20675363379203
Average Error details:
 count    4102.000000
mean       11.844229
std        13.748968
min         0.000433
25%         3.868332
50%         8.489203
75%        14.994232
max       257.847394
Name: Yield, dtype: float64
NO FEATURE IMPORTANCE METRIC
None
MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
RMSE Error:  11.498026353418481
Average Error details:
 count    4102.000000
mean       16.966659
std        17.865910
min         0.001149




RMSE Error:  11.032889432415264
Average Error details:
 count    4102.000000
mean       16.275976
std        17.994219
min         0.002458
25%         5.672874
50%        11.793014
75%        20.473766
max       359.117261
Name: Yield, dtype: float64
NO FEATURE IMPORTANCE METRIC
None
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
RMSE Error:  10.707322429761183
Average Error details:
 count    4102.000000
mean       15.690770
std        16.829560
min         0.000424
25%         5.703156
50%        11.933397
75%        19.902929
max       339.045027
Name: Yield, dtype: float64
NO FEATURE IMPORTANCE METRIC
None


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
#     KNeighborsClassifier(3),
#     SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
#     DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]
from sklearn.metrics import accuracy_score

# Classifiers and accuracy_score need discrete labels, so use the YieldBucket of each
# row in the split rather than the continuous y_train / y_test.
y_train_bucket = df.loc[X_train.index, 'YieldBucket']
y_test_bucket = df.loc[X_test.index, 'YieldBucket']

# `scale` was never imported, so the old loop raised NameError. Use the imported
# StandardScaler instead, fitted on the training rows only and then applied to both
# splits, so train and test share one transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train_scaled, y_train_bucket)
    preds = clf.predict(X_test_scaled)
    print(accuracy_score(y_test_bucket, preds))