In [None]:
# Purpose of this notebook:
# Build a dictionary keyed by variety, so that each variety gets its own subset of the data
# and its own variety-specific classifier. This dataset contains 174 varieties.

In [None]:
# THERE ARE SEPARATE NOTEBOOKS FOR VISUALIZATIONS, DATASET ANALYSIS, ETC. IN THE REPO.

import pandas as pd
import numpy as np

# READ THE CSV INTO DATAFRAME
# The path is relative, so it is resolved against the kernel's working directory.

df = pd.read_csv('Syngenta/Syngenta_2017/Experiment_dataset.csv')

In [None]:
# GOAL OF THIS MODULE:
# Encode the planting date as a season

# remove the dates that are "."
# (raw string: "\." inside a normal string literal is an invalid escape sequence)
df = df[~df['Planting date'].str.match(r"\.")]

# Parse each entry on its own so every date's format is inferred independently.
plant_date = df['Planting date'].apply(pd.to_datetime)
plant_months = plant_date.dt.month

# Month -> season code: Dec-Feb = 1, Mar-May = 2, Jun-Aug = 3, Sep-Nov = 4.
season = ((plant_months % 12 + 3) // 3).rename("Season")
df = pd.concat([df, season], axis=1)

In [None]:
# ADD MONTH OF MAY AND JUNE ONE HOT ENCODING INTO THE DATAFRAME
# Build the month indicator matrix once and reuse it for both columns.
month_dummies = pd.get_dummies(plant_months)
may = month_dummies.loc[:, 5].rename("May")
june = month_dummies.loc[:, 6].rename("June")
df = pd.concat([df, may, june], axis=1)

In [None]:
# LATITUDE AND LONGITUDE CLUSTERING INTO FEATURES

from sklearn.cluster import KMeans

latlong = df.loc[:, ['Latitude', 'Longitude']]

# `n_jobs` is no longer accepted by KMeans in current scikit-learn releases, so it is
# not passed. `n_init=10` pins the historical default so results do not depend on the
# installed scikit-learn version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0).fit(latlong)

# get_dummies on a bare array yields a fresh 0..n-1 index. df keeps the index of the rows
# that survived filtering, so the indicators must be re-indexed to df's rows before the
# concat; otherwise cluster labels land on the wrong rows and NaN rows are created.
lat_long_dummies = pd.get_dummies(kmeans.labels_)
lat_long_dummies.index = latlong.index
lat_long_dummies = lat_long_dummies.rename(columns={0: "Loc Clust 0",
                                                    1: "Loc Clust 1",
                                                    2: "Loc Clust 2",
                                                    3: "Loc Clust 3"})
df = pd.concat([df, lat_long_dummies], axis=1)

In [None]:
# REMOVE ROWS WITH MISSING VALUES IN THE COLUMNS KNOWN TO CONTAIN NAN

print(df.columns)

# Keep only rows that have a Silt measurement ...
has_silt = df['Silt'].notnull()
df = df[has_silt]

# ... and, of those, only rows that were assigned a location cluster.
has_cluster = df['Loc Clust 1'].notnull()
df = df[has_cluster]

In [None]:
# DROP ALL THE CELLS THAT ARE NOT USABLE SUCH AS THE ONES THAT ARE STRINGS OR DATES

# set if want to drop some columns specifically
should_drop = 1
# Alternative drop list (also removes Latitude, Longitude and Variety):
# columns_to_drop = ['Experiment', 'Location',
#                    'Check Yield', 'Yield difference', 'Latitude',
#                    'Longitude', 'PI', 'Variety', 'Planting date', 'Season']

# BELOW DROP IS USED FOR THE DF_DICT APPROACH
# ('Variety' is kept because the per-variety dictionary is keyed on it.)
columns_to_drop = ['Experiment', 'Location',
                   'Check Yield', 'Yield difference', 'PI', 'Planting date', 'Season']

# set if want to keep some columns specifically
should_keep = 0
# columns_to_keep = ['Loc Clust 0', 'Loc Clust 1', 'Loc Clust 2', 'Loc Clust 3']
columns_to_keep_top = ['Silt', 'Precipitation', 'Temperature', 'Solar Radiation', 'Organic matter']
# columns_VARIETIES_ONLY = np.asarray(df.iloc[:, df.columns.str.match(r'V\d\d\d\d\d\d')].columns)

#set the below variable to whatever columns you want to keep
columns_to_keep = columns_to_keep_top

# The target column always survives the "keep" filter.
MUST_HAVE_COLUMNS = ['Yield']

if should_drop:
    df = df.drop(columns_to_drop, axis=1)
if should_keep:
    df = df.loc[:, list(columns_to_keep) + MUST_HAVE_COLUMNS]

# qcut orders its bins from the smallest values to the largest, so the labels must be
# listed low -> high; the previous order tagged the lowest-yield tercile as "high".
df = df.assign(YieldBucket=pd.qcut(df.Yield, q=3, labels=["low", "medium", "high"]))
print("The final dataframe has columns: ", df.columns)

In [None]:
# LET US ALSO MAKE SURE THERE ARE NO NAN IN THE DATA

nan_counts = df.isnull().sum()
print("We expect to be %s nan values and there actually are %s nan values\n" % (0, np.sum(nan_counts)))
print(nan_counts)
# AFTER COLUMNS, MAKE SURE NO SKETCHY ONES
# .iloc[0] takes the first remaining row by position; a label lookup with [0] raises
# KeyError when the row labelled 0 has been filtered out by an earlier cell.
for col in df.columns:
    print(col, type(df[col].iloc[0]))

In [None]:
# GOAL OF THIS MODULE:
# Map every variety to the DataFrame rows that belong to it.
# Run this after the desired columns have been added and removed.

UNIQUE_VARIETIES = np.unique(df['Variety'])
df_dict = dict(
    (name, df.loc[df['Variety'] == name])
    for name in UNIQUE_VARIETIES
)


In [None]:
# Spot-check: display the rows stored for a single variety.
df_dict['V000016']

In [None]:
# GOAL OF THIS MODULE
# ALONG WITH THE PREVIOUS MODULE, CREATE DICTIONARIES FOR EACH OF THE TRAINING AND TEST SETS, SO 4 DICTS TOTAL
# WHICH EACH CONTAIN A DISTINCT TRAINING AND TEST SET FOR EACH VARIETY

from sklearn.model_selection import train_test_split

def cross_validation_split(data):
    """Split every variety's DataFrame into train and test features/targets.

    Parameters
    ----------
    data : dict
        Maps a variety name to a DataFrame that contains at least the columns
        'Yield', 'YieldBucket' and 'Variety'.

    Returns
    -------
    tuple of dict
        ``(X_train_dict, X_test_dict, y_train_dict, y_test_dict)``, each keyed by
        variety. The X entries hold the frame without 'Yield', 'YieldBucket' and
        'Variety'; the y entries hold the 'Yield' column.

    Raises
    ------
    TypeError
        If ``data`` is not a dict. Only the dictionary approach is implemented, and
        silently returning None would surface later as an obscure unpacking error.
    """
    if not isinstance(data, dict):
        raise TypeError(
            "cross_validation_split expects a dict of DataFrames keyed by variety, "
            "got %s" % type(data).__name__
        )

    X_train_dict = {}
    X_test_dict = {}
    y_train_dict = {}
    y_test_dict = {}
    for variety, dataf in data.items():
        # The target and its derived bucket must not leak into the features; 'Variety'
        # is constant within one frame and therefore carries no information.
        X = dataf.drop(['Yield', 'YieldBucket', 'Variety'], axis=1)
        y = dataf.Yield
        # Fixed random_state keeps the 95/5 split reproducible between runs.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, train_size = 0.95,
                                                            random_state = 42)

        X_train_dict[variety] = X_train
        X_test_dict[variety] = X_test
        y_train_dict[variety] = y_train
        y_test_dict[variety] = y_test

    return X_train_dict, X_test_dict, y_train_dict, y_test_dict
            
            
# Build the four per-variety train/test dictionaries from the per-variety DataFrames.
X_train_dict, X_test_dict, y_train_dict, y_test_dict = cross_validation_split(df_dict)

In [None]:
# Number of held-out test rows per variety, smallest first.
# Both arrays are built in one pass instead of growing them with np.append, which
# copies the whole array on every iteration.
beforev = np.array(list(X_test_dict.keys()))
before = np.array([frame.shape[0] for frame in X_test_dict.values()], dtype=float)

pd.DataFrame({
    "variety": beforev,
    "count": before
}).sort_values(by="count")

In [None]:
# Spot-check: display the training features stored for a single variety.
X_train_dict['V000016']

In [None]:
# THIS FUNCTION WILL EVALUATE ERRORS BASED ON RMSE (FROM SYNGENTA CHALLENGE SPEC)
# AND ALSO WILL EVALUATE BASED ON AVERAGE ERROR

from sklearn.metrics import mean_squared_error

def evaluate_errors(prediction, actual):
    """Print and return the RMSE and the per-sample absolute percentage error.

    Parameters
    ----------
    prediction : array-like
        Predicted values.
    actual : array-like
        Observed values, same length as ``prediction``.

    Returns
    -------
    tuple
        ``(avg_error_vector, RMSE_error)`` where ``avg_error_vector`` is a pandas
        Series of ``|prediction - actual| / actual * 100`` and ``RMSE_error`` is the
        root mean squared error.
    """
    # Computed once and reused for both the printout and the return value.
    # (actual, prediction) is scikit-learn's documented argument order; the squared
    # error itself is symmetric, so the value is unchanged.
    RMSE_error = np.sqrt(mean_squared_error(actual, prediction))
    print("RMSE Error: ", RMSE_error)

    # Wrapping in a Series guarantees .describe() exists even when both inputs are
    # plain numpy arrays; an existing Series keeps its index.
    # NOTE: entries where `actual` is 0 produce inf/NaN percentages.
    avg_error_vector = pd.Series(np.absolute(((prediction - actual) / actual) * 100))
    print("Average Error details:\n", avg_error_vector.describe())
    return avg_error_vector, RMSE_error

In [None]:
# GET OUTPUT OF FEATURE IMPORTANCE

def get_feature_importances(regr, feature_names=None):
    """Print each feature's importance for a fitted model, least important first.

    Parameters
    ----------
    regr : fitted estimator
        Must expose ``feature_importances_``.
    feature_names : sequence of str, optional
        Names matching the order of ``regr.feature_importances_``. When omitted, the
        estimator's ``feature_names_in_`` attribute is used if it exists; otherwise
        generic positional names (``feature_0``, ``feature_1``, ...) are generated.
        The function no longer reads a notebook-level ``X_train`` variable, which is
        never defined at the top level of this notebook.
    """
    importances = np.asarray(regr.feature_importances_)

    if feature_names is None:
        feature_names = getattr(regr, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = ['feature_%d' % i for i in range(len(importances))]

    feature_importance_df = pd.DataFrame({'feature': list(feature_names),
                                          'feature_importance': importances})
    feature_importance_df = feature_importance_df.sort_values(by=['feature_importance'])
    for _, row in feature_importance_df.iterrows():
        print(row['feature'], 'has importance: ', row['feature_importance'])


In [None]:
# THIS IS TO BE USED WITH THE DICTIONARY APPROACH ONLY

from sklearn.base import clone

def train_on_varieties(clf, X_train_dict_, y_train_dict_):
    """Fit one independent copy of ``clf`` per variety.

    Parameters
    ----------
    clf : estimator
        Template model; it is cloned for every variety and never fitted itself.
    X_train_dict_ : dict
        Maps variety -> training features.
    y_train_dict_ : dict
        Maps variety -> training targets; must contain every key of ``X_train_dict_``.

    Returns
    -------
    dict
        Maps variety -> the model fitted on that variety's training data.
    """
    clfs = {}
    for variety, X_train_variety in X_train_dict_.items():
        variety_clf = clone(clf)
        # Use the targets passed in as an argument; the previous code read the
        # notebook-level `y_train_dict` and ignored the `y_train_dict_` parameter.
        variety_clf.fit(X_train_variety, y_train_dict_[variety])
        clfs[variety] = variety_clf
    return clfs

from sklearn.ensemble import RandomForestRegressor

# `n_estimators=` was left without a value, which is a SyntaxError. 100 is
# scikit-learn's default forest size -- TODO confirm the value used for the reported results.
clf = RandomForestRegressor(n_estimators=100, max_depth=13, random_state=0, verbose=1, n_jobs=-1)

# Fit one forest per variety.
clfs = train_on_varieties(clf, X_train_dict, y_train_dict)

In [None]:
# THIS IS USED FOR TESTING WITH THE DICTIONARY APPROACH

def test_on_varieties(classifier_dict, X_test_dict_, y_test_dict_):
    preds = {}
    accuracies = {}
    for variety, classifier in classifier_dict.items():
        preds[variety] = classifier.predict(X_test_dict_[variety])
        accuracies[variety] = evaluate_errors(preds[variety], y_test_dict_[variety])
    return preds, accuracies
        
# Evaluate every per-variety model on its own held-out rows.
# `accuracies` maps variety -> the tuple returned by evaluate_errors.
preds, accuracies = test_on_varieties(clfs, X_test_dict, y_test_dict)

In [None]:
# THIS IS SOME ANALYSIS OF THE PREDICTIONS OF THE DICTIONARY APPROACH

# for var, accs in accuracies.items():
#     print(accs)
# #     print(var, accs[0].describe())
#     all_means = np.array([])
#     np.append(all_means, accs[0].describe()['mean'])
# #     print(accs[0].describe()['mean'])
#     print(all_means.shape)
import pprint
from scipy.stats import describe

# Each entry of `accuracies` is (per-sample percentage error Series, RMSE).
# The arrays are built in one pass rather than grown with np.append inside a loop.
all_means = np.array([errors.mean() for errors, _ in accuracies.values()], dtype=float)
all_RMSE = np.array([rmse for _, rmse in accuracies.values()], dtype=float)
varieties = np.array(list(accuracies.keys()))

# Distribution of the per-variety mean percentage error, then the mean RMSE.
pprint.pprint(describe(all_means))
print(np.mean(all_RMSE))

In [None]:
# Display the variety names in the order used for all_means / all_RMSE.
varieties

In [None]:
# One row per variety: test RMSE, variety name and number of rows available for it.
# The columns are named at construction time; concatenating three unnamed frames
# produced three columns all labelled 0, so later `.RMSE` / `.COUNT` lookups failed.
varieties_df = pd.DataFrame(varieties)
variety_counts = [df_dict[var].shape[0] for var in varieties]
numbers_varieties_df = pd.DataFrame(variety_counts)
RMSE_df = pd.DataFrame({
    "RMSE": all_RMSE,
    "VAR": varieties,
    "COUNT": variety_counts,
})

In [None]:
# NOTE(review): the previous expression referenced an undefined `var` and divided by a
# DataFrame wrapping a whole per-variety frame. Presumably the intent was each variety's
# RMSE relative to its mean yield -- confirm.
RMSEP_df = pd.DataFrame(all_RMSE / np.array([df_dict[v].Yield.mean() for v in varieties]))

In [None]:
# Keep the first three columns (RMSE, variety, row count) and make sure they carry
# their names. Selecting by position works whether or not the columns are named yet.
RMSE_df = RMSE_df.iloc[:, :3].copy()
RMSE_df.columns = ["RMSE", "VAR", "COUNT"]

In [None]:
# Mean yield of every variety, in the same order as the rows of RMSE_df.
mean_yields_df = pd.DataFrame([df_dict[name]['Yield'].mean() for name in varieties_df[0]])
# Append RMSE divided by mean yield as an extra (unnamed, hence labelled 0) column.
RMSE_df = pd.concat([RMSE_df, RMSE_df['RMSE'] / mean_yields_df[0]], axis=1)

In [None]:
# Summary statistics of the per-variety RMSE. Row order does not matter for describe(),
# so the previous sort by the column labelled 0 only added a way for this cell to fail.
RMSE_df["RMSE"].describe()

In [None]:
# Name the first three columns and keep the labels of any additional columns, so the
# assignment also works after extra columns have been appended to RMSE_df.
RMSE_df.columns = ['RMSE', 'VAR', 'COUNT'] + list(RMSE_df.columns[3:])

In [None]:
# Order the varieties from lowest to highest RMSE, then show the correlation matrix
# between a variety's RMSE and the number of rows available for it.
RMSE_df = RMSE_df.sort_values(by='RMSE')
np.corrcoef(RMSE_df['RMSE'], RMSE_df['COUNT'])

In [None]:
# Print every row of RMSE_df in full (one Series per variety).
# Note: `idx` and `var` stay defined at notebook level after this loop finishes.
for idx, var in RMSE_df.iterrows():
    print(var)

In [None]:
# Display the table ordered by RMSE, lowest first; RMSE_df itself is not modified.
RMSE_df.sort_values(by="RMSE")