# Project Eden - Machine Learning Magic

## Import Dependencies

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline

In [None]:
# Read the CSV at `file_path` into `data`, the DataFrame the rest of the notebook works on.
file_path = 'model/production_clim_merged_georasters.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

## Data Exploration

In [None]:
# Show the column labels, followed by how many columns there are.
print(data.columns)
print(data.shape[1])

In [None]:
# Preview the first rows of the loaded table (rendered as the cell output).
data.head()

The first 2 columns are `x` and `y`.

The subsequent 19 columns are the features.

The remaining 42 columns are the various crops.

In [None]:
# Count the missing values in each column and print the per-column totals.
nan_counts = data.isna().sum()
print('Number of NaNs in the dataframe:\n', nan_counts)

In [None]:
# Histogram of 'x' over the rows where 'bana' is missing.
plt.hist(data.loc[data['bana'].isnull(), 'x'])

In [None]:
# Histogram of 'x' over the rows where 'bana' is present.
plt.hist(data.loc[data['bana'].notnull(), 'x'])

In [None]:
# Histogram of 'y' over the rows where 'bana' is missing.
plt.hist(data.loc[data['bana'].isnull(), 'y'])

In [None]:
# Histogram of 'y' over the rows where 'bana' is present.
plt.hist(data.loc[data['bana'].notnull(), 'y'])

In [None]:
# For 'coco': histogram 'x' and then 'y', each first over the rows where the
# value is missing and then over the rows where it is present, printing what
# plt.hist returns for every call.
# NOTE(review): all four calls draw on the current pyplot axes, so the x and y
# histograms end up in one plot -- confirm that is intended.
for axis in ('x', 'y'):
    for rows in (data['coco'].isnull(), data['coco'].notnull()):
        print(plt.hist(data.loc[rows, axis]))

In [None]:
# For 'soyb': histogram 'x' and then 'y', each first over the rows where the
# value is missing and then over the rows where it is present, printing what
# plt.hist returns for every call.
# NOTE(review): all four calls draw on the current pyplot axes, so the x and y
# histograms end up in one plot -- confirm that is intended.
for axis in ('x', 'y'):
    for rows in (data['soyb'].isnull(), data['soyb'].notnull()):
        print(plt.hist(data.loc[rows, axis]))

In [None]:
# For column '03': histogram 'x' and then 'y', each first over the rows where
# the value is missing and then over the rows where it is present, printing
# what plt.hist returns for every call.
# This cell must run before the Data Manipulation cell, which deletes '03'.
# NOTE(review): all four calls draw on the current pyplot axes, so the x and y
# histograms end up in one plot -- confirm that is intended.
for axis in ('x', 'y'):
    for rows in (data['03'].isnull(), data['03'].notnull()):
        print(plt.hist(data.loc[rows, axis]))

## Data Manipulation

In [None]:
# Data Manipulation step: this cell is essential for correct execution, since
# later cells rely on the names it defines.
# Column '03' is removed because some of its values are NaN; the rows are then
# partitioned by whether a 'bana' value is present.
del data['03']
bana_present = data['bana'].notnull()
data_no_nan = data[bana_present]
data_nan = data[~bana_present]
print(data_no_nan.head())

In [None]:
# Print the number of missing 'acof' values in the full frame, then the number
# of rows in the partition without a 'bana' value, so the two can be compared.
print(data['acof'].isna().sum())
print(data_nan.shape[0])

The data has been split into points that have crop data (`data_no_nan`) and points that don't (`data_nan`), using the presence of a `bana` value as the test.

## Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

In [None]:
# Model inputs: the first 20 columns of the rows that have a 'bana' value.
X = data_no_nan.iloc[:, :20]

# One target Series per remaining column, keyed by column name.
crops = {}
for column in data_no_nan.columns[20:]:
    crops[column] = data_no_nan[column]

In [None]:
#start of iteration

In [None]:
# Fit one Lasso regressor per crop on an 85/15 train/test split of (X, crop target).
# In the future, several model types could be trained in parallel and the best kept.
# After the loop, x_train/x_test/y_train/y_test hold only the LAST crop's split.
models = {}
for crop, Y in crops.items():
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.15, random_state=42
    )
    model = Lasso()
    lassoreg = model.fit(x_train, y_train)  # fit returns the estimator itself
    models[crop] = model

In [None]:
# Score every crop's model on that crop's own hold-out set.
# The split is rebuilt per crop because the x_test / y_test left over from the
# training loop belong to the last crop trained; scoring every model against
# them would compare most models with the wrong target column.
# train_test_split is deterministic for a fixed random_state, so using the same
# test_size and random_state as the training cell reproduces the same rows.
# Keep these two values in sync with the training cell.
scores = {}
for crop, model in models.items():
    x_train, x_test, y_train, y_test = train_test_split(
        X, crops[crop], test_size=0.15, random_state=42
    )
    scores[crop] = model.score(x_test, y_test)

for crop, score in scores.items():
    print("{0} score: {1}".format(crop, str(score)))

In [None]:
# Build a dataframe holding, for every row of `data`, its x / y values and the
# predicted production of every crop.
#
# The feature columns are sliced once as a frame rather than collected row by
# row with iterrows(): the result is the same set of first-20-column vectors,
# without the Python-level loop, and predict() receives input with the same
# column layout as the training matrix.
all_data_feature_vectors = data.iloc[:, :20]

# to_numpy() drops the source index so `df` gets a fresh 0..n-1 index.
df = pd.DataFrame({
    'x': all_data_feature_vectors['x'].to_numpy(),
    'y': all_data_feature_vectors['y'].to_numpy(),
})

# One prediction column per crop, in the iteration order of `crops`.
for crop in crops:
    df[crop] = models[crop].predict(all_data_feature_vectors)

df.head()