# Task for Today  

***

## Coffee Production Prediction  

Given *data about coffee* (each country's average domestic consumption, exports, and gross opening stocks), let's try to predict the **average production** of coffee in a given country.

We will use a random forest regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Paths of the CSV tables to load. The order matters: make_df merges the
# tables in this order, starting from the first one.
df_paths = [
    '../input/ico-coffee-dataset-worldwide/domestic-consumption.csv',
    '../input/ico-coffee-dataset-worldwide/exports-calendar-year.csv',
    '../input/ico-coffee-dataset-worldwide/exports-crop-year.csv',
    '../input/ico-coffee-dataset-worldwide/gross-opening-stocks.csv',
    '../input/ico-coffee-dataset-worldwide/total-production.csv'
]

In [None]:
# Read every CSV in df_paths into its own DataFrame, keeping the list order.
dfs = [pd.read_csv(df_path) for df_path in df_paths]

In [None]:
def get_means(df):
    """Collapse a wide table into one mean value per row.

    The first column of ``df`` is treated as the country label, and its
    header is reused as the name of the resulting value column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the country labels and whose numeric
        columns hold the values to average.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame: ``'country'`` (the labels) and a column named
        after the header of the first input column, holding the row-wise mean
        of the numeric columns. The input frame is not modified.
    """
    countries = df[df.columns[0]]

    # numeric_only=True keeps the string label column out of the row-wise
    # mean. Without it, pandas >= 2.0 raises a TypeError instead of silently
    # skipping non-numeric columns as older versions did.
    means = df.mean(axis=1, numeric_only=True)

    result = pd.concat([countries, means], axis=1)
    result.columns = ['country', countries.name]
    return result

In [None]:
def make_df(dfs):
    """Combine several raw tables into a single table of per-country means.

    Every table in ``dfs`` is first reduced with ``get_means``. The reduced
    tables are then merged one after another, in list order, on their shared
    ``'country'`` column using ``DataFrame.merge``'s default join.
    """
    # Reduce each raw table to (country, mean) before any merging happens.
    summaries = [get_means(table) for table in dfs]

    # Fold the remaining summaries into the first one, left to right.
    merged = summaries[0]
    for summary in summaries[1:]:
        merged = merged.merge(summary, on='country')

    return merged

In [None]:
# Build the merged table of per-country means from the loaded tables.
data = make_df(dfs)
# Bare expression: shown as the notebook cell's output for inspection.
data

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Turn the merged table into scaled train/test features and targets.

    The ``'country'`` label column is discarded, ``'total_production'``
    becomes the target and every remaining column is used as a feature.
    Rows are shuffled and split 70/30 with a fixed seed. The features are
    standardized with a scaler fitted on the training rows only, so no
    statistics from the test rows leak into training.

    Returns ``X_train, X_test, y_train, y_test``. The feature frames keep
    their original index and column labels; the targets are left unscaled.
    """
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    table = df.drop(columns='country')

    target = table['total_production']
    features = table.drop(columns='total_production')

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.7, shuffle=True, random_state=1
    )

    # Fit on the training rows only, then apply the same transform to both.
    scaler = StandardScaler().fit(X_train)

    def standardize(part):
        # Re-wrap the scaled array so index and column labels are preserved.
        return pd.DataFrame(
            scaler.transform(part), index=part.index, columns=part.columns
        )

    return standardize(X_train), standardize(X_test), y_train, y_test

In [None]:
# Split the merged table into standardized train/test features and targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Inspect the training features returned by preprocess_inputs.
X_train

In [None]:
# Inspect the training targets returned by preprocess_inputs.
y_train

# Training/Results

In [None]:
# Seed the forest so repeated runs produce the same fit and the same metrics;
# the train/test split is already seeded with random_state=1.
model = RandomForestRegressor(random_state=1)
model.fit(X_train, y_train)
print("Model trained.")

In [None]:
# Predict on the held-out rows and keep the residuals for both metrics.
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# Root mean squared error of the predictions.
rmse = np.sqrt(np.mean(residuals**2))
print("RMSE: {:.2f}".format(rmse))

# R^2 = 1 - (residual sum of squares / total sum of squares about the mean).
ss_res = np.sum(residuals**2)
ss_tot = np.sum((y_test - y_test.mean())**2)
r2 = 1 - (ss_res / ss_tot)
print("R^2: {:.5f}".format(r2))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/nn5Z-qpCeEE