In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, HuberRegressor, Ridge
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_predict

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are available under the input directory.
print(os.listdir(path="../input"))

# Any results you write to the current directory are saved as output.

**Exploring Data**

Next, I read in the avocado data and visualize it.

In [None]:
# Load the avocado table, indexed by the 'Date' column.
# parse_dates=True asks pandas to parse that index into a DatetimeIndex instead of
# leaving it as raw strings, so later sorting works on real timestamps.
raw_data = pd.read_csv(
    '../input/avocado.csv',
    encoding='utf-8',
    index_col='Date',
    parse_dates=True,
)

In [None]:
# Peek at the first five rows to sanity-check the loaded columns.
raw_data.head(n=5)

In [None]:
# Distinct values of the `region` column and how many there are.
# Both calls are kept: `unique()` would include a missing value if one is present,
# while `nunique()` does not count it.
regions = raw_data["region"].unique()
n_regions = raw_data["region"].nunique()
print(n_regions, regions, sep="\n")

In [None]:
# Restrict the data to one region and order its rows by the Date index.
place = 'Albany'
region = raw_data.loc[raw_data['region'] == place].sort_index()

# Target series for the models below.
# NOTE(review): if the CSV holds several rows per Date for a region (e.g. one per
# product category), this series interleaves them — confirm before modelling.
price = region['AveragePrice']


In [None]:
# Plot the price series against its position in date order.
# `x` and `y` are reused by the train/test split in a later cell, so keep these names.
x = range(len(region.index))
y = price

# Explicit figure/axes with a title and axis labels so the chart stands on its own.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set(
    title="AveragePrice over time: {0}".format(place),
    xlabel="Observation number (rows sorted by Date)",
    ylabel="AveragePrice",
)
plt.show()

It seems that the average price in Albany varied a lot during nearly the first 3 years. After that it follows a clearer trend and is therefore more predictable.  

**Testing Models**

Next, I divide data into training and testing sets. Training and testing of models follow.  

In [None]:
# Number of leading (earliest, since rows are date-sorted) observations used for
# training; everything after that is held out as the test set.
N_TRAIN = 150

# scikit-learn expects a 2-D feature matrix, so turn the positions into one column.
X = np.asarray(x).reshape(-1, 1)
assert 0 < N_TRAIN < len(X), "N_TRAIN must leave both a non-empty training and test set"

# Chronological split (no shuffling). `.iloc` makes the positional slicing of the
# Date-indexed Series explicit instead of relying on how `y[:150]` is interpreted.
X_train, X_test = X[:N_TRAIN], X[N_TRAIN:]
y_train, y_test = y.iloc[:N_TRAIN], y.iloc[N_TRAIN:]

In [None]:
# Number of folds used by every cross_val_predict call below.
N_FOLDS = 5

# Compare three linear regressors using three different error estimates each.
models = [LinearRegression(), HuberRegressor(), Ridge()]
results = []
for model in models:
    # Method 1: out-of-fold predictions on the training set only.
    # cross_val_predict refits a copy of the estimator on each fold, so `model`
    # itself is still unfitted after this call.
    predicted = cross_val_predict(model, X_train, y_train, cv=N_FOLDS)
    testing = {
        'model': model,
        'cross_validation_method_1': metrics.mean_squared_error(y_true=y_train, y_pred=predicted),
    }

    # Singular method: fit once on the training set, score on the held-out test set.
    # y_true / y_pred are passed by keyword; the positional order is (y_true, y_pred).
    model.fit(X_train, y_train)
    testing['singular_testing_method'] = metrics.mean_squared_error(
        y_true=y_test, y_pred=model.predict(X_test)
    )

    # Method 2: cross-validation run entirely inside the test set. The fit above is
    # not reused here; copies of the estimator are trained on folds of X_test / y_test.
    predicted_on_test = cross_val_predict(model, X_test, y_test, cv=N_FOLDS)
    testing['cross_validation_method_2'] = metrics.mean_squared_error(
        y_true=y_test, y_pred=predicted_on_test
    )
    results.append(testing)

# Report the three mean squared errors for each model.
for result in results:
    print("Model: {0},\n cross_validation_method_1: {1},\n singular_testing_method:{2},\n cross_validation_method_2: {3} \n".format(result['model'],result['cross_validation_method_1'],result['singular_testing_method'],result['cross_validation_method_2']))
    

**Discussion**

In this notebook, I practiced the basics of training models for value prediction. The dataset was avocado.csv. I tried to predict the AveragePrice of the Albany region using only the position of each observation in time as the input feature.

1. `cross_validation_method_1` used X_train and y_train to predict values with cross-validation.
2. `singular_testing_method` used X_train and y_train to learn and X_test values to predict.
3. `cross_validation_method_2` used X_test and y_test for cross-validated predictions, so its models were trained on folds of the test set only.

It seems that all methods of testing are giving a very small mean squared error. The cross-validation error is larger, however. I'd say this is not a problem, because the cross-validation method presumably gives a more general answer than the singular testing method. I was a little baffled that I could not use `cross_val_predict` in the same way as `model.predict`; the reason is that `cross_val_predict` refits a fresh copy of the model on each fold instead of reusing the already fitted one. Therefore, I'd argue the results of cross_validation_method_1 are not comparable with the other two methods, and this is what leads to its mean squared error being noticeably large. More simply put: cross_validation_method_1 was not tested on the same data as singular_testing_method and cross_validation_method_2, and that's why there's a gap in prediction error.
