Задание к лабораторной работе:
1. Выберите набор данных для проведения регрессионного анализа -  
   https://www.kaggle.com/neuromusic/avocado-prices
2. Проведите регрессионный анализ данных из выбранного набора.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# загрузить данные в data

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2)
%matplotlib inline
df = pd.read_csv('../input/avocado-prices/avocado.csv', encoding='utf-8', index_col=False, parse_dates=[0])
df.head(5)

Некоторые колонки в данном наборе данных:
* Date - The date of the observation
* AveragePrice - the average price of a single avocado
* type - conventional or organic
* year - the year
* Region - the city or region of the observation
* Total Volume - Total number of avocados sold
* 4046 - Total number of avocados with PLU 4046 sold
* 4225 - Total number of avocados with PLU 4225 sold
* 4770 - Total number of avocados with PLU 4770 sold

PLU (Price Look Up) код — это код для идентификации свежих фруктов, овощей и зелени.

In [None]:
# Show the column labels of the loaded frame.
column_labels = df.columns
print(column_labels)

In [None]:
# Inspect the Python type of the first "Total Volume" value (label 0).
first_volume = df["Total Volume"][0]
type(first_volume)

In [None]:
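# NOTE(review): disabled leftover — "RM" is not among the columns described above,
# so this conversion presumably belongs to a different dataset; confirm and remove.
#df["RM"] = pd.to_numeric(df["RM"])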

Построим графики попарных совместных распределений величин.

In [None]:
# Pairwise plots for the price column, the three PLU volume columns and Total Volume.
pairplot_columns = ['AveragePrice', '4046', '4225', '4770', 'Total Volume']
g = sns.pairplot(df[pairplot_columns])

Матрица графиков распределений величин: вне диагонали — диаграммы рассеяния (совместные распределения пар величин), на диагонали — гистограммы отдельных величин.

In [None]:
# One panel per predictor: scatter it against Total Volume and overlay a
# degree-1 least-squares fit (red line).
predictors = ['AveragePrice', '4046', '4225', '4770']
target = df["Total Volume"]

plt.figure(figsize=(20, 5))
for position, name in enumerate(predictors, start=1):
    values = df[name]
    grid = np.unique(values)  # sorted distinct x values for drawing the fitted line
    trend = np.poly1d(np.polyfit(values, target, 1))

    plt.subplot(1, 4, position)
    plt.plot(values, target, 'o')
    plt.plot(grid, trend(grid), color='r')
    plt.title(name)
    plt.xlabel(name)
    plt.ylabel('Total Volume')

Разделяем выборку на обучающую и тестовую части.

In [None]:
from sklearn.model_selection import train_test_split

# Target: total number of avocados sold.
y = df['Total Volume']
# Keep only numeric predictors. The frame also holds text/date columns (the
# observation date, the avocado type, the region) that the regressors fitted
# below cannot consume directly.
x = df.drop('Total Volume', axis=1).select_dtypes(include='number')

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn import  linear_model, metrics
# Baseline: SGD linear regression fitted on the features as they are (no scaling).
regr = linear_model.SGDRegressor(random_state=42)
regr.fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Mean absolute error on the held-out split.
baseline_mae = metrics.mean_absolute_error(y_test, y_pred)
print(baseline_mae)

In [None]:

# First ten true targets next to the first ten baseline predictions.
print(y_test[0:10].tolist())
# Scientific notation with two decimals keeps very large predictions readable.
print(["{:.2e}".format(value) for value in y_pred[0:10]])

# %% [code]
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaled_train_data = scaler.fit(x_train, y_train).transform(x_train)
scaled_test_data = scaler.transform(x_test)

# %% [code]
# Refit the same SGD regressor on the standardised features and report test MAE.
y_scaled_pred = regr.fit(scaled_train_data, y_train).predict(scaled_test_data)
scaled_mae = metrics.mean_absolute_error(y_test, y_scaled_pred)
print(scaled_mae)

# %% [code]
# First five true targets next to the predictions of the scaled-feature model.
head = slice(None, 5)
print(y_test.values[head])
print(y_scaled_pred[head])

# %% [code]
import matplotlib.pyplot as plt

# True vs. predicted Total Volume for the SGD model trained on scaled features.
# Raw strings keep the LaTeX "\hat" from being read as a string escape sequence.
plt.scatter(y_test, y_scaled_pred, color="m")
plt.xlabel(r"Total Volume: $Y_i$")
plt.ylabel(r"Predicted Total Volume: $\hat{Y}_i$")
plt.title(r"Total Volume vs Predicted Total Volume: $Y_i$ vs $\hat{Y}_i$")

# %% [code]

from sklearn.pipeline import Pipeline

# Chain scaling and regression so both are always fitted and applied together.
pipeline_steps = [('scaling', scaler), ('regression', regr)]
pipe = Pipeline(steps=pipeline_steps)
pipe.fit(x_train, y_train)

# Test-set MAE of the fitted pipeline.
pipeline_mae = metrics.mean_absolute_error(y_test, pipe.predict(x_test))
print(pipeline_mae)

# %% [code]
# Parameter names exposed by the pipeline; the "<step>__<param>" keys are the
# ones a parameter grid can refer to.
pipeline_params = pipe.get_params()
pipeline_params.keys()

# %% [code]
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the 'regression' step of the pipeline.
# 'squared_error' is the current scikit-learn name of the least-squares loss
# (formerly 'squared_loss'); "no regularisation" is spelled None, not 'none'.
parameters_grid = {
    'regression__loss': ['huber', 'squared_error'],
    'regression__n_iter_no_change': [5, 10, 20],
    'regression__penalty': ['l1', 'l2', None],
    'regression__alpha': [0.0001, 0.001, 0.1],
}
# 4-fold cross-validated search scored by negated mean absolute error
# (scikit-learn maximises scores, hence the sign).
grid_cv = GridSearchCV(pipe, parameters_grid, scoring='neg_mean_absolute_error', cv=4)

grid_cv.fit(x_train, y_train)

# %% [code]
# The search is scored with 'neg_mean_absolute_error', so flip the sign to
# report the best cross-validated MAE as a positive number.
best_cv_mae = -grid_cv.best_score_
print(best_cv_mae)
print(grid_cv.best_params_)

# %% [code]
# True vs. predicted Total Volume for the (un-tuned) pipeline.
# Raw strings keep the LaTeX "\hat" from being read as a string escape sequence.
plt.scatter(y_test, pipe.predict(x_test), color="b", alpha=0.6)
plt.xlabel(r"Total Volume: $Y_i$")
plt.ylabel(r"Predicted Total Volume: $\hat{Y}_i$")
plt.title(r"Total Volume vs Predicted Total Volume: $Y_i$ vs $\hat{Y}_i$")

# %% [code]
# True vs. predicted Total Volume: un-tuned pipeline (blue) against the best
# estimator found by the grid search (red).
# Raw strings keep the LaTeX "\hat" from being read as a string escape sequence.
plt.scatter(y_test, pipe.predict(x_test), color="b", alpha=0.6)
plt.scatter(y_test, grid_cv.best_estimator_.predict(x_test), color="r", alpha=0.6)
plt.xlabel(r"Total Volume: $Y_i$")
plt.ylabel(r"Predicted Total Volume: $\hat{Y}_i$")
plt.title(r"Total Volume vs Predicted Total Volume: $Y_i$ vs $\hat{Y}_i$")
plt.show()

# %% [code]
# Pearson correlations between the numeric training features (non-numeric
# columns, if any, are left out because a correlation is undefined for them).
x_corr = x_train.select_dtypes(include='number').corr(method="pearson")
# Boolean mask hiding the upper triangle, diagonal included: the matrix is
# symmetric, so the lower triangle already shows every pair once.
mask = np.triu(np.ones_like(x_corr, dtype=bool))
fig, ax = plt.subplots(figsize=(10, 8))
# Draw onto the axes created above instead of rebinding `fig` to the return value.
sns.heatmap(x_corr, cmap="RdYlGn_r", mask=mask, ax=ax)

plt.show()

# %% [code]
from sklearn.base import clone

# Pick the numeric feature that is, on average, most strongly correlated with
# the others, and repeat the grid search without it.
abs_corr = x_train.select_dtypes(include='number').corr(method="pearson").abs()
redundant_feature = abs_corr.mean().idxmax()
print(redundant_feature)
x_train_dropped = x_train.drop(redundant_feature, axis=1)
x_test_dropped = x_test.drop(redundant_feature, axis=1)
# clone() yields an independent, unfitted search with the same settings, so the
# search already fitted on the full feature set is not overwritten.
grid_cv_drpd = clone(grid_cv)
grid_cv_drpd.fit(x_train_dropped, y_train)
print(-1*grid_cv_drpd.best_score_)
print(grid_cv_drpd.best_params_)

# %% [code]
# True vs. predicted Total Volume: un-tuned pipeline on all features (blue)
# against the best estimator of the reduced-feature search (red).
# Raw strings keep the LaTeX "\hat" from being read as a string escape sequence.
plt.scatter(y_test, pipe.predict(x_test), color="b", alpha=0.6)
plt.scatter(y_test, grid_cv_drpd.best_estimator_.predict(x_test_dropped), color="r", alpha=0.6)
plt.xlabel(r"Total Volume: $Y_i$")
plt.ylabel(r"Predicted Total Volume: $\hat{Y}_i$")
plt.title(r"Total Volume vs Predicted Total Volume: $Y_i$ vs $\hat{Y}_i$")
plt.show()

# %% [markdown]
# А теперь обучим другую модель — случайный лес (RandomForestRegressor).

# %% [code]
from sklearn.ensemble import RandomForestRegressor

# Random forest fitted on the training split; report its MAE on the test split.
rfr = RandomForestRegressor(random_state=42).fit(x_train, y_train)
rf_predict = rfr.predict(x_test)
rf_mae = metrics.mean_absolute_error(y_test, rf_predict)
print(rf_mae)
# NOTE(review): this re-runs the SGD pipeline grid search; nothing in this cell
# uses the result — presumably a copy-paste leftover, confirm before removing.
grid_cv = GridSearchCV(pipe, parameters_grid, scoring='neg_mean_absolute_error', cv=4)
grid_cv.fit(x_train, y_train)
# True targets vs. random-forest predictions.
plt.scatter(y_test, rf_predict, color="r", alpha=0.6)
