In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
dataset = pd.read_csv('data.csv')
dataset=dataset.drop(columns=['CustomerID'])
dataset['InvoiceDate'] = pd.to_datetime(dataset['InvoiceDate'])
dataset= dataset.set_index('InvoiceDate')
dataset['Description'] = dataset['Description'].fillna('UNKNOWN ITEM')
dataset= dataset[~dataset['InvoiceNo'].str.startswith('C')]
dataset = dataset[dataset['Quantity'] > 0]
dataset=dataset[dataset['UnitPrice']>0]
dataset['Sales']=dataset['Quantity']*dataset['UnitPrice']
dataset_ts = dataset[['Sales']]
dataset_clean = dataset[dataset['UnitPrice'] < 15]
dataset_join = dataset_clean.groupby('InvoiceNo')[['Quantity']].sum()
dataset_join = dataset_join.reset_index()
dataset_clean['InvoiceDate'] = dataset_clean.index
dataset_clean = dataset_clean.merge(dataset_join, how='left', on='InvoiceNo')
dataset_clean = dataset_clean.rename(columns={'Quantity_x' : 'Quantity', 'Quantity_y' : 'QuantityInv'})
dataset_clean['InvoiceDate'] = pd.to_datetime(dataset_clean['InvoiceDate'])
bins_q = pd.IntervalIndex.from_tuples([(0, 2), (2, 5), (5, 8), (8, 11), (11, 14), (15, 5000)])
dataset_clean['QuantityRange'] = pd.cut(dataset_clean['Quantity'], bins=bins_q)
bins_p = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3), (3, 4), (4, 20)])
dataset_clean['PriceRange'] = pd.cut(dataset_clean['UnitPrice'], bins=bins_p)
dataset_clean['Month'] = dataset_clean['InvoiceDate'].dt.month
bins_d = pd.IntervalIndex.from_tuples([(0,3),(3,6),(6,9),(9,12)])
dataset_clean['DateRange'] = pd.cut(dataset_clean['Month'], bins=bins_d, labels=['q1','q2','q3','q4'])
dataset_uk = dataset_clean[dataset_clean['Country'] == 'United Kingdom']
dataset_abroad = dataset_clean[dataset_clean['Country'] != 'United Kingdom']
dataset_uk_model = dataset_uk[['Sales', 'QuantityInv', 'QuantityRange', 'PriceRange', 'DateRange']]
dataset_data = dataset_uk_model.copy()
dataset_data = pd.get_dummies(dataset_data, columns=['QuantityRange'], prefix='qr')
dataset_data = pd.get_dummies(dataset_data, columns=['PriceRange'], prefix='pr')
dataset_data = pd.get_dummies(dataset_data, columns=['DateRange'], prefix='dr')
dataset_data['QuantityInv'] = scale(dataset_data['QuantityInv'])
y = dataset_data['Sales']
X = dataset_data.drop(columns=['Sales'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=42)
import xgboost as xg
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Linear Regression
fit_intercepts = [True, False]
param_grid_linear = dict(fit_intercept=fit_intercepts)
linear_model = LinearRegression()

#Xgboost Regression

xgb_r = xg.XGBRegressor()
param_grid_boost=dict(n_estimators =[10] , seed = [123])
boost_model=xg.XGBRegressor()
  
cv = 5

models_to_test = ['LinearRegression','XgboostRegression']
regression_dict = dict(LinearRegression=linear_model,XgboostRegression=boost_model)
param_grid_dict = dict(LinearRegression=param_grid_linear,XgboostRegression=param_grid_boost)

score_dict = {}
params_dict = {}
mae_dict = {}
mse_dict = {}
r2_dict = {}
best_est_dict = {}

for model in models_to_test:
  regressor = GridSearchCV(regression_dict[model], param_grid_dict[model], cv=cv, n_jobs=-1)
  if model=='XgboostRegression':
        X_train.columns = X_train.columns.str.translate("".maketrans({"[":"{", "]":"}","<":"^"}))
        #y_train.columns = y_train.columns.str.translate("".maketrans({"[":"{", "]":"}","<":"^"}))
        X_test.columns = X_test.columns.str.translate("".maketrans({"[":"{", "]":"}","<":"^"}))


  regressor.fit(X_train, y_train)
  y_pred = regressor.predict(X_test)

  # Print the tuned parameters and score
  print(" === Start report for regressor ",model,"===")
  score_dict[model] = regressor.best_score_
  print("Tuned Parameters: ",regressor.best_params_) 
  params_dict = regressor.best_params_
  print("Best score is",regressor.best_score_)

  # Compute metrics
  mae_dict[model] = mean_absolute_error(y_test, y_pred)
  print("MAE for",model)
  print(mean_absolute_error(y_test, y_pred))
  mse_dict[model] = mean_squared_error(y_test, y_pred)
  print("MSE for",model)
  print(mean_squared_error(y_test, y_pred))
  r2_dict[model] = r2_score(y_test, y_pred)
  print("R2 score for",model)
  print(r2_score(y_test, y_pred))
  print(" === End of report for regressor",model," === \n")
  
  # Add best estimator to the dict
  best_est_dict[model] = regressor.best_estimator_

  #print(regressor.best_estimator_)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_clean['InvoiceDate'] = dataset_clean.index


 === Start report for regressor  LinearRegression ===
Tuned Parameters:  {'fit_intercept': True}
Best score is -1.074166996892226
MAE for LinearRegression
47.39708894596153
MSE for LinearRegression
8643.236817390849
R2 score for LinearRegression
-0.9218835328859833
 === End of report for regressor LinearRegression  === 

 === Start report for regressor  XgboostRegression ===
Tuned Parameters:  {'n_estimators': 10, 'seed': 123}
Best score is 0.44711379328021905
MAE for XgboostRegression
7.7241828687859435
MSE for XgboostRegression
2092.7129794287675
R2 score for XgboostRegression
0.5346707837359679
 === End of report for regressor XgboostRegression  === 

