In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Read the Data

In [None]:
# Every competition CSV is read from the same Kaggle input directory; keeping that
# location in one constant means it only has to be changed in a single place.
DATA_DIR = '/kaggle/input/store-sales-time-series-forecasting'

oil = pd.read_csv(f'{DATA_DIR}/oil.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
holidays_events = pd.read_csv(f'{DATA_DIR}/holidays_events.csv')
stores = pd.read_csv(f'{DATA_DIR}/stores.csv')
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')
transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv')

# 2. Merge the total features to train_data / test_data

## 2-1. Oil preprocessing and merge to train_data / test_data

In [None]:
# Fill gaps in the oil table: forward-fill first, then back-fill whatever is still
# missing at the very start. `fillna(method=...)` is deprecated in recent pandas,
# so the dedicated ffill()/bfill() methods are used instead.
oil = oil.ffill().bfill()

# Index by date for the later merges. The guard keeps the cell re-runnable: once
# 'date' has become the index it is no longer a column and set_index would raise.
if 'date' in oil.columns:
    oil = oil.set_index('date')

In [None]:
# Left-join the oil table onto train/test by date, then forward-fill the rows whose
# date has no oil entry. ffill() replaces the deprecated fillna(method='pad').
train_data_oil = pd.merge(train_data, oil, on='date', how='left').ffill()

test_data_oil = pd.merge(test_data, oil, on='date', how='left').ffill()

## 2-2. Holiday_events preprocessing and merge to train_data / test_data

In [None]:
# A left join on 'date' emits one output row per matching right-hand row, so any
# date listed more than once in holidays_events would duplicate train/test rows
# (and a duplicated test row would break the row-for-row alignment with
# sample_submission). Keep a single event per date before merging.
# NOTE(review): keeping the *first* event of a date is an arbitrary choice — confirm
# whether another rule (e.g. preferring national events) is wanted.
holidays_by_date = holidays_events.drop_duplicates(subset='date', keep='first')

# validate='many_to_one' makes pandas raise if the right-hand key is ever not unique.
train_data_oil_holiday = pd.merge(
    train_data_oil, holidays_by_date, on='date', how='left', validate='many_to_one'
)
# Dates without a holiday entry get the placeholder 'Empty' in every missing cell.
train_data_oil_holiday = train_data_oil_holiday.fillna('Empty')

test_data_oil_holiday = pd.merge(
    test_data_oil, holidays_by_date, on='date', how='left', validate='many_to_one'
)
test_data_oil_holiday = test_data_oil_holiday.fillna('Empty')

## 2-3. Transactions preprocessing and merge to train_data / test_data

In [None]:
# Attach the transaction counts per (date, store_nbr); pairs without a match get 0.
# Note: section 2-4 reassigns both *_transactions names from the *_oil_holiday frames.
transaction_keys = ['date', 'store_nbr']

train_data_oil_holiday_transactions = train_data_oil_holiday.merge(
    transactions, how='left', on=transaction_keys
)
train_data_oil_holiday_transactions['transactions'] = (
    train_data_oil_holiday_transactions['transactions'].fillna(0)
)

test_data_oil_holiday_transactions = test_data_oil_holiday.merge(
    transactions, how='left', on=transaction_keys
)
test_data_oil_holiday_transactions['transactions'] = (
    test_data_oil_holiday_transactions['transactions'].fillna(0)
)

## 2-4. Store preprocessing and merge to train_data / test_data

In [None]:
# Attach the store attributes by store_nbr.
# NOTE(review): both merges start from the *_oil_holiday frames, not from the frames
# built in section 2-3, so the 'transactions' column is not carried forward even
# though the results keep the *_transactions names. Confirm this is intended before
# changing it — it alters the model's feature set.
train_data_oil_holiday_transactions = train_data_oil_holiday.merge(
    stores, how='left', on='store_nbr'
)

test_data_oil_holiday_transactions = test_data_oil_holiday.merge(
    stores, how='left', on='store_nbr'
)


## 2-5. Year, Month, Day, weekend, weekdays columns merge to train_data / test_data

In [None]:
def split_year(time):
    """Return the first '-'-separated field of ``time`` as an int (the year of 'YYYY-MM-DD')."""
    fields = time.split('-')
    return int(fields[0])

def split_month(time):
    """Return the second '-'-separated field of ``time`` as an int (the month of 'YYYY-MM-DD')."""
    fields = time.split('-')
    return int(fields[1])

def split_day(time):
    """Return the third '-'-separated field of ``time`` as an int (the day of 'YYYY-MM-DD')."""
    fields = time.split('-')
    return int(fields[2])

def weekend(date):
    """Flag which dates fall on a Saturday or Sunday.

    Parameters
    ----------
    date : Series or other array-like of date strings / datetimes
        Anything ``pd.to_datetime`` can parse.

    Returns
    -------
    list of int
        One entry per input date, in input order: ``1`` when the weekday number is
        5 or 6 (Saturday/Sunday, Monday being 0), otherwise ``0``.
    """
    # Vectorised: wrapping in a Series gives the `.dt` accessor for plain lists too,
    # and avoids a Python-level `.iloc[i].weekday()` call for every row.
    parsed = pd.Series(pd.to_datetime(date))
    return (parsed.dt.weekday >= 5).astype(int).tolist()
#Weekday
def weekday(date):
    """Return the day-of-week number of every date.

    Parameters
    ----------
    date : Series or other array-like of date strings / datetimes
        Anything ``pd.to_datetime`` can parse.

    Returns
    -------
    list
        One entry per input date, in input order, with Monday = 0 ... Sunday = 6.
    """
    # Vectorised: wrapping in a Series gives the `.dt` accessor for plain lists too,
    # and avoids a Python-level `.iloc[i].weekday()` call for every row.
    parsed = pd.Series(pd.to_datetime(date))
    return parsed.dt.weekday.tolist()

# Add the same five calendar columns, derived from the 'date' strings, to the train
# frame first and then to the test frame.
for frame in (train_data_oil_holiday_transactions, test_data_oil_holiday_transactions):
    date_strings = frame['date']
    frame['Year'] = date_strings.apply(split_year)
    frame['Month'] = date_strings.apply(split_month)
    frame['Day'] = date_strings.apply(split_day)
    frame['Weekend'] = weekend(date_strings)
    frame['Weekday'] = weekday(date_strings)

# 3. EDA - The relationship between each feature and the mean / median of sales

In [None]:
# Give the two suffixed 'type' columns descriptive names.
# (presumably type_x comes from holidays_events and type_y from stores — verify)
type_column_names = {'type_x': 'holiday_type', 'type_y': 'shop_type'}

train_data_oil_holiday_transactions = train_data_oil_holiday_transactions.rename(
    columns=type_column_names
)
test_data_oil_holiday_transactions = test_data_oil_holiday_transactions.rename(
    columns=type_column_names
)

## 3-0. Examine the disaster period, 2016-04-16 to 2016-10-16

In [None]:
# Rows whose date string lies in the window 2016-04-16 .. 2016-10-16, both ends
# included (the dates are compared as strings).
in_disaster_window = train_data_oil_holiday_transactions['date'].between('2016-04-16', '2016-10-16')
disaster = train_data_oil_holiday_transactions[in_disaster_window]

# Show those rows with the highest sales first.
disaster.sort_values(by='sales', ascending=False)

*Sales are unusually high on some days in this period. To keep these rows from influencing the model, I remove them from the training data.*

In [None]:
# Remove the disaster-window rows from the training frame. errors='ignore' keeps the
# cell re-runnable: on a second run those index labels are already gone and a plain
# drop() would raise KeyError.
train_data_oil_holiday_transactions = train_data_oil_holiday_transactions.drop(
    index=disaster.index, errors='ignore'
)

## 3-1. Store_nbr features relate the sales means / middle

In [None]:
# Mean and median sales per store, each sorted from the highest value down.
sales_by_store_nbr = train_data_oil_holiday_transactions.groupby('store_nbr')
store_nbr_sales_means = sales_by_store_nbr.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
store_nbr_sales_medians = sales_by_store_nbr.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.lineplot(x=store_nbr_sales_means.store_nbr, y=store_nbr_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.lineplot(x=store_nbr_sales_medians.store_nbr, y=store_nbr_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('Store_nbr : Comparsion with Mean and Middle')
# Render the figure explicitly so the cell output is the plot, not the title's repr.
plt.show()

### Sales differ strongly between stores (store_nbr), especially for stores 40–50 and 0–5

## 3-2. Family features relate the sales means / middle

In [None]:
# Top 20 product families by mean sales and top 20 by median sales.
sales_by_family = train_data_oil_holiday_transactions.groupby('family')
family_sales_mean = sales_by_family.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False).head(20)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
family_sales_median = sales_by_family.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False).head(20)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the bars/tick labels of the two layers can
# disagree. A family outside the median top 20 simply gets no green bar.
family_order = family_sales_mean.family.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=family_sales_mean.family, y=family_sales_mean.sales, order=family_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=family_sales_median.family, y=family_sales_median.sales, order=family_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.xticks(rotation=90)
plt.title('Family : Comparsion with Mean and Middle')
plt.show()

### GROCERY I, BEVERAGES, CLEANING and DAIRY have the strongest influence on the mean / median sales

## 3-3. Onpromotion features relate the sales means / middle

In [None]:

# Mean and median sales for each value of onpromotion, sorted from the highest down.
sales_by_onpromotion = train_data_oil_holiday_transactions.groupby('onpromotion')
onpromotion_sales_means = sales_by_onpromotion.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
onpromotion_sales_medians = sales_by_onpromotion.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.lineplot(x=onpromotion_sales_means.onpromotion, y=onpromotion_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.lineplot(x=onpromotion_sales_medians.onpromotion, y=onpromotion_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
# The title previously named the holiday-type feature (copy-paste slip); this plot
# shows onpromotion.
plt.title('Onpromotion : Comparsion with Mean and Middle')
plt.show()

### The onpromotion feature influences the sales

## 3-4. Dcoilwtico features relate the sales means / middle

In [None]:

# Mean and median sales for each distinct oil price, sorted from the highest down.
sales_by_dcoilwtico = train_data_oil_holiday_transactions.groupby('dcoilwtico')
dcoilwtico_sales_means = sales_by_dcoilwtico.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
dcoilwtico_sales_medians = sales_by_dcoilwtico.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.lineplot(x=dcoilwtico_sales_means.dcoilwtico, y=dcoilwtico_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.lineplot(x=dcoilwtico_sales_medians.dcoilwtico, y=dcoilwtico_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('Dcoilwtico : Comparsion with Mean and Middle')
plt.show()
plt.clf()

### The oil price (dcoilwtico) has a strong influence on the mean / median sales

## 3-5. Holiday_type features relate the sales means / middle

In [None]:
# Mean and median sales per holiday type, sorted from the highest value down.
sales_by_holiday_type = train_data_oil_holiday_transactions.groupby('holiday_type')
holiday_type_sales_means = sales_by_holiday_type.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
holiday_type_sales_medians = sales_by_holiday_type.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the two layers (and the tick labels) can disagree.
holiday_type_order = holiday_type_sales_means.holiday_type.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=holiday_type_sales_means.holiday_type, y=holiday_type_sales_means.sales, order=holiday_type_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=holiday_type_sales_medians.holiday_type, y=holiday_type_sales_medians.sales, order=holiday_type_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('Hoilday_type : Comparsion with Mean and Middle')
plt.show()

## 3-6. Locale features relate the sales means / middle

In [None]:
# Top 20 locale values by mean sales and top 20 by median sales.
sales_by_locale = train_data_oil_holiday_transactions.groupby('locale')
locale_sales_mean = sales_by_locale.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False).head(20)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
locale_sales_median = sales_by_locale.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False).head(20)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the two layers (and the tick labels) can disagree.
locale_order = locale_sales_mean.locale.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=locale_sales_mean.locale, y=locale_sales_mean.sales, order=locale_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=locale_sales_median.locale, y=locale_sales_median.sales, order=locale_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.xticks(rotation=90)
plt.title('locale Comparsion with Mean and Middle')
plt.show()

## 3-7. Locale_name features relate the sales means / middle

In [None]:
# Top 20 locale_name values by mean sales and top 20 by median sales.
sales_by_locale_name = train_data_oil_holiday_transactions.groupby('locale_name')
locale_name_sales_mean = sales_by_locale_name.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False).head(20)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
locale_name_sales_median = sales_by_locale_name.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False).head(20)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the two layers (and the tick labels) can disagree.
# A value outside the median top 20 simply gets no green bar.
locale_name_order = locale_name_sales_mean.locale_name.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=locale_name_sales_mean.locale_name, y=locale_name_sales_mean.sales, order=locale_name_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=locale_name_sales_median.locale_name, y=locale_name_sales_median.sales, order=locale_name_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.xticks(rotation=90)
plt.title('locale_name Comparsion with Mean and Middle')
plt.show()

## 3-8. Description features relate the sales means / middle

In [None]:
# Top 20 holiday descriptions by mean sales and top 20 by median sales.
sales_by_description = train_data_oil_holiday_transactions.groupby('description')
description_sales_mean = sales_by_description.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False).head(20)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
description_sales_median = sales_by_description.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False).head(20)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the two layers (and the tick labels) can disagree.
# A description outside the median top 20 simply gets no green bar.
description_order = description_sales_mean.description.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=description_sales_mean.description, y=description_sales_mean.sales, order=description_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=description_sales_median.description, y=description_sales_median.sales, order=description_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.xticks(rotation=90)
plt.title('description Comparsion with Mean and Middle')
plt.show()

## 3-9. Transferred features relate the sales means / middle

In [None]:
# Mean and median sales per value of the 'transferred' flag (at most 20 values kept).
sales_by_transferred = train_data_oil_holiday_transactions.groupby('transferred')
transferred_mean = sales_by_transferred.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False).head(20)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
transferred_median = sales_by_transferred.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False).head(20)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the two layers (and the tick labels) can disagree.
transferred_order = transferred_mean.transferred.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=transferred_mean.transferred, y=transferred_mean.sales, order=transferred_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=transferred_median.transferred, y=transferred_median.sales, order=transferred_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.xticks(rotation=90)
plt.title('transferred Comparsion with Mean and Middle')
plt.show()

## 3-10. City features relate the sales means / middle

In [None]:
# Top 20 cities by mean sales and top 20 by median sales.
sales_by_city = train_data_oil_holiday_transactions.groupby('city')
city_mean = sales_by_city.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False).head(20)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
city_median = sales_by_city.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False).head(20)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the two layers (and the tick labels) can disagree.
# A city outside the median top 20 simply gets no green bar.
city_order = city_mean.city.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=city_mean.city, y=city_mean.sales, order=city_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=city_median.city, y=city_median.sales, order=city_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.xticks(rotation=90)
plt.title('city Comparsion with Mean and Middle')
plt.show()

## 3-11. State features relate the sales means / middle

In [None]:
# Top 20 states by mean sales and top 20 by median sales.
sales_by_state = train_data_oil_holiday_transactions.groupby('state')
state_mean = sales_by_state.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False).head(20)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
state_median = sales_by_state.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False).head(20)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the two layers (and the tick labels) can disagree.
state_order = state_mean.state.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=state_mean.state, y=state_mean.sales, order=state_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=state_median.state, y=state_median.sales, order=state_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.xticks(rotation=90)
plt.title('state Comparsion with Mean and Middle')
plt.show()

## 3-12. Shop_type features relate the sales means / middle

In [None]:
# Mean and median sales per shop type (at most 20 values kept).
sales_by_shop_type = train_data_oil_holiday_transactions.groupby('shop_type')
shop_type_mean = sales_by_shop_type.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False).head(20)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
shop_type_median = sales_by_shop_type.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False).head(20)

# Both bar layers must share one category order; otherwise each barplot call infers
# its own order from its data and the two layers (and the tick labels) can disagree.
shop_type_order = shop_type_mean.shop_type.tolist()

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=shop_type_mean.shop_type, y=shop_type_mean.sales, order=shop_type_order, color='r', label='Means', alpha=0.3)
sns.barplot(x=shop_type_median.shop_type, y=shop_type_median.sales, order=shop_type_order, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.xticks(rotation=90)
# The title previously said 'state' (copy-paste slip); this plot shows shop_type.
plt.title('shop_type Comparsion with Mean and Middle')
plt.show()

## 3-13. Cluster features relate the sales means / middle

In [None]:

# Mean and median sales per store cluster, sorted from the highest value down.
sales_by_cluster = train_data_oil_holiday_transactions.groupby('cluster')
cluster_sales_means = sales_by_cluster.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
cluster_sales_medians = sales_by_cluster.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.lineplot(x=cluster_sales_means.cluster, y=cluster_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.lineplot(x=cluster_sales_medians.cluster, y=cluster_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('cluster : Comparsion with Mean and Middle')
plt.show()
plt.clf()

## 3-14. Year features relate the sales means / middle

In [None]:

# Mean and median sales per year, sorted from the highest value down.
sales_by_year = train_data_oil_holiday_transactions.groupby('Year')
Year_sales_means = sales_by_year.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
Year_sales_medians = sales_by_year.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=Year_sales_means.Year, y=Year_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.barplot(x=Year_sales_medians.Year, y=Year_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('Year : Comparsion with Mean and Middle')
plt.show()
plt.clf()

## 3-15. Month features relate the sales means / middle

In [None]:
# Mean and median sales per calendar month, sorted from the highest value down.
sales_by_month = train_data_oil_holiday_transactions.groupby('Month')
Month_sales_means = sales_by_month.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
Month_sales_medians = sales_by_month.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=Month_sales_means.Month, y=Month_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.barplot(x=Month_sales_medians.Month, y=Month_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('Month : Comparsion with Mean and Middle')
plt.show()
plt.clf()

## 3-16. Day features relate the sales means / middle

In [None]:
# Mean and median sales per day of the month, sorted from the highest value down.
sales_by_day = train_data_oil_holiday_transactions.groupby('Day')
Day_sales_means = sales_by_day.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
Day_sales_medians = sales_by_day.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=Day_sales_means.Day, y=Day_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.barplot(x=Day_sales_medians.Day, y=Day_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('Day : Comparsion with Mean and Middle')
plt.show()
plt.clf()

## 3-17. Weekend features relate the sales means / middle

In [None]:
# Mean and median sales for weekend (1) versus non-weekend (0) rows.
sales_by_weekend = train_data_oil_holiday_transactions.groupby('Weekend')
Weekend_sales_means = sales_by_weekend.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
Weekend_sales_medians = sales_by_weekend.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=Weekend_sales_means.Weekend, y=Weekend_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.barplot(x=Weekend_sales_medians.Weekend, y=Weekend_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('Weekend : Comparsion with Mean and Middle')
plt.show()
plt.clf()

## 3-18. Weekday features relate the sales means / middle

In [None]:
# Mean and median sales per day-of-week number, sorted from the highest value down.
sales_by_weekday = train_data_oil_holiday_transactions.groupby('Weekday')
Weekday_sales_means = sales_by_weekday.agg({'sales': 'mean'}).reset_index().sort_values(by='sales', ascending=False)
# The 'median' string alias replaces the np.median callable, which recent pandas
# releases warn about when it is passed to groupby.agg.
Weekday_sales_medians = sales_by_weekday.agg({'sales': 'median'}).reset_index().sort_values(by='sales', ascending=False)

sns.set()
plt.figure(figsize=(20, 5))
sns.barplot(x=Weekday_sales_means.Weekday, y=Weekday_sales_means.sales, color='r', label='Means', alpha=0.3)
sns.barplot(x=Weekday_sales_medians.Weekday, y=Weekday_sales_medians.sales, color='g', label='Middle', alpha=0.3)
plt.legend()
plt.title('Weekday : Comparsion with Mean and Middle')
plt.show()
plt.clf()

# 4. Data Feature's drop

In [None]:
# Columns removed from both frames before encoding and modelling.
unused_columns = ['id', 'date', 'transferred', 'Day', 'Weekday']

train_data_oil_holiday_transactions = train_data_oil_holiday_transactions.drop(columns=unused_columns)
test_data_oil_holiday_transactions = test_data_oil_holiday_transactions.drop(columns=unused_columns)

# 5. Feature's encoder by LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per categorical column: each is fitted on the training values
# only, then applied to both the train and the test column.

# family
encoder_family = LabelEncoder().fit(train_data_oil_holiday_transactions['family'])
train_data_oil_holiday_transactions['family'] = encoder_family.transform(train_data_oil_holiday_transactions['family'])
test_data_oil_holiday_transactions['family'] = encoder_family.transform(test_data_oil_holiday_transactions['family'])

# holiday_type
encoder_type = LabelEncoder().fit(train_data_oil_holiday_transactions['holiday_type'])
train_data_oil_holiday_transactions['holiday_type'] = encoder_type.transform(train_data_oil_holiday_transactions['holiday_type'])
test_data_oil_holiday_transactions['holiday_type'] = encoder_type.transform(test_data_oil_holiday_transactions['holiday_type'])

# locale
encoder_locale = LabelEncoder().fit(train_data_oil_holiday_transactions['locale'])
train_data_oil_holiday_transactions['locale'] = encoder_locale.transform(train_data_oil_holiday_transactions['locale'])
test_data_oil_holiday_transactions['locale'] = encoder_locale.transform(test_data_oil_holiday_transactions['locale'])

# description
encoder_description = LabelEncoder().fit(train_data_oil_holiday_transactions['description'])
train_data_oil_holiday_transactions['description'] = encoder_description.transform(train_data_oil_holiday_transactions['description'])
test_data_oil_holiday_transactions['description'] = encoder_description.transform(test_data_oil_holiday_transactions['description'])

# locale_name
encoder_locale_name = LabelEncoder().fit(train_data_oil_holiday_transactions['locale_name'])
train_data_oil_holiday_transactions['locale_name'] = encoder_locale_name.transform(train_data_oil_holiday_transactions['locale_name'])
test_data_oil_holiday_transactions['locale_name'] = encoder_locale_name.transform(test_data_oil_holiday_transactions['locale_name'])

# city
encoder_city = LabelEncoder().fit(train_data_oil_holiday_transactions['city'])
train_data_oil_holiday_transactions['city'] = encoder_city.transform(train_data_oil_holiday_transactions['city'])
test_data_oil_holiday_transactions['city'] = encoder_city.transform(test_data_oil_holiday_transactions['city'])

# state
encoder_state = LabelEncoder().fit(train_data_oil_holiday_transactions['state'])
train_data_oil_holiday_transactions['state'] = encoder_state.transform(train_data_oil_holiday_transactions['state'])
test_data_oil_holiday_transactions['state'] = encoder_state.transform(test_data_oil_holiday_transactions['state'])

# shop_type
encoder_shop_type = LabelEncoder().fit(train_data_oil_holiday_transactions['shop_type'])
train_data_oil_holiday_transactions['shop_type'] = encoder_shop_type.transform(train_data_oil_holiday_transactions['shop_type'])
test_data_oil_holiday_transactions['shop_type'] = encoder_shop_type.transform(test_data_oil_holiday_transactions['shop_type'])

# 6. XGBoostRegressor

In [None]:
# Target is the 'sales' column; the features are every remaining column.
target = train_data_oil_holiday_transactions['sales']
data = train_data_oil_holiday_transactions.drop(columns=['sales'])

from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation: 80% train, fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=5,
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
# Configure the gradient-boosted regressor, fit it on the training split and
# predict the held-out split.
XG = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=3,
    n_estimators=500,
)
XG.fit(x_train, y_train)
y_pred_XG = XG.predict(x_test)
def relu(x):
    """Return a list with every negative element of ``x`` replaced by 0.

    Non-negative elements are passed through unchanged, in their original order.
    """
    return [0 if value < 0 else value for value in x]

# Scatter the held-out true sales against the predictions (negatives clipped to 0).
plt.scatter(y_test, relu(y_pred_XG))
# Red y = x reference line from 0 to 90000: points on it are perfect predictions.
diagonal = [10000 * step for step in range(10)]
plt.plot(diagonal, diagonal, color='r')
plt.xlabel("Reality")
plt.ylabel("Predicted")
# The title previously said 'CatBoostRegressor', but the model plotted is the
# XGBoost regressor.
plt.title('XGBoostRegressor')
plt.show()
plt.clf()
# Score of the raw (unclipped) predictions on the held-out split.
print(XG.score(x_test, y_test))

# 7. Submission

In [None]:
# Predict the test frame, clip negative predictions to 0, and write the submission.
sub = XG.predict(test_data_oil_holiday_transactions)
clipped_sales = relu(sub)
sample_submission['sales'] = clipped_sales
sample_submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Display the submission frame (with the 'sales' column filled in) as the cell output.
sample_submission