In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

> # Read the data

In [None]:
# Load every competition table into its own DataFrame.
# The dataset directory is named once so it can be changed in a single place.
DATA_DIR = "/kaggle/input/store-sales-time-series-forecasting"

oil_df = pd.read_csv(f"{DATA_DIR}/oil.csv")
holidays_df = pd.read_csv(f"{DATA_DIR}/holidays_events.csv")
stores_df = pd.read_csv(f"{DATA_DIR}/stores.csv")
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
transactions_df = pd.read_csv(f"{DATA_DIR}/transactions.csv")

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

> **The train and test data frames have the same features, except that `sales` is present only in the train data.**

In [None]:
# Preview the first rows of the table loaded from oil.csv.
oil_df.head()

In [None]:
# Preview the first rows of the table loaded from holidays_events.csv.
holidays_df.head()

In [None]:
# Preview the first rows of the table loaded from stores.csv.
stores_df.head()

In [None]:
# Preview the first rows of the table loaded from transactions.csv.
transactions_df.head()

 > **Wrangle the train data frame**

In [None]:
# Number of missing values in each column of the training data.
train_df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
train_df.duplicated().sum()

In [None]:
# Summary statistics of the training columns, rounded to two decimals.
train_df.describe().round(2)

> # EDA

In [None]:
# Number of distinct values in the `id` column.
train_df['id'].nunique()

In [None]:
# Summary statistics of the `sales` column, rounded to two decimals.
train_df['sales'].describe().round(2)

In [None]:
# Print the column dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Parse the `date` column into pandas datetimes so the .dt accessor and
# date-based grouping used in later cells work.
train_df['date'] = pd.to_datetime(train_df['date'])

In [None]:
# Raw sales values plotted against their dates (one point per training row).
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(train_df['date'], train_df['sales'])
plt.show()

> **Group Sales by month**

In [None]:
# Total sales per month name (all years pooled together), ordered from the
# smallest total to the largest -- not in calendar order.
month_names = train_df['date'].dt.strftime('%B')
months_sales = train_df['sales'].groupby(month_names).sum().sort_values()

In [None]:
# Line plot of the per-month sales totals, in the order stored in months_sales.
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(months_sales)
ax.set_title("the sales by month", fontsize=15)
ax.set_xlabel("the month", fontsize=13)
ax.set_ylabel("the sales", fontsize=13)
plt.show()

> **Sales Per year**

In [None]:
# Total sales in consecutive 2-day bins.
# `sales` is selected before aggregating so that only that column is summed;
# summing the whole frame first would also aggregate every other column
# (including the string-valued `family` column) just to discard the result.
plt.figure(figsize=(15,8))
train_df.groupby(pd.Grouper(key='date', freq='2D'))['sales'].sum().plot()
plt.ylabel("the sales", fontsize=15)
plt.show()

In [None]:
# Total sales per calendar year.
# `sales` is selected before aggregating so that only that column is summed;
# summing the whole frame first would also aggregate every other column
# (including the string-valued `family` column) just to discard the result.
# NOTE(review): recent pandas releases deprecate the 'Y' alias in favour of
# 'YE'; '1Y' is kept here so the cell still runs on older pandas versions.
plt.figure(figsize=(10,8))
train_df.groupby(pd.Grouper(key='date', freq='1Y'))['sales'].sum().plot()
plt.title("the sales per year", fontsize=15)
plt.ylabel("the sales", fontsize=15)
plt.show()

> **Sales decreased after 2016**

In [None]:
# Quick look at the first two training rows.
train_df.head(2)

> **Which product family has the most sales records?**

In [None]:
# Bar chart of how many training rows belong to each product family.
fig, ax = plt.subplots(figsize=(15, 8))
family_counts = train_df.groupby("family")['family'].count()
family_counts.plot.bar(ax=ax)
ax.set_title("the count of each family sales", fontsize=15)
plt.show()

> All families have the same count.

**Group on-promotion by month**

In [None]:
# Total `onpromotion` per month name (all years pooled together), ordered
# from the smallest total to the largest -- not in calendar order.
promo_month_names = train_df['date'].dt.strftime('%B')
months_pormotions = train_df['onpromotion'].groupby(promo_month_names).sum().sort_values()

In [None]:
# Line plot of the per-month promotion totals, in the order stored in
# months_pormotions.
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(months_pormotions)
ax.set_title("the pormotions by month", fontsize=15)
ax.set_xlabel("the month", fontsize=13)
ax.set_ylabel("the pormotions", fontsize=13)
plt.show()

**Group promotions by year**

In [None]:
# Total `onpromotion` per calendar year.
# The column is selected before aggregating so that only it is summed;
# summing the whole frame first would also aggregate every other column
# (including the string-valued `family` column) just to discard the result.
# NOTE(review): recent pandas releases deprecate the 'Y' alias in favour of
# 'YE'; '1Y' is kept here so the cell still runs on older pandas versions.
plt.figure(figsize=(10,8))
train_df.groupby(pd.Grouper(key='date', freq='1Y'))['onpromotion'].sum().plot()
plt.title("the promotion per year", fontsize=15)
plt.ylabel("the promotions", fontsize=15)
plt.show()

> **The sales per store**

In [None]:
# Total sales for each store number.
stores_sales = train_df['sales'].groupby(train_df['store_nbr']).sum()

In [None]:
# Order the stores from the lowest to the highest total sales.
stores_sales = stores_sales.sort_values()

In [None]:
# Per-store totals, rounded to two decimals for display.
stores_sales.round(2)

> **The top 10 sales per store**

In [None]:
# Bar chart of the last ten entries of stores_sales (the ten largest totals
# once the series has been sorted in ascending order).
fig, ax = plt.subplots(figsize=(10, 8))
stores_sales.tail(10).plot.bar(ax=ax)
ax.set_title("The top 10 sales per store", fontsize=15)
ax.set_xlabel("the store")
ax.set_ylabel("the sales")
plt.show()

>**Stores data frame EDA** 

In [None]:
# Print the column dtypes and non-null counts of the stores table.
stores_df.info()

In [None]:
# Quick look at the first two store rows.
stores_df.head(2)

In [None]:
# Store numbers in the order of stores_sales (its index holds store_nbr).
sales_nums = stores_sales.index

In [None]:
# Last ten store numbers, i.e. the ten best sellers when stores_sales has
# been sorted in ascending order.
top_stores = sales_nums[-10:]

In [None]:
# Cities of the top-selling stores, in stores_df row order; a city appears
# once for every top store located in it.
# A boolean mask replaces the hand-maintained row counter, so the lookup no
# longer depends on stores_df having a default 0..n-1 index.
cities = stores_df.loc[stores_df['store_nbr'].isin(top_stores), 'city'].tolist()

In [None]:
# Show the collected city names.
cities

In [None]:
# Wrap the city names in a pandas Series (via a NumPy array) so they can be
# plotted with the pandas plotting helpers.
cities = pd.Series(np.array(cities))

> **The top sales per city**

In [None]:
# Histogram of how often each city occurs among the top-selling stores.
ax = cities.hist()
ax.set_title("the freq of top sales stores per city")
ax.set_ylabel("the count of top stores on the city")
plt.show()

> #   Modeling

> **DecisionTreeRegressor**

In [None]:
# Replace the `date` column with its numeric representation.
# NOTE(review): this overwrites the parsed dates, so the date-based cells
# above cannot be re-run after this one without reloading the data.
train_df['date'] = pd.to_numeric(train_df['date'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn an integer code for every product family and replace the `family`
# column with those codes.
le = LabelEncoder()
le.fit(train_df['family'])
train_df['family'] = le.transform(train_df['family'])

In [None]:
# Check the first two rows after encoding `family`.
train_df.head(2)

In [None]:
# Feature matrix: every training column except the target, the row id and
# the date. Target vector: the `sales` column.
X = train_df.drop(columns=['sales', 'id', 'date']).to_numpy()
y = train_df['sales'].to_numpy()

In [None]:
# (rows, features) of the feature matrix.
X.shape

In [None]:
# Length of the target vector; should match the row count of X.
y.shape

In [None]:
# Hold out 15% of the rows for validation; random_state makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so this is a random
# split rather than a chronological one -- confirm that is intended for
# time-series data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# Small decision tree: depth capped at 5 and at most 12 leaves, with a fixed
# random_state for repeatability.
tree_params = {
    "max_depth": 5,
    "max_leaf_nodes": 12,
    "random_state": 42,
}
regressor = DecisionTreeRegressor(**tree_params)

In [None]:
# Train the decision tree on the training split.
regressor.fit(X_train, y_train)

In [None]:
# R^2 score of the tree on the rows it was trained on.
regressor.score(X_train, y_train)

In [None]:
# R^2 score of the tree on the held-out rows.
regressor.score(X_test, y_test)

> **Linear Regression**

> Score: 59

In [None]:
# Scatter of sales against the onpromotion value of each training row.
plt.scatter(x=train_df['onpromotion'], y=train_df['sales'])

In [None]:
# Pairwise correlation between the columns of train_df.
train_df.corr()

In [None]:
# from sklearn.linear_model import LinearRegression

In [None]:
# regressor = LinearRegression()

In [None]:
# train_df.iloc[:,-1:]

In [None]:
# X = train_df.iloc[:,-1:].values
# y = train_df.sales.values

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# regressor.fit(X_train, y_train)

In [None]:
# regressor.score(X_test, y_test)

> **XGBoost Regressor**

In [None]:
import xgboost as xg

In [None]:
# Gradient-boosted tree regressor: 10 trees, each up to 10 levels deep.
# 'reg:squarederror' is the current name of the squared-error objective; the
# former alias 'reg:linear' is deprecated. random_state is the
# scikit-learn-style seed parameter of the wrapper.
xgb_r = xg.XGBRegressor(objective='reg:squarederror',
                        n_estimators=10, random_state=123, max_depth=10)

In [None]:
# Train the XGBoost regressor on the training split.
xgb_r.fit(X_train, y_train)

In [None]:
# R^2 score of the XGBoost regressor on the held-out rows.
xgb_r.score(X_test, y_test)

> # Predictions 

In [None]:
# First two test rows before any preprocessing.
test_df.head(2)

In [None]:
# Encode the test families with the mapping already learned from the
# training data. Re-fitting the encoder on the test set could assign
# different integer codes whenever its family values differ from the
# training set's, silently feeding the model mismatched features;
# transform() raises on labels it has not seen instead.
test_df.family = le.transform(test_df.family)

In [None]:
# First two test rows after encoding `family`.
test_df.head(2)

In [None]:
# Feature matrix for the test rows: everything except the row id and date.
# Note that this reuses (and overwrites) the name X from the training step.
X = test_df.drop(columns=["id", "date"]).to_numpy()
X.shape

In [None]:
# Feature values of the first test row.
X[0]

> Decision tree regressor

In [None]:
# predictions = regressor.predict(X)

> XGBoost Regressor

In [None]:
# Predict sales for every test row with the trained XGBoost regressor.
predictions = xgb_r.predict(X)

In [None]:
# Attach the predictions as the `sales` column. Sales cannot be negative,
# but a regressor may output values below zero, so they are clipped at 0.
test_df['sales'] = np.clip(predictions, 0, None)

In [None]:
# First two test rows with the predicted `sales` column attached.
test_df.head(2)

In [None]:
# Keep only the columns that go into the submission file. Selecting them
# (instead of dropping the others in place) lets this cell be re-run
# without raising a KeyError for already-removed columns.
test_df = test_df[['id', 'sales']]

In [None]:
# First two rows of the frame that will be written out.
test_df.head(2)

In [None]:
# Write the frame to the submission file without the index column, then
# print a confirmation message.
submission_path = 'submission.csv'
test_df.to_csv(submission_path, index=False)
print("submission successed")