### Loading Libraries

In [None]:
from datetime import datetime, timedelta,date
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix

### Loading Data

In [None]:
# Every competition CSV sits in the same input directory, so name it once.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

sales_train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
item_categories = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')


In [None]:
# Preview the first 10 rows of the training data.
sales_train.head(10)

In [None]:
# Preview the first 10 rows of the test data.
test.head(10)

In [None]:
# Column dtypes and non-null counts of the training data.
sales_train.info()

**The `date` column has dtype object, so we convert it to datetime.**

In [None]:
# Parse with an explicit day-first format. Without it pandas guesses, and an
# ambiguous value such as 02.01.2013 is read month-first (February 1st), which
# silently corrupts any month/year derived from this column.
# NOTE(review): assumes the raw strings are dd.mm.yyyy - confirm against the
# head() output above; a mismatch raises a ValueError instead of mis-parsing.
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')

### Check for missing values in the train and test data

In [None]:
# Number of missing values in each column of the training data.
sales_train.isna().sum()

In [None]:
# Number of missing values in each column of the test data.
test.isna().sum()

**As we can see, there are no missing values in the sales_train and test dataframes, which is good news.**

# EDA

In [None]:
sns.set(rc={'figure.figsize': (20, 10)})
sns.set_context("talk", font_scale=1)

# Total units sold per shop. Select item_cnt_day *before* aggregating: summing
# the whole frame also tries to sum every other column (including the datetime
# `date` column, which recent pandas versions refuse to sum) and wastes work.
sales_month_shop_id = (
    sales_train
    .groupby('shop_id')['item_cnt_day']
    .sum()
    .rename('sum_sales')
    .reset_index()
)
sns.barplot(x='shop_id', y='sum_sales', data=sales_month_shop_id, palette='Paired')
plt.title('Distribution of sales per shop');
# The aggregate is only needed for this figure.
del sales_month_shop_id



In [None]:
# Total units sold per item, kept as a one-column DataFrame indexed by item_id.
# Only item_cnt_day is aggregated; summing the whole frame would also try to
# sum unrelated columns such as the datetime `date` column.
sales_item_id = sales_train.groupby('item_id')[['item_cnt_day']].sum()
plt.xlabel('item id')
plt.ylabel('sales')
plt.plot(sales_item_id);

In [None]:
# sales_item_id is indexed by item_id, so idxmax() returns the id of the
# best-selling item. argmax() would return the row *position* instead, which
# only equals the item_id if every id from 0 upward is present in the data.
anom_item = sales_item_id.item_cnt_day.idxmax()
print(anom_item)

In [None]:
# Look up the item computed in the previous cell rather than a hard-coded id,
# so this cell stays correct if the data or the aggregation changes.
items[items['item_id'] == anom_item]

In [None]:
sns.set_context("talk", font_scale=0.8)
sales_item_cat = sales_train.merge(items, how='left', on='item_id').groupby('item_category_id').item_cnt_day.sum()
sns.barplot(x ='item_category_id', y='item_cnt_day',
            data=sales_item_cat.reset_index(), 
            palette='Paired'
           );
del sales_item_cat

### Check for any outliers

In [None]:
sns.set(style="whitegrid")
# One star marker per sale (price against item id), with no connecting line.
plt.plot(
    sales_train['item_id'],
    sales_train['item_price'],
    linestyle='None',
    marker='*',
    color='MediumBlue',
);

**It looks like we have one outlier, let's see what it is.**

In [None]:
# Rows whose price lies above the visual cut-off seen in the scatter plot.
sales_train.query('item_price > 250000')

In [None]:
# Item record for id 6066 (presumably the item_id of the outlier row shown
# above - verify against that output).
items.loc[items['item_id'] == 6066]

In [None]:
# Category record for id 75 (presumably the category of the outlier item -
# verify against the item record shown above).
item_categories.loc[item_categories['item_category_id'] == 75]

In [None]:
# Shop record for id 12 (presumably the shop of the outlier sale - verify
# against the outlier row shown above).
shops.loc[shops['shop_id'] == 12]

**Make things ready for further analysis**

In [None]:
# Work on a copy: plain assignment only creates a second name for the same
# DataFrame, so adding the columns below would also modify sales_train.
sales_train_sub = sales_train.copy()

# `date` was converted to datetime earlier, so the .dt accessor applies.
sales_train_sub['month'] = sales_train_sub['date'].dt.month
sales_train_sub['year'] = sales_train_sub['date'].dt.year
sales_train_sub.head(10)

In [None]:
# Total units sold per (date_block_num, shop_id, item_id).
# as_index=False keeps the group keys as ordinary columns, so there is no need
# to copy them out of the index, reset it in place and reorder afterwards.
# The total keeps the name item_cnt_day because later cells read that column.
monthly_sales = (
    sales_train_sub
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)['item_cnt_day']
    .sum()
)
monthly_sales.head(10)

# Approach 1: Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [None]:
# Separate the predictor columns from the value to be predicted.
feature_cols = ['shop_id', 'date_block_num', 'item_id']
X = monthly_sales.loc[:, feature_cols]  # predictors
y = monthly_sales['item_cnt_day']  # target

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

### Building Decision Tree Model

In [None]:
# Create the Decision Tree classifier. A fixed random_state makes the fitted
# tree repeatable across runs (scikit-learn permutes the features at each split
# even with the default splitter) and matches the seeded regressor further down.
clf = DecisionTreeClassifier(random_state=0)

# Train the Decision Tree classifier
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

In [None]:
# Share of held-out rows whose predicted value matches the true one exactly.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

### Optimizing Decision Tree Performance


    criterion : optional (default=”gini”) or Choose attribute selection measure: This parameter selects the attribute selection measure used to evaluate splits. Supported criteria are “gini” for the Gini index and “entropy” for the information gain.

    splitter : string, optional (default=”best”) or Split Strategy: This parameter allows us to choose the split strategy. Supported strategies are “best” to choose the best split and “random” to choose the best random split.

    max_depth : int or None, optional (default=None) or Maximum Depth of a Tree: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. A higher maximum depth tends to cause overfitting, and a lower value tends to cause underfitting.
    Source: [https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html).


In [None]:
# Create a shallower Decision Tree classifier that splits on information gain.
# A fixed random_state makes the fitted tree repeatable across runs and matches
# the seeded regressor further down.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)

# Train the Decision Tree classifier
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Approach 2: Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Same feature columns and target as before, taken from the full monthly table
# (no hold-out split here).
X2 = monthly_sales.loc[:, feature_cols]
Y2 = monthly_sales['item_cnt_day']

# Fit a decision tree regressor; the seed keeps the fitted tree repeatable.
model_DTR = DecisionTreeRegressor(random_state=0)
model_DTR.fit(X2, Y2)



In [None]:
# Build the prediction inputs on an explicit copy so `test` is left untouched.
X2_test_DTR = test[['shop_id', 'item_id']].copy()
# The month index must be numeric like the training column. The string '34'
# would create an object-dtype column that only works if the model happens to
# coerce it. Position 1 reproduces the training column order in feature_cols.
X2_test_DTR.insert(loc=1, column='date_block_num', value=34)

# Attach the predictions as a named column in one step, instead of joining an
# unnamed frame and renaming every column afterwards.
predicted_raw_DTR = X2_test_DTR.assign(item_cnt=model_DTR.predict(X2_test_DTR))
predicted_DTR = predicted_raw_DTR.reindex(['shop_id', 'date_block_num', 'item_id', 'item_cnt'], axis=1)


In [None]:
# Preview the first 20 predictions.
predicted_DTR.head(20)