### Loading libraries

In [None]:
%%time
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

### Loading datasets

In [None]:
%%time
import os

# Show which files are available in the input directory.
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

# Read the training and test splits into DataFrames in one pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)



### Missing values check

#### Training data

In [None]:
# Count missing values per column in the training data.
df_train.isnull().sum()

In [None]:
# Preview the first two training rows.
df_train.head(2)

In [None]:
# Count missing values per column in the test data.
df_test.isnull().sum()

In [None]:
# Preview the first two test rows.
df_test.head(2)

### Histogram

#### Training set

In [None]:
# Draw one histogram per column of the training frame.
df_train.hist()

#### Testing set

In [None]:
# Draw one histogram per column of the test frame.
df_test.hist()

### Distribution of sales - for each item, date and store

In [None]:
# Seaborn distribution plot of the raw `sales` column on a wide figure.
plt.figure(figsize=(12, 5))
plt.title("Distribution of sales - for each item, date and store")
sales_column = df_train['sales']
ax = sns.distplot(sales_column)

### Date splitting

#### Training data

In [None]:
# Parse the YYYY-MM-DD date strings into datetime values.
df_train['date'] = pd.to_datetime(df_train['date'], format='%Y-%m-%d')

In [None]:
# Split the parsed date into separate calendar-component columns.
train_dates = df_train['date'].dt
df_train['year'] = train_dates.year
df_train['month'] = train_dates.month
df_train['day'] = train_dates.day

#### Testing data

In [None]:

# Parse the test dates, then split them into year / month / day columns,
# mirroring the training-set preparation.
df_test['date'] = pd.to_datetime(df_test['date'], format='%Y-%m-%d')
test_dates = df_test['date'].dt
df_test['year'] = test_dates.year
df_test['month'] = test_dates.month
df_test['day'] = test_dates.day

### Converting into categorical columns

In [None]:
# Cast the date parts, the identifiers and the `sales` target to the pandas
# categorical dtype, one column at a time.
for column in ('year', 'month', 'day', 'store', 'item', 'sales'):
    df_train[column] = df_train[column].astype('category')

In [None]:
# Remove the raw `date` column from the training frame.
# `columns=` already names the axis, so the redundant `axis=1` (which pandas
# ignores when `columns` is given) is dropped to avoid mixing both call styles.
df_train = df_train.drop(columns='date')

In [None]:
# Inspect the dtype of each remaining training column.
df_train.dtypes

In [None]:
# Separate the target from the features.
# `y` is kept as a one-column DataFrame (same index as `df_train`), selected
# directly instead of being built through an empty DataFrame.
y = df_train[['sales']].copy()
# `columns=` already names the axis; the redundant `axis=1` is omitted.
df_train = df_train.drop(columns='sales')

In [None]:
# Preview the feature frame after the target has been removed.
df_train.head(2)

### Modelling

#### Modelling with a random forest classifier

In [None]:
# Random forest with trees capped at depth 2 and a fixed random seed.
clf = RandomForestClassifier(max_depth=2, random_state=0) 

In [None]:
# Model inputs: the first five columns of the feature frame, by position.
x = df_train.iloc[:, :5]

In [None]:
# Train the random forest.
# `y` is a one-column DataFrame, but scikit-learn expects a 1-D target of
# shape (n_samples,) and emits a DataConversionWarning for column vectors,
# so the target is flattened before fitting.
clf = clf.fit(x, y.values.ravel())

### Feature importance

In [None]:
# Print each feature column next to its fitted importance score.
name_score_pairs = zip(df_train.columns, clf.feature_importances_)
for name, importance in name_score_pairs:
    print(name, "=", importance)

    

In [None]:
# Horizontal bar chart of the feature importances, ordered ascending by score.
features = df_train.columns
importances = clf.feature_importances_
indices = importances.argsort()
bar_positions = range(len(indices))

plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='g')
plt.yticks(bar_positions, features[indices])
plt.xlabel('Relative Importance')
plt.show()


### Sales distributions

In [None]:

# Reload an untouched copy of the training data for exploratory plots.
df_train1 = pd.read_csv('../input/train.csv')
# Bar chart of how often each distinct sales value occurs.
# Uses the Series method rather than the deprecated top-level
# `pd.value_counts`, matching the `.value_counts()` call used for `store`.
df_train1['sales'].value_counts().plot(kind='bar', title='Sales distribution');

In [None]:
# Preview the first two rows of the reloaded training data.
df_train1.head(2)

In [None]:
# Convert `date` to datetime, then expose year / month / day as columns.
df_train1['date'] = pd.to_datetime(df_train1['date'], format='%Y-%m-%d')
parsed_dates = df_train1['date'].dt
df_train1['year'] = parsed_dates.year
df_train1['month'] = parsed_dates.month
df_train1['day'] = parsed_dates.day

#### pivot_table

In [None]:
# Let us understand the sales data distribution across the stores
store_df = df_train1.copy()
# Mean sales for every (store, item) pair: stores as rows, items as columns.
# Only `sales` is aggregated: averaging the `date` column is not meaningful
# here and, depending on the pandas version, is either silently dropped or
# adds a block of mean timestamps. `values` stays a list so the columns keep
# their ('sales', item) two-level layout. The string 'mean' replaces
# `np.mean`, which recent pandas versions warn about as an aggfunc.
sales_pivoted_df = pd.pivot_table(
    store_df,
    index='store',
    columns='item',
    values=['sales'],
    aggfunc='mean',
)
# Pivoted dataframe
display(sales_pivoted_df)

### Pivot_table distribution

In [None]:
# Mean sales per (store, item), plotted as one line per item across stores.
# Only `sales` is aggregated: a mean of the `date` column does not belong in
# a sales plot (and, depending on the pandas version, may add timestamp
# columns to it). The string 'mean' replaces `np.mean`, which recent pandas
# versions warn about as an aggfunc.
sales_pivoted_df = pd.pivot_table(
    store_df,
    index='store',
    columns='item',
    values=['sales'],
    aggfunc='mean',
)
sales_pivoted_df.plot(figsize=(12, 12));

### Pivot table month distributions

In [None]:
# Sales aggregated with pivot_table's default aggfunc: day of month as rows,
# month as columns.
pivoted = df_train1.pivot_table(index='day', columns='month', values='sales')
pivoted

In [None]:
# Rebuild the day-by-month sales pivot and plot one line per month.
pivoted = df_train1.pivot_table(index='day', columns='month', values='sales')
pivoted.plot(figsize=(12, 12));

### Histogram - Sales distribution

In [None]:
# 100-bin histogram of the individual sales values.
df_train1['sales'].hist(bins=100, figsize=(14, 3))
plt.xlabel('Sales distribution')
plt.title('Histogram');

In [None]:
# Histogram of the `store` column (how many rows each store contributes).
df_train1['store'].hist(bins=100, figsize=(14, 3))
# The x-axis shows store ids, not sales; the original label was copied from
# the sales histogram.
plt.xlabel('Store')
plt.title('Histogram');

### Unique value counts - store

In [None]:
# Number of rows recorded for each distinct store.
df_train1['store'].value_counts()

### Moving average

In [None]:


# Read the training data with `date` parsed as datetime and used as the index.
df_raw = pd.read_csv('../input/train.csv', parse_dates=['date'], index_col=['date'])
#loading training data

In [None]:
df_raw.head() # display the first rows of the date-indexed training data

###### Dropping the store and item columns

In [None]:
# Keep an independent copy of the date-indexed frame without the
# `store` and `item` columns.
date_sales = df_raw.drop(columns=['store', 'item']).copy()

##### Checking the type of every column

In [None]:
# Count how many columns there are of each dtype.
# `DataFrame.get_ftype_counts()` was deprecated in pandas 0.23 and removed in
# 1.0; `dtypes.value_counts()` is the documented replacement.
date_sales.dtypes.value_counts()

In [None]:
# Preview the date-indexed sales frame.
date_sales.head()

### Monthly average sales (resampled to month start)

In [None]:
# Average sales per calendar month, labelled by the month's first day.
# NOTE(review): this rebinds `y`, which earlier in the notebook held the
# model target; re-running the training cell after this one would use the
# monthly series instead.
y = date_sales['sales'].resample('MS').mean() #MS-month starting date


In [None]:
# Show the monthly averages from 2013 onwards.
y['2013':]

In [None]:
# Line plot of the monthly average sales over time.
y.plot(figsize=(16, 5))
# The x-axis is the date index and the y-axis the averaged sales; the
# original code put the 'Sales' label on the x-axis.
plt.xlabel('Date')
plt.ylabel('Sales')
plt.title('Month average sales');

### Average sales per store

In [None]:
# Read the training data again, parsing `date` as datetime (default index kept).
df_tr1= pd.read_csv('../input/train.csv', parse_dates=['date']) #loading training data

In [None]:
df_tr1.head() #displaying head of training data

###### Splitting date column

In [None]:
# Ensure `date` is datetime, then split it into year / month / day columns.
df_tr1['date'] = pd.to_datetime(df_tr1['date'], format='%Y-%m-%d')
tr1_dates = df_tr1['date'].dt
df_tr1['year'] = tr1_dates.year
df_tr1['month'] = tr1_dates.month
df_tr1['day'] = tr1_dates.day

In [None]:
# Mean sales per store, drawn as a line across store ids.
store_means = df_tr1.groupby('store')['sales'].mean()
store_means.plot(figsize=(16, 5));
plt.xlabel('Store names ')
plt.title('Total sales average store with respect to stores');

### Sales with respect to year

In [None]:
# Plot every sales observation against its year.
plt.plot(df_tr1['year'], df_tr1['sales'], color='lightgreen')
plt.title('Sales with respect to year')
plt.ylabel('Sales')
plt.show()

In [None]:
# Line plot of every sales observation in row order.
sales = df_tr1[['sales']]
sales.plot(figsize=(20, 10), linewidth=5, fontsize=20)
plt.xlabel('Year ', fontsize=20)
plt.ylabel('Sales', fontsize=20)
# Finish this figure before starting the next one.
plt.show()

# Kernel density estimate of the sales values on its own figure.
# A Series plot without an explicit `ax` reuses the current axes when a
# figure is open, so without a fresh figure the KDE would be drawn on top of
# the line plot above, whose axis ranges are entirely different.
plt.figure()
df_tr1.sales.plot(style='m.', kind='kde')
plt.show()

### Monthly sales distribution

In [None]:
# Scatter of sales against month; the regression fit is disabled (fit_reg=False).
sns.lmplot(x='month', y='sales', fit_reg=False, data=df_tr1);