# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns
import random
import csv

# Import Data

In [None]:
# Shell escape: list the files in the current working directory.
!ls

In [None]:
# Load the CSV inputs from the current working directory.
dataStores = pd.read_csv('stores.csv')
dataItems = pd.read_csv('items.csv')
dataHolidays = pd.read_csv('holidays_events.csv')
dataTransactions = pd.read_csv('transactions.csv')

# NOTE: dataTrain is read from test.csv here (not train.csv), so the
# exploratory cells that use dataTrain operate on the test rows.
# The file is read twice so dataTrain and dataTest are independent frames.
dataTrain = pd.read_csv('test.csv')
dataTest = pd.read_csv('test.csv')

# Analyzing annual transactions 

In [None]:
# Placeholder target column on the test frame.
dataTest['unit_sales'] = 0

# Split each 'date' string on '-': piece 0 is the year, piece 1 is the month.
dateParts = dataTransactions['date'].str.split('-')
dataTransactions['year'] = dateParts.str[0].astype('int64')
dataTransactions['month'] = dateParts.str[1].astype('int64')

In [None]:
# Preview the first rows of dataTransactions, including the derived year and month columns.
dataTransactions.head()

In [None]:
# Total transactions per year, as a two-column frame: year, transactions.
yearlyTotals = dataTransactions.groupby('year')['transactions'].sum()
transactionsYear = yearlyTotals.reset_index()

In [None]:
# Display the yearly transaction totals.
transactionsYear

In [None]:
# Bar chart of total transactions per year.
# catplot/height replace factorplot/size: seaborn deprecated the old names
# in 0.9 and current releases no longer provide them.
sns.set_style("darkgrid")
ax = sns.catplot(
    x="year",
    y="transactions",
    data=transactionsYear,
    kind="bar",
    height=5,
    aspect=1.5,
)
plt.show()

# Linear sales regression

#### Linear regression is a statistical technique used to study the relationship between variables and to predict their behavior. In this exercise, the number of transactions is related to the period number. Because the analysis is monthly, there are 56 periods in total.

In [None]:
# Total transactions per (year, month), one row per month in chronological order.
monthlyTotals = dataTransactions.groupby(['year', 'month'])['transactions'].sum()
transactionsMont = monthlyTotals.reset_index()
# Number the months 1..N; after reset_index() the row index is 0..N-1.
transactionsMont['period'] = transactionsMont.index + 1

In [None]:
# Number of periods (months) available for computing the regression.
transactionsMont['period'].max()

#### There is a close relationship between the period and the number of transactions. There is also a noticeable change in this relationship in the month of December.


In [None]:
# Linear regression of transactions against the period number.
# `height` replaces the `size` argument, which seaborn renamed in 0.9.
g = sns.lmplot(x="period", y="transactions", data=transactionsMont, height=7)
plt.show()

### When the transactions are broken down by year, an upward trend is observed

In [None]:
# Linear regression of transactions against the period number, one fit per year.
# `height` replaces the `size` argument, which seaborn renamed in 0.9.
g = sns.lmplot(
    x="period",
    y="transactions",
    hue="year",
    data=transactionsMont,
    height=7,
)
plt.show()

# Sales by state

In [None]:
# Join the data: attach the store attributes to every transactions row.
storesByNbr = dataStores.set_index('store_nbr')
dataTrainState = dataTransactions.join(storesByNbr, on='store_nbr')
# reset_index() keeps the previous row index as an extra 'index' column.
dataTrainState = dataTrainState.reset_index()
dataTrainState.head()

In [None]:
# Total transactions per (year, state).
dataTrainStateCount = (
    dataTrainState
    .groupby(['year', 'state'])['transactions']
    .sum()
    .reset_index()
)
dataTrainStateCount.head()

### Transactions are grouped by province (state) to determine each province's impact on sales

In [None]:
# One bar chart of yearly transactions per state, three panels per row.
# catplot/height replace factorplot/size: seaborn deprecated the old names
# in 0.9 and current releases no longer provide them.
g = sns.catplot(
    x="year",
    y="transactions",
    col="state",
    data=dataTrainStateCount,
    kind="bar",
    height=2.7,
    col_wrap=3,
)
plt.show()

In [None]:
# Yearly transactions with one line per state.
# factorplot drew a point plot by default; catplot defaults to a strip plot,
# so kind="point" is passed explicitly to keep the same chart.
# `height` replaces the `size` argument, which seaborn renamed in 0.9.
g = sns.catplot(
    x="year",
    y="transactions",
    hue="state",
    data=dataTrainStateCount,
    kind="point",
    height=6,
)
plt.show()

### Pichincha is one of the provinces with the greatest influence; despite that, the variability across provinces is very similar

In [None]:
# Linear regression of transactions against month, one fit per state.
# `height` replaces the `size` argument, which seaborn renamed in 0.9.
g = sns.lmplot(
    x="month",
    y="transactions",
    hue="state",
    data=dataTrainState,
    height=7,
)
plt.show()

# Sales by stores

In [None]:
# Total transactions per store, largest total first.
storeTotals = dataTransactions.groupby('store_nbr')['transactions'].sum()
dataTrainStorade = storeTotals.reset_index().sort_values('transactions', ascending=False)
dataTrainStorade.head()

### Ranking of total transactions by store

In [None]:
# Horizontal bar chart of total transactions per store; `order` keeps the
# bars in the descending order of dataTrainStorade.
# catplot/height replace factorplot/size: seaborn deprecated the old names
# in 0.9 and current releases no longer provide them.
g = sns.catplot(
    x="transactions",
    y="store_nbr",
    data=dataTrainStorade,
    kind="bar",
    orient="h",
    order=dataTrainStorade.store_nbr,
    height=10,
)
plt.show()

# Holiday sales

In [None]:
# Join the data: attach the holiday/event columns to each transactions row by date.
holidaysByDate = dataHolidays.set_index('date')
transactionsWithHolidays = dataTransactions.join(holidaysByDate, on='date').reset_index()
# Total transactions per holiday description.
dataTrainHoliday = (
    transactionsWithHolidays
    .groupby(['description'])['transactions']
    .sum()
    .reset_index()
)
# Smallest per-description total.
dataTrainHoliday['transactions'].min()

In [None]:
# Bar chart of total transactions per holiday description.
# catplot/height replace factorplot/size: seaborn deprecated the old names
# in 0.9 and current releases no longer provide them.
g = sns.catplot(
    x="transactions",
    y="description",
    data=dataTrainHoliday,
    kind="bar",
    height=20,
)
plt.show()

# Product analysis

In [None]:
# Attach the item attributes to every dataTrain row by item_nbr.
itemsByNbr = dataItems.set_index('item_nbr')
dataItemFamily = dataTrain.join(itemsByNbr, on='item_nbr')
# reset_index() keeps the previous row index as an extra 'index' column.
dataItemFamily = dataItemFamily.reset_index()
dataItemFamily.head()

In [None]:
# Largest value in the 'id' column of the joined frame.
dataItemFamily['id'].max()

In [None]:
# Count rows per (family, store_nbr); the count is stored in a 'transactions' column.
familyStoreCounts = dataItemFamily.groupby(['family', 'store_nbr']).size()
dataItemFamilyStore = familyStoreCounts.reset_index(name='transactions')
# Attach the store attributes; the trailing reset_index() keeps the old row
# index as an extra 'index' column.
storeAttributes = dataStores.set_index('store_nbr')
dataItemFamilyStore = dataItemFamilyStore.join(storeAttributes, on='store_nbr').reset_index()
dataItemFamilyStore.head()

In [None]:
# Number of rows in the per-(family, store_nbr) frame.
len(dataItemFamilyStore)

In [None]:
### Each store is related to its product families and their transaction counts

In [None]:
# One bar chart of per-family counts for each store, three panels per row.
# catplot/height replace factorplot/size: seaborn deprecated the old names
# in 0.9 and current releases no longer provide them.
g = sns.catplot(
    x="transactions",
    y="family",
    col="store_nbr",
    data=dataItemFamilyStore,
    kind="bar",
    height=5,
    col_wrap=3,
)
plt.show()

In [None]:
# Sum the per-store counts up to (state, family).
dataItemFamilyCity = (
    dataItemFamilyStore
    .groupby(['state', 'family'])['transactions']
    .sum()
    .reset_index()
)
dataItemFamilyCity.head()

### The product is related to the region to determine consumption by province

In [None]:
# One bar chart of per-state counts for each product family, three panels per row.
# catplot/height replace factorplot/size: seaborn deprecated the old names
# in 0.9 and current releases no longer provide them.
g = sns.catplot(
    x="transactions",
    y="state",
    col="family",
    data=dataItemFamilyCity,
    kind="bar",
    height=5,
    col_wrap=3,
)
plt.show()

# Generation of predictions

In [None]:
# Column dtypes applied when reading train.csv and test.csv;
# onpromotion is kept as text.
dtypes = dict(id='int64', item_nbr='int32', store_nbr='int8', onpromotion=str)
dataTrain, dataTest = (
    pd.read_csv(csvPath, dtype=dtypes)
    for csvPath in ('../input/train.csv', '../input/test.csv')
)
# Placeholder target column on the test frame.
dataTest['unit_sales'] = 0

In [None]:
# graficos embebidos
%matplotlib inline
dataTrain['day'] = dataTrain['date'].apply(lambda row: int(row.split('-')[2]))
dataTrain['date'] = dataTrain['date'].apply(lambda row: int(row.split('-')[1]))
dataTrain['onpromotion'] = dataTrain['onpromotion'].apply(lambda row: 1 if row == True else 0)
dataTrain.head()

In [None]:
# Columns passed to the regression as predictors.
# 'date' holds the month number once the preprocessing cells overwrite it.
feature_cols = [
    'date',
    'store_nbr',
    'item_nbr',
    'day',
]

## Ajustando el modelo

In [None]:
from sklearn.linear_model import LinearRegression

# Create the model.
rl = LinearRegression()
# Fit the model: X is the feature columns, y is unit_sales.
rl.fit(
    dataTrain[feature_cols].values,
    dataTrain['unit_sales'].values,
)

#### Generando predicciones

In [None]:
# onpromotion is read with dtype=str, so a promoted row holds the text 'True'
# and never equals the boolean True. Match the text (and a real boolean, in
# case the column was read without the dtype); everything else, including
# missing values, becomes 0.
dataTest['onpromotion'] = dataTest['onpromotion'].isin(['True', True]).astype('int64')
# 'day' must be extracted first: the next line overwrites 'date' with the month number.
dataTest['day'] = dataTest['date'].apply(lambda row: int(row.split('-')[2]))
dataTest['date'] = dataTest['date'].apply(lambda row: int(row.split('-')[1]))

In [None]:
# Predict unit_sales for every test row.
predicciones = rl.predict(dataTest[feature_cols].values)
# Pair each test id with its prediction rounded to the nearest integer.
predicciones_df = dataTest[['id']].copy()
predicciones_df['unit_sales'] = np.round(predicciones)
predicciones_df.head()  # first 5 rows of predictions

In [None]:
# Fitted intercept on the first line, coefficients on the second.
print(rl.intercept_, rl.coef_, sep='\n')

In [None]:
# Required rows: 3370464
# NOTE(review): this counts the rows of dataTrain; confirm whether the check
# was meant for dataTest / predicciones_df instead.
len(dataTrain['day'])

In [None]:
# Write the predictions without the row index, floats with two decimals.
predicciones_df.to_csv(
    'predicciones.csv',
    float_format='%.2f',
    index=False,
)

# Conclusions
#### After analyzing the relevant data, the following conclusions were reached:
#### 1) The average number of annual transactions is 30295802.6
#### 2) The year with the most sales was 2015, with 31312370 transactions
#### 3) The total number of sales for 2017 is expected to be approximately 30000204
#### 4) The province with the highest number of sales is Pichincha, while the one with the lowest is Pastaza
#### 5) The day with the most sales is Carnival, with 773458 transactions recorded to date
#### 6) The day with the fewest sales is the first day of the year, with 5941 transactions recorded to date
#### 7) The product family with the most sales records is GROCERY I
#### 8) The store with the most recorded sales is store 44, located in Pichincha, with 6201115 transactions recorded to date