In [None]:
import pandas as pd

In [None]:
# Shell escape: list the parent directory (presumably to locate the ../input folder read below).
!ls ..

### Load the data

In [None]:
# Read the three CSV inputs in a fixed order: training rows, rows to predict,
# and the contents of store.csv.
df, test, extraInfo = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train.csv',
        '../input/test.csv',
        '../input/store.csv',
    )
)

#### Sneak peek

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [None]:
# Preview the frame loaded from store.csv.
extraInfo.head()

Remove Sales & Customers since those are missing from the test set (and since Customers is directly related to Sales)

In [None]:
def _inp(dataframe):
    return dataframe.drop(columns=['Sales', 'Customers'])
# Render the input columns of the training frame as a quick visual check.
_inp(df)

In [None]:
def _out(dataframe):
    return dataframe['Sales']
# Render the target column of the training frame as a quick visual check.
_out(df)

#### Drop closed shops: there is no need to model them, since their sales are simply set to 0

In [None]:
# Keep only the rows whose Open flag is not 0, i.e. drop the closed-shop rows.
df = df.loc[df['Open'].ne(0)]

In [None]:
# (rows, columns) of the training frame after the closed-shop rows were dropped.
df.shape

In [None]:
# Missing-value count per column as a list of (column, count) pairs.
# isna().sum() does the counting in vectorized form instead of iterating over
# every row with Python's built-in sum().
list(df.isna().sum().items())

#### Split the Date into smaller components

In [None]:
def splitDate(dataframe):
    """Return a copy of ``dataframe`` with the Date string split into three columns.

    ``Date`` is split on ``'-'`` (at most two splits) and the pieces are
    appended as ``date_year``, ``date_month`` and ``date_day``.  The pieces are
    kept as strings exactly as they appear in ``Date``; no numeric conversion
    happens here.  The original ``Date`` column is kept and the argument is not
    modified.

    Only the freshly split pieces are renamed, so any existing columns labelled
    0, 1 or 2 in ``dataframe`` keep their names.  If ``dataframe`` already has
    ``date_*`` columns (e.g. the cell was re-run on an already-split frame)
    they are replaced instead of duplicated.
    """
    date_parts = (
        dataframe['Date']
        .str.split('-', n=2, expand=True)
        .rename(columns={0: 'date_year', 1: 'date_month', 2: 'date_day'})
    )
    # Drop stale date_* columns first so a second call stays idempotent.
    without_parts = dataframe.drop(columns=list(date_parts.columns), errors='ignore')
    return pd.concat([without_parts, date_parts], axis=1, sort=False)

In [None]:
from sklearn.preprocessing import LabelEncoder

def labelize(dataframe, column='StateHoliday'):
    """Return a copy of ``dataframe`` with ``column`` replaced by integer labels.

    The column is cast to ``str`` before encoding, so values such as ``0`` and
    ``'0'`` end up with the same label.  The encoding is written to a copy, so
    the caller's frame is left untouched.

    NOTE(review): a new LabelEncoder is fitted on every call, so the integer
    given to a value depends on which values occur in that particular frame.
    Confirm that the train and test frames end up with matching codes.
    """
    labelizer = LabelEncoder()
    encoded = dataframe.copy()
    encoded[column] = labelizer.fit_transform(dataframe[column].astype('str'))
    return encoded

In [None]:
def formatDataframe(dataframe):
    """Apply the shared preprocessing: split Date, then label-encode StateHoliday.

    Returns the frame produced by ``labelize(splitDate(dataframe))``.
    """
    return labelize(splitDate(dataframe))

In [None]:
# Run the shared preprocessing (date split + StateHoliday encoding) on the training frame.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-formatted data back in.
df = formatDataframe(df)
df.head()

In [None]:
def _inp(dataframe):
    return dataframe.drop(columns=['Sales', 'Customers', 'Date']).astype('float64')
# Render the all-float feature matrix of the training frame as a quick visual check.
_inp(df)

#### Split data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows, then carve 15% of that hold-out into df_test.
# A fixed random_state makes both shuffles identical across kernel restarts,
# so the validation score below is reproducible.
# NOTE(review): df_test is not used by any later cell visible in this notebook.
df_train, df_validation = train_test_split(df, test_size=0.3, random_state=42)
df_validation, df_test = train_test_split(df_validation, test_size=0.15, random_state=42)

# Basic model

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
# Baseline: standardize the features, expand them with PolynomialFeatures
# (library defaults), then fit an ordinary linear regression on the result.
m1 = Pipeline(steps=[
    ('normalizer', StandardScaler()),
    ('poli-features', PolynomialFeatures()),
    ('linear-model', LinearRegression()),
]).fit(_inp(df_train), _out(df_train))

# Score the fitted pipeline on the validation rows; shown as the cell output.
m1.score(_inp(df_validation), _out(df_validation))

# Estimate

Format the test data

In [None]:
# Run the same preprocessing (date split + StateHoliday encoding) on the test frame.
# NOTE(review): this rebinds `test`, so re-running the cell feeds already-formatted data back in.
test = formatDataframe(test)
test.head()

In [None]:
# Missing-value count per test column as a list of (column, count) pairs.
# isna().sum() does the counting in vectorized form instead of iterating over
# every row with Python's built-in sum().
list(test.isna().sum().items())

In [None]:
# Replace every missing value in the test frame with 1.
# Note that this applies to all columns, not just one of them.
test = test.fillna(value=1)

In [None]:
# Build the test feature matrix the same way `_inp` does for training data:
# drop the non-feature columns (here Id and the raw Date string) and cast to float64.
predictions = m1.predict(
    test.drop(['Id', 'Date'], axis=1).astype('float64')
)

In [None]:
# Shape of the prediction array (expected to be one value per test row).
predictions.shape

In [None]:
# Peek at the first few predicted values.
pd.DataFrame(predictions).head()

# Construct the submission file

In [None]:
# The two test columns the submission is built from: Id and the Open flag.
test[['Id', 'Open']].head()

In [None]:
# Build the submission frame in one step. `assign` returns a new DataFrame, so
# nothing is written into a selection of `test` (no SettingWithCopyWarning),
# and the prediction array is attached by position instead of being wrapped in
# a DataFrame and aligned on the index.
final_predictions = test[['Id', 'Open']].assign(Sales=predictions)

In [None]:
# Closed shops (Open == 0) get a hard-coded prediction of 0 sales.
final_predictions.loc[final_predictions['Open'].eq(0), 'Sales'] = 0

In [None]:
# Write only the Id and Sales columns, without the index, as the submission file.
final_predictions.to_csv('predictions.csv', columns=['Id', 'Sales'], index=False)