In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
# Read train, stores and features data
test = pd.read_csv('data/test.csv')
stores = pd.read_csv('data/stores.csv')
features = pd.read_csv('data/features.csv')
train = pd.read_csv('data/train.csv')
sampleSubmission = pd.read_csv('data/sampleSubmission.csv')

In [3]:
# Percentage of 'test' data
((test.count()[1])/((test.count()[1])+(train.count()[1])))*100

21.441802047578051

#### Exploring test, stores and features data 

In [4]:
test.head()

Unnamed: 0,Store,Dept,Date,IsHoliday
0,1,1,2012-11-02,False
1,1,1,2012-11-09,False
2,1,1,2012-11-16,False
3,1,1,2012-11-23,True
4,1,1,2012-11-30,False


In [5]:
stores.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [6]:
features.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


#### Merging data from 'train' and 'stores'

In [7]:
sample1 = pd.merge(train, stores)

In [8]:
sample1.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size
0,1,1,2010-02-05,24924.5,False,A,151315
1,1,1,2010-02-12,46039.49,True,A,151315
2,1,1,2010-02-19,41595.55,False,A,151315
3,1,1,2010-02-26,19403.54,False,A,151315
4,1,1,2010-03-05,21827.9,False,A,151315


In [48]:
sample2 = pd.merge(sample1, features, how='left', left_on=['Store', 'Date'])

TypeError: object of type 'NoneType' has no len()

#### Unable to fix the error. Hence, concatinating two columns in 'features' & 'sample1' and merging data

In [9]:
# Concatenating 'store' and 'date' columns in 'features' dataframe
features['sd-concat'] = features['Store'].map(str)+'-'+features['Date'].map(str)

In [10]:
features.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,sd-concat
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,1-2010-02-05
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True,1-2010-02-12
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False,1-2010-02-19
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False,1-2010-02-26
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False,1-2010-03-05


In [11]:
# Concatenating 'store' and 'date' columns in 'features' dataframe
sample1['sd-concat'] = sample1['Store'].map(str)+'-'+sample1['Date'].map(str)

In [12]:
sample1.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,sd-concat
0,1,1,2010-02-05,24924.5,False,A,151315,1-2010-02-05
1,1,1,2010-02-12,46039.49,True,A,151315,1-2010-02-12
2,1,1,2010-02-19,41595.55,False,A,151315,1-2010-02-19
3,1,1,2010-02-26,19403.54,False,A,151315,1-2010-02-26
4,1,1,2010-03-05,21827.9,False,A,151315,1-2010-03-05


#### Merging all the data into one dataframe

In [13]:
sample2 = pd.merge(sample1, features)

In [14]:
sample2

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,sd-concat,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.50,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
1,1,2,2010-02-05,50605.27,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
2,1,3,2010-02-05,13740.12,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
3,1,4,2010-02-05,39954.04,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
4,1,5,2010-02-05,32229.38,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
5,1,6,2010-02-05,5749.03,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
6,1,7,2010-02-05,21084.08,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
7,1,8,2010-02-05,40129.01,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
8,1,9,2010-02-05,16930.99,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
9,1,10,2010-02-05,30721.50,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106


#### Merging 'test' data similarly

In [15]:
sample3 = pd.merge(test, stores)
sample3.head()

Unnamed: 0,Store,Dept,Date,IsHoliday,Type,Size
0,1,1,2012-11-02,False,A,151315
1,1,1,2012-11-09,False,A,151315
2,1,1,2012-11-16,False,A,151315
3,1,1,2012-11-23,True,A,151315
4,1,1,2012-11-30,False,A,151315


In [16]:
sample3['sd-concat'] = sample3['Store'].map(str)+'-'+sample3['Date'].map(str)

In [17]:
sample4 = pd.merge(sample3, features)

In [18]:
sample4

Unnamed: 0,Store,Dept,Date,IsHoliday,Type,Size,sd-concat,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
1,1,2,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
2,1,3,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
3,1,4,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
4,1,5,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
5,1,6,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
6,1,7,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
7,1,8,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
8,1,9,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573
9,1,10,2012-11-02,False,A,151315,1-2012-11-02,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573


### sample2 (train data); sample4 (test data)

Date is string in 'test'

In [24]:
df_train = sample2.copy(deep=True)
df_test = sample4.copy(deep=True)

In [30]:
df_train['Just_Date'] = pd.to_datetime(df_train['Date'], errors='coerce')

In [32]:
type(df_train['Just_Date'][1])

pandas._libs.tslib.Timestamp

In [40]:
df_train['Just_Date'] = pd.to_datetime(df_train['Date'], format = '%Y%m%d %H:%M:%S.%f')

In [42]:
df_train['Just_Date'][1]

Timestamp('2010-02-05 00:00:00')

In [44]:
df_train['Date'] = datetime.strptime(df_train['Just_Date'], "%d%m%Y")

TypeError: strptime() argument 1 must be str, not Series

In [52]:
df_train['New_Date'] = df_train['Just_Date'].apply(lambda x: datetime.strptime(x, "%d%m%Y"))

TypeError: strptime() argument 1 must be str, not Timestamp

In [49]:
new_train = sample2.copy(deep=True)

In [50]:
new_train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,sd-concat,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
1,1,2,2010-02-05,50605.27,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
2,1,3,2010-02-05,13740.12,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
3,1,4,2010-02-05,39954.04,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106
4,1,5,2010-02-05,32229.38,False,A,151315,1-2010-02-05,42.31,2.572,,,,,,211.096358,8.106


In [60]:
df_train['New_Date'] = df_train['Date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))

In [62]:
df_train['New_Date'].head()

0   2010-02-05
1   2010-02-05
2   2010-02-05
3   2010-02-05
4   2010-02-05
Name: New_Date, dtype: datetime64[ns]

In [57]:
from dateutil.parser import parse
a = parse(df_train['Date'][1]).date().isoformat()

In [58]:
type(a)

str

In [59]:
pd.to_datetime()

str