In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_excel('datasets/gold_trade.xlsx')

In [45]:
df = df.rename(index=str, columns={u"روز":"day", u"ماه":"month", u"سال":"year", 
                              u"شماره فروشگاه":"store_num", u"تعداد خرید روزانه":"daily_purchase",
                              u"تعداد فروش روزانه":"daily_sale", u"\u0642\u06cc\u0645\u062a \u062e\u0631\u06cc\u062f \u0647\u0631 \u0648\u0627\u062d\u062f ":"unit_price"})

In [46]:
def check_isnull(dataframe, columns):
    df = pd.isnull(dataframe)
    for column in columns:
        if len(df.groupby(column).size()) == 1:
            yield {column: False}
        else:
            yield {column: True}

for col in check_isnull(df, df.columns):
    print(col)

{'day': False}
{'month': False}
{'year': False}
{'store_num': False}
{'daily_purchase': False}
{'daily_sale': False}
{'unit_price': False}


In [47]:
def map_date(x, mapped_values):
    key = '/'.join([str(x['day']), str(x['month']), str(x['year'])])
    return mapped_values[key]


def convert_to_date(df):
    temp_df = df[['day', 'month', 'year']].drop_duplicates()
    temp_df.reset_index(inplace=True, drop=True)
    mapped_date_values = temp_df.T.to_dict('list')
    for k, v in mapped_date_values.items():
        mapped_date_values[k] = '/'.join(map(str, v))
    mapped_date_values = {v: k for k, v in mapped_date_values.items()}

    df['date'] = df.apply(lambda x: map_date(x, mapped_date_values), axis=1)
    df.drop(['day', 'month', 'year'], axis=1, inplace=True)
    return mapped_date_values

mapped_date_values = convert_to_date(df)

In [48]:
from pandas import ExcelWriter

# Separate test set
df_test = df.loc[df['date'].isin([mapped_date_values['13/4/2013'], mapped_date_values['13/7/2013'],
                                  mapped_date_values['14/7/2013'], mapped_date_values['7/11/2013'],
                                  mapped_date_values['15/12/2013'], mapped_date_values['9/2/2014'],
                                  mapped_date_values['17/2/2014']])].copy()
df_test.drop('unit_price', axis=1, inplace=True)
writer = ExcelWriter('datasets/gold_test.xlsx')
df_test.to_excel(writer, 'Sheet1')
writer.save()

In [105]:
# Remove test set data 
df = df.loc[~df['date'].isin([mapped_date_values['13/4/2013'], mapped_date_values['13/7/2013'],
                              mapped_date_values['14/7/2013'], mapped_date_values['7/11/2013'],
                              mapped_date_values['15/12/2013'], mapped_date_values['9/2/2014'],
                              mapped_date_values['17/2/2014']])]

# Set NAN data 
def set_nan_unit_price_hard(x):
    if x['unit_price'] == '؟':
        x['unit_price'] = df.loc[df['date'] == x['date'], 'unit_price'].head(1).values[0]
    return x
df_train = df.apply(set_nan_unit_price_hard, axis=1)

# Export df_train 
writer = ExcelWriter('datasets/gold_train.xlsx')
df_train.to_excel(writer, 'Sheet1')
writer.save()

In [109]:
df[df['unit_price'] == u'؟']

Unnamed: 0,store_num,daily_purchase,daily_sale,unit_price,date
2834,134955,41200,28325,؟,14
5685,134955,36050,236900,؟,28
9214,134955,51500,309000,؟,45
13785,134955,216300,51500,؟,67
15809,134955,103000,427450,؟,77
17713,134955,329600,56650,؟,87
19584,134955,185400,247200,؟,96
19920,134955,10300,72100,؟,98
20234,134955,628300,154500,؟,99
20435,134955,216300,164800,؟,100


In [110]:
from sklearn.model_selection import train_test_split

X = df_train.loc[:, ['store_num', 'daily_purchase', 'daily_sale', 'date']]
y = df_train.loc[:, 'unit_price']

X, y = X.values, y.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [111]:
from sklearn import linear_model

# df.groupby('unit_cost').size()
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [113]:
import numpy as np

y_pred = reg.predict(X_test)
print(y_pred)
n = X_test.shape[0]
print(y_test)
print("Mean squared error: %.2f" % (np.mean((y_pred - y_test) ** 2)))

[ 355813.01873164  381843.81241038  372957.59125389 ...,  370969.20020251
  372518.14216379  386914.86942423]
[358560 395280 375300 ..., 367200 375300 371520]
Mean squared error: 78594906.82


In [82]:
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=6, init='normal', activation='relu'))
    model.add(Dense(1, init='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5)

kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, X, y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Using Theano backend.


Exception: Error when checking model input: expected dense_input_1 to have shape (None, 6) but got array with shape (53403, 4)