In [581]:
import pandas as pd
import numpy as np
import math
import csv
from sklearn import preprocessing
from collections import defaultdict
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [582]:
# Read the raw train/test CSV files. `train` and `test` are kept untouched;
# `test` is read again when the submission file is written.
train, test = [pd.read_csv(path) for path in ("Train_UWu5bXk.csv", "Test_u94Q5KV.csv")]

# Working copies that the feature-engineering cells below are free to modify.
train_data, test_data = train.copy(), test.copy()

In [583]:
# Derived numeric features, computed the same way for the train and test frames.
# The NumPy ufuncs are applied to each whole column at once instead of calling
# a Python lambda per element; the resulting values are the same.
for frame in (train_data, test_data):
    frame['log_Item_MRP'] = np.log(frame['Item_MRP'])
    frame['exp_Item_Visibility'] = np.exp(frame['Item_Visibility'])
    # NOTE(review): 'sqroot_Item_Weight' is not listed in model_1_features, so
    # it is dropped again before modelling -- confirm whether that is intended.
    frame['sqroot_Item_Weight'] = np.sqrt(frame['Item_Weight'])

In [584]:
# Columns fed to the model (raw columns plus two of the derived features).
model_1_features = ['Item_Weight',
            'Item_Fat_Content',
            'Item_Visibility',
            'Item_Type',
            'Item_MRP',
            'Outlet_Establishment_Year',
            'Outlet_Size',
            'Outlet_Location_Type',
            'Outlet_Type',
            'log_Item_MRP',
            'exp_Item_Visibility']
# Subset of the model features that get replaced by integer category codes
# (this includes Outlet_Establishment_Year).
features_category = ['Item_Fat_Content',
            'Item_Type',
            'Outlet_Establishment_Year',
            'Outlet_Size',
            'Outlet_Location_Type',
            'Outlet_Type']
# Kept as a one-element list so that train_data[target] is a one-column DataFrame.
target = ['Item_Outlet_Sales']

# Keep only the modelling columns. The explicit .copy() makes these frames
# independent objects: they are modified in place when the categorical columns
# are encoded, and assigning into a selection of another frame would otherwise
# raise pandas' SettingWithCopyWarning.
train_data = train_data[model_1_features + target].copy()
test_data = test_data[model_1_features].copy()

The following functions convert the categorical features (mostly string-valued) into integer category codes.

In [585]:
def dic_of_categories(data, features=None):
    """Build a value -> integer-code mapping for each categorical feature.

    For every feature the distinct non-missing values are numbered 1, 2, 3, ...
    in the order returned by ``Series.value_counts()`` (most frequent first).
    ``value_counts()`` skips missing values, so ``NaN`` is added explicitly and
    maps to ``NaN``, leaving missing entries missing after encoding.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains every column named in ``features``.
    features : iterable of str, optional
        Columns to build a mapping for. Defaults to the notebook-level
        ``features_category`` list, which is what the original code always used.

    Returns
    -------
    dict
        ``{feature: {value: code}}``, one inner dict per feature.

    Raises
    ------
    KeyError
        If a requested feature is not a column of ``data``.
    """
    print("dic_of_categories")
    if features is None:
        # Resolved at call time so the default follows the notebook's list.
        features = features_category

    dic_of_dics = {}
    for feature in features:
        # Iterate the index of value_counts() directly; Series.iteritems() is
        # deprecated in recent pandas releases and the counts are not needed.
        categories = data[feature].value_counts().index
        mapping = {np.nan: np.nan}
        mapping.update(
            (category, code) for code, category in enumerate(categories, start=1)
        )
        dic_of_dics[feature] = mapping
    return dic_of_dics

In [586]:
def categorize_based_on_dic_of_categories(data, dic_of_dics):
    """Replace categorical values by their integer codes, in place.

    Every column of ``data`` that has an entry in ``dic_of_dics`` is overwritten
    with the codes from that entry; all other columns are left untouched.

    A value with no code in the mapping (for example a category that only
    occurs in the test set, or a missing value) becomes ``NaN`` instead of
    raising ``KeyError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place.
    dic_of_dics : dict
        ``{feature: {value: code}}`` as returned by ``dic_of_categories``.

    Returns
    -------
    None
    """
    print("categorize_based_on_dic_of_categories")
    for feature in data.columns:
        if feature not in dic_of_dics:
            continue
        mapping = dic_of_dics[feature]
        # .get(..., np.nan) covers unseen categories and NaN inputs without
        # depending on NaN being the very same object that was used as the key.
        data[feature] = data[feature].apply(
            lambda value, mapping=mapping: mapping.get(value, np.nan)
        )

In [587]:
# Build the category -> code mapping from the training features only, then
# apply that same mapping to both frames so their codes agree.
dic_of_dics = dic_of_categories(train_data[model_1_features])

for frame in (train_data, test_data):
    categorize_based_on_dic_of_categories(frame, dic_of_dics)

# Quick look at the encoded frames.
for frame in (train_data, test_data):
    print(frame.head())

dic_of_categories
categorize_based_on_dic_of_categories
categorize_based_on_dic_of_categories
   Item_Weight  Item_Fat_Content  Item_Visibility  Item_Type  Item_MRP  \
0         9.30                 1         0.016047          5  249.8092   
1         5.92                 2         0.019278          9   48.2692   
2        17.50                 1         0.016760         10  141.6180   
3        19.20                 2         0.000000          1  182.0950   
4         8.93                 1         0.000000          3   53.8614   

   Outlet_Establishment_Year  Outlet_Size  Outlet_Location_Type  Outlet_Type  \
0                          3            1                     3            1   
1                          7            1                     1            4   
2                          3            1                     3            1   
3                          9          NaN                     1            2   
4                          2            3                    

### Imputation of missing values in train and test data

In [588]:
# Median imputation of missing values.
imp = preprocessing.Imputer(missing_values='NaN', strategy='median', axis=0)

# Test rows are filled with the medians of the *training* feature columns.
# Re-fitting the imputer on the test set would fill train and test rows with
# different values for the same feature. This must run before train_data is
# overwritten below.
test_data_np = imp.fit(train_data[test_data.columns]).transform(test_data)
test_data = pd.DataFrame(test_data_np, columns=test_data.columns)

# Training rows: unchanged -- every column of train_data (target included) is
# filled with its own median.
train_data_np = imp.fit_transform(train_data)
train_data = pd.DataFrame(train_data_np, columns=train_data.columns)

In [589]:
# Inspect both imputed frames (the second line originally printed train_data
# a second time instead of test_data).
print(train_data.head())
print(test_data.head())

   Item_Weight  Item_Fat_Content  Item_Visibility  Item_Type  Item_MRP  \
0         9.30                 1         0.016047          5  249.8092   
1         5.92                 2         0.019278          9   48.2692   
2        17.50                 1         0.016760         10  141.6180   
3        19.20                 2         0.000000          1  182.0950   
4         8.93                 1         0.000000          3   53.8614   

   Outlet_Establishment_Year  Outlet_Size  Outlet_Location_Type  Outlet_Type  \
0                          3            1                     3            1   
1                          7            1                     1            4   
2                          3            1                     3            1   
3                          9            2                     1            2   
4                          2            3                     1            1   

   log_Item_MRP  exp_Item_Visibility  Item_Outlet_Sales  
0      5.520697 

In [590]:
# Feature matrix and target for fitting. `target` is a one-element list, so y
# is a single-column DataFrame rather than a Series.
X, y = train_data[model_1_features], train_data[target]

In [591]:
# Ridge regression on the training features. The plain linear-regression
# variant is kept below for reference but is disabled.
#model_lr = LinearRegression()
#model_lr.fit(X, y)

# fit() returns the estimator itself, so fitting can be chained onto the
# constructor; the bare name on the last line displays the same repr.
model_ridge = Ridge(alpha=0.001).fit(X, y)
model_ridge

Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

In [592]:
# Sanity check: the test frame must have the same number of columns as X.
print("{} {}".format(X.shape, y.shape))
print(test_data.shape)

(8523, 11) (8523, 1)
(5681, 11)


In [593]:
# Predict sales for the test rows with the fitted ridge model (the
# linear-regression variant is kept for reference but is disabled).
#predictions = model_lr.predict(test_data)
predictions = model_ridge.predict(test_data)
print(predictions)

[[ 2466.19924996]
 [ 1125.54504394]
 [ 3297.11894215]
 ..., 
 [ 1804.11973773]
 [ 3063.80727947]
 [ 1139.42371686]]


### Writing the submission file

In [594]:
# Write one row per test record: the two identifier columns from the untouched
# `test` frame, paired by position with the first column of `predictions`.
with open('submission.csv', 'wb') as csvfile:
    submission_writer = csv.writer(csvfile,
                                   delimiter=',',
                                   quotechar='|',
                                   quoting=csv.QUOTE_MINIMAL)
    header = ['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']
    submission_writer.writerow(header)
    # Iterate the columns in parallel instead of looking up each cell by row
    # label; `test` has the default 0..n-1 index, so both give the same rows.
    rows = zip(test['Item_Identifier'], test['Outlet_Identifier'], predictions[:, 0])
    submission_writer.writerows(rows)