#Trying out feature scaling

In [112]:
import pandas as pd
import numpy as np
import math
import csv
from sklearn import preprocessing
from collections import defaultdict
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [129]:
train = pd.read_csv("Train_UWu5bXk.csv")
test = pd.read_csv("Test_u94Q5KV.csv")
train_data = train.copy()
test_data = test.copy()
train_data.head()
test_data.head()
print train_data.describe()

       Item_Weight  Item_Visibility     Item_MRP  Outlet_Establishment_Year  \
count  7060.000000      8523.000000  8523.000000                8523.000000   
mean     12.857645         0.066132   140.992782                1997.831867   
std       4.643456         0.051598    62.275067                   8.371760   
min       4.555000         0.000000    31.290000                1985.000000   
25%       8.773750         0.026989    93.826500                1987.000000   
50%      12.600000         0.053931   143.012800                1999.000000   
75%      16.850000         0.094585   185.643700                2004.000000   
max      21.350000         0.328391   266.888400                2009.000000   

       Item_Outlet_Sales  
count        8523.000000  
mean         2181.288914  
std          1706.499616  
min            33.290000  
25%           834.247400  
50%          1794.331000  
75%          3101.296400  
max         13086.964800  


In [114]:
features = ['Item_Weight',
            'Item_Fat_Content',
            'Item_Visibility',
            'Item_Type',
            'Item_MRP',
            'Outlet_Establishment_Year',
            'Outlet_Size',
            'Outlet_Location_Type',
            'Outlet_Type']
features_category = ['Item_Fat_Content',
            'Item_Type',
            'Outlet_Establishment_Year',
            'Outlet_Size',
            'Outlet_Location_Type',
            'Outlet_Type']
target = ['Item_Outlet_Sales']
train_data = train_data[features + target]
test_data = test_data[features]

Following is the function to convert all categorical 'str' features to numeric categorical features

In [115]:
def dic_of_categories(data):
    dic_of_dics = {}
    for feature in features_category:
        j = 1
        keys = [np.nan]
        values = [np.nan]
        for i in data[feature].value_counts().iteritems():
            keys.append(i[0])
            values.append(j)
            j += 1
        dic1 = dict(zip(keys, values))
        dic_of_dics.setdefault(feature, dic1) 
    return dic_of_dics

In [116]:
def categorize_based_on_dic_of_categories(data, dic_of_dics):
    for feature in data.columns:
        if feature in dic_of_dics:
            #print dic_of_dics[feature]
            data[feature] = data[feature].apply(lambda x: dic_of_dics[feature][x])

In [117]:
dic_of_dics = dic_of_categories(train_data[features]) 

categorize_based_on_dic_of_categories(train_data, dic_of_dics)
categorize_based_on_dic_of_categories(test_data, dic_of_dics)

print train_data.head()
print test_data.head()

   Item_Weight  Item_Fat_Content  Item_Visibility  Item_Type  Item_MRP  \
0         9.30                 1         0.016047          5  249.8092   
1         5.92                 2         0.019278          9   48.2692   
2        17.50                 1         0.016760         10  141.6180   
3        19.20                 2         0.000000          1  182.0950   
4         8.93                 1         0.000000          3   53.8614   

   Outlet_Establishment_Year  Outlet_Size  Outlet_Location_Type  Outlet_Type  \
0                          3            1                     3            1   
1                          7            1                     1            4   
2                          3            1                     3            1   
3                          9          NaN                     1            2   
4                          2            3                     1            1   

   Item_Outlet_Sales  
0          3735.1380  
1           443.4228  
2    

### Imputation in train data

In [118]:
imp = preprocessing.Imputer(missing_values='NaN', strategy='median', axis=0)

imp.fit(train_data)
train_data_np = imp.transform(train_data)
train_data = pd.DataFrame(train_data_np, columns=train_data.columns)

imp.fit(test_data)
test_data_np = imp.transform(test_data)
test_data = pd.DataFrame(test_data_np, columns=test_data.columns)

In [119]:
print train_data.head()
print train_data.head()

   Item_Weight  Item_Fat_Content  Item_Visibility  Item_Type  Item_MRP  \
0         9.30                 1         0.016047          5  249.8092   
1         5.92                 2         0.019278          9   48.2692   
2        17.50                 1         0.016760         10  141.6180   
3        19.20                 2         0.000000          1  182.0950   
4         8.93                 1         0.000000          3   53.8614   

   Outlet_Establishment_Year  Outlet_Size  Outlet_Location_Type  Outlet_Type  \
0                          3            1                     3            1   
1                          7            1                     1            4   
2                          3            1                     3            1   
3                          9            2                     1            2   
4                          2            3                     1            1   

   Item_Outlet_Sales  
0          3735.1380  
1           443.4228  
2    

### Feature Scaling

In [120]:
features
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
train_data_scaled = min_max_scaler.fit_transform(train_data[features])
train_data[features] = pd.DataFrame(train_data_scaled, columns=features)

test_data_scaled = min_max_scaler.fit_transform(test_data[features])
test_data[features] = pd.DataFrame(test_data_scaled, columns=features)

In [121]:
print train_data
print test_data

      Item_Weight  Item_Fat_Content  Item_Visibility  Item_Type  Item_MRP  \
0       -0.434951              -1.0        -0.902267  -0.466667  0.855014   
1       -0.837452              -0.5        -0.882590   0.066667 -0.855863   
2        0.541530              -1.0        -0.897926   0.200000 -0.063423   
3        0.743971              -0.5        -1.000000  -1.000000  0.280187   
4       -0.479012              -1.0        -1.000000  -0.733333 -0.808391   
5       -0.304555              -0.5        -1.000000  -0.200000 -0.829279   
6        0.083060              -0.5        -0.922403  -0.866667 -0.776155   
7       -0.041977              -1.0        -0.223670  -0.866667 -0.350826   
8        0.386722              -0.5        -0.898370  -0.600000 -0.442419   
9        0.743971              -0.5        -0.424773  -0.600000  0.328798   
10      -0.137243              -1.0        -1.000000  -1.000000 -0.879030   
11       0.660613              -0.5        -0.723112  -0.466667 -0.042267   

In [122]:
#train_data['log_Item_MRP'] = train_data['Item_MRP'].apply(lambda x: np.log(x))
#test_data['log_Item_MRP'] = test_data['Item_MRP'].apply(lambda x: np.log(x))

#train_data['sqrt_Item_MRP'] = train_data['Item_MRP'].apply(lambda x: np.sqrt(x))
#test_data['sqrt_Item_MRP'] = test_data['Item_MRP'].apply(lambda x: np.sqrt(x))

#train_data['cube_root_Item_MRP'] = train_data['Item_MRP'].apply(lambda x: math.pow(x, float(1)/3))
#test_data['cube_root_Item_MRP'] = test_data['Item_MRP'].apply(lambda x: math.pow(x, float(1)/3))

#train_data['4th_root_Item_MRP'] = train_data['Item_MRP'].apply(lambda x: math.pow(x, float(1)/4))
#test_data['4th_root_Item_MRP'] = test_data['Item_MRP'].apply(lambda x: math.pow(x, float(1)/4))

#train_data['exp_Item_Visibility'] = train_data['Item_Visibility'].apply(lambda x: np.exp(x))
#test_data['exp_Item_Visibility'] = test_data['Item_Visibility'].apply(lambda x: np.exp(x))

#train_data['sqroot_Item_Weight'] = train_data['Item_Weight'].apply(lambda x: np.sqrt(x))
#test_data['sqroot_Item_Weight'] = test_data['Item_Weight'].apply(lambda x: np.sqrt(x))

#train_data['Total_Item_Visibility'] = train_data['Item_Visibility'] * train_data['Outlet_Size']
#test_data['Total_Item_Visibility'] = test_data['Item_Visibility'] * train_data['Outlet_Size']

#train_data['Total_2_Item_Visibility'] = train_data['Total_Item_Visibility'].apply(lambda x: np.square(x))
#test_data['Total_2_Item_Visibility'] = test_data['Total_Item_Visibility'].apply(lambda x: np.square(x))

In [123]:
#model1_features = features + ['sqrt_Item_MRP', 'cube_root_Item_MRP', '4th_root_Item_MRP', 'exp_Item_Visibility']
#model1_features = features

In [124]:
#X = train_data[model1_features]
X = train_data[features]
y = train_data[target]

In [125]:
#model_lr = LinearRegression()
#model_lr.fit(X, y)

model_ridge = Ridge(alpha=0.001)
model_ridge.fit(X, y)

Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

In [126]:
print X.shape, y.shape
print test_data.shape

(8523, 9) (8523, 1)
(5681, 9)


In [127]:
#predictions = model_lr.predict(test_data)
predictions = model_ridge.predict(test_data[model1_features])
print predictions

[[ 2540.58614077]
 [ 1111.58172312]
 [ 3242.02253726]
 ..., 
 [ 1744.90615508]
 [ 3166.97551271]
 [ 1067.79638633]]


### Writing the submission file

In [128]:
with open('submission.csv', 'wb') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    submission_writer.writerow(['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])
    for i in xrange(len(test)):
        submission_writer.writerow([test['Item_Identifier'][i], test['Outlet_Identifier'][i], predictions[i][0]])