In [1]:
run src/preprocessing.ipynb

/home/jovyan
/home/jovyan/Walmart_Project


### Looking at the features on the training dataset

In [2]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

In [3]:
def sample_training_set(features_train, test_train, n_pcnt):
    """Take the leading ``n_pcnt`` percent of a training set.

    The sample size is ``features_train.shape[0] * n_pcnt // 100`` (floor
    division), and the same leading slice ``[:n]`` is applied to both
    inputs. Nothing is shuffled here.

    Parameters
    ----------
    features_train : sliceable object exposing ``.shape``
        Training features; ``shape[0]`` is used as the row count.
    test_train : sliceable object
        Values sliced in step with ``features_train`` (``run_model``
        passes the training targets in this position).
    n_pcnt : number
        Percentage of rows to keep.

    Returns
    -------
    tuple
        ``(n, features_train[:n], test_train[:n])``.
    """
    total_rows = features_train.shape[0]
    sample_size = total_rows * n_pcnt // 100
    # One slice object, so both inputs are cut at exactly the same position.
    leading = slice(None, sample_size)
    return sample_size, features_train[leading], test_train[leading]


def run_model(model, model_name, n_pcnt, data, labels, random_state=42):
    """Fit ``model`` on a leading sample of the training split and report metrics.

    ``data``/``labels`` are split with ``train_test_split``; the first
    ``n_pcnt`` percent of the training portion (see ``sample_training_set``)
    is used for fitting. Metrics are computed on that training sample and on
    the full held-out test portion.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place; the same object is returned under ``'model'``.
    model_name : str
        Label copied verbatim into the result.
    n_pcnt : number
        Percentage of the training split to fit on.
    data, labels
        Features and targets handed to ``train_test_split``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42, the value
        that was previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    dict
        Keys: ``model``, ``model_name``, ``n_pcnt``, ``n`` (rows fitted on),
        ``rmse_train``, ``rmse_test``, ``mae_train``, ``mae_test``,
        ``r2_train_score``, ``r2_test_score``.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        data, labels, random_state=random_state
    )

    n, features_train_sample, target_train_sample = sample_training_set(
        features_train, target_train, n_pcnt
    )

    model_fit = model.fit(features_train_sample, target_train_sample)

    # Predict once per set; every metric below reuses these arrays.
    train_prediction = model_fit.predict(features_train_sample)
    test_prediction = model_fit.predict(features_test)

    return {
        'model': model,
        'model_name': model_name,
        'n_pcnt': n_pcnt,
        'n': n,
        'rmse_train': np.sqrt(mean_squared_error(target_train_sample, train_prediction)),
        'rmse_test': np.sqrt(mean_squared_error(target_test, test_prediction)),
        'mae_train': mean_absolute_error(target_train_sample, train_prediction),
        'mae_test': mean_absolute_error(target_test, test_prediction),
        # R^2 from the cached predictions instead of model.score(), which
        # would run predict() a second time on each set. For scikit-learn
        # regressors score() is the R^2 of predict(), so the value is the
        # same for a single target column.
        # NOTE(review): confirm no multi-output targets are passed, where
        # older scikit-learn versions averaged score() differently.
        'r2_train_score': r2_score(target_train_sample, train_prediction),
        'r2_test_score': r2_score(target_test, test_prediction),
    }

In [4]:
# Baseline fit: Lasso with alpha=100 on every column of trainset_2,
# trained on the first 10% of the training split.
results = run_model(
    model=Lasso(alpha=100),
    model_name="variable_ranking",
    n_pcnt=10,
    data=trainset_2,
    labels=target_2,
)
results

{'model': Lasso(alpha=100, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False),
 'model_name': 'variable_ranking',
 'n_pcnt': 10,
 'n': 31607,
 'rmse_train': 14865.425312699557,
 'rmse_test': 15028.228230883822,
 'mae_train': 9563.7436206370585,
 'mae_test': 9521.7790444655366,
 'r2_train_score': 0.57465309269257792,
 'r2_test_score': 0.55924166777361339}

### Variable Ranking by Single-Feature $R^2$ Score

In [5]:
# Fit a default Lasso on each column in isolation (first 20% of the
# training split) and record the test-set R^2 it achieves.
test_scores = []

for feature in trainset_2.columns:
    results = run_model(
        model=Lasso(),
        model_name="variable_ranking",
        n_pcnt=20,
        data=trainset_2[[feature]],  # double brackets keep a one-column 2-D frame
        labels=target_2,
    )
    feature_test_score = results["r2_test_score"]

    test_scores.append(dict(feature=feature, score=feature_test_score))

In [6]:
# Show the first 20 per-feature scores. Rows follow the order of
# trainset_2.columns; no sorting by score is applied here.
pd.DataFrame(test_scores).head(20)

Unnamed: 0,feature,score
0,Store_1,0.001515
1,Store_2,0.005863
2,Store_3,0.003461
3,Store_4,0.008688
4,Store_5,0.004822
5,Store_6,0.001579
6,Store_7,0.002340
7,Store_8,0.000020
8,Store_9,0.001853
9,Store_10,0.005286
