In [1]:
cd

/home/jovyan


In [2]:
cd "Walmart_Project"

/home/jovyan/Walmart_Project


In [3]:
import warnings
# Install a global "ignore" filter: no warning of any category is shown after this line.
# NOTE(review): this also hides deprecation and chained-assignment warnings raised
# by the libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
import pandas as pd
import scipy.stats as st
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Helper functions for counting null values and listing the columns that contain them
def empty_count(data, feature):
    """Return the number of null entries in one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect.
    feature : hashable
        Label of the column to inspect.

    Returns
    -------
    int
        How many rows of ``data[feature]`` are flagged by ``isnull()``.
    """
    null_flags = data[feature].isnull()
    # Summing a boolean mask counts its True entries.
    return int(null_flags.sum())

def empty_count_total(data):
    """Print every column of ``data`` that contains nulls, with its null count.

    One ``<column> <count>`` line is printed per affected column, in column
    order. Columns without nulls are skipped and nothing is returned.
    """
    counts_by_column = ((column, empty_count(data, column)) for column in data.columns)
    for column, n_null in counts_by_column:
        if n_null > 0:
            print(column, n_null)
            
def empty_feature(data):
    """Return the labels of the columns of ``data`` that contain nulls.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan.

    Returns
    -------
    list
        Column labels, in column order, whose null count is greater than zero.
    """
    return [column for column in data.columns if empty_count(data, column) > 0]

# Load the train/test CSV files and give both frames the same dtypes.
def _prepare_frame(frame):
    """Return a copy of ``frame`` with the column types used by the rest of the notebook.

    The identical steps are applied to the train and the test table so that
    their categorical columns carry matching labels.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table as read from CSV; must contain the columns ``Unnamed: 0``,
        ``Store``, ``Dept``, ``IsHoliday``, ``Type`` and ``Date``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Unnamed: 0``, with ``Store``/``Dept``/``IsHoliday``/
        ``Type`` as categories, ``Date`` as datetime, and added categorical
        ``Year``/``Month``/``Day`` columns.
    """
    # presumably a leftover index column written by an earlier to_csv — verify
    frame = frame.drop("Unnamed: 0", axis=1)

    # Encode the holiday flag as 0/1 before it becomes a category. Doing this
    # for BOTH tables keeps the category labels (and therefore any dummy
    # column names derived from them) identical between train and test.
    frame["IsHoliday"] = frame["IsHoliday"].astype(int)

    for column in ("Store", "Dept", "IsHoliday", "Type"):
        frame[column] = frame[column].astype("category")

    # Parse Date once; as a datetime column it is neither category nor a
    # plain number and is dropped from the numeric frames later on.
    frame["Date"] = pd.to_datetime(frame["Date"])

    # Calendar parts are kept as categories, not as ordinal numbers.
    frame["Year"] = frame["Date"].dt.year.astype("category")
    frame["Month"] = frame["Date"].dt.month.astype("category")
    frame["Day"] = frame["Date"].dt.day.astype("category")
    return frame


train_df = _prepare_frame(pd.read_csv("data/train_full.csv"))
test_df = _prepare_frame(pd.read_csv("data/test_full.csv"))

# Impute nulls with the TRAINING-set column mean, for train and test alike.
# Train is filled first; the test fill then reads the mean of the already
# filled training column, exactly one column at a time.
for train_column in ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]:
    train_df[train_column] = train_df[train_column].fillna(train_df[train_column].mean())

# The test set never uses its own statistics: every gap is filled with the
# corresponding training-set average.
for test_column in ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "CPI", "Unemployment"]:
    test_df[test_column] = test_df[test_column].fillna(train_df[test_column].mean())

# Numeric view: everything that is not a category column.
non_category_train_df = train_df.select_dtypes(exclude="category")
non_category_test_df = test_df.select_dtypes(exclude="category")

# The regression target is taken out of the training features.
target = non_category_train_df["Weekly_Sales"]

# Date survives the dtype filter (it is a datetime, not a category) but is
# not a numeric feature, so it is removed from both frames as well.
numeric_train_df = non_category_train_df.drop(["Weekly_Sales", "Date"], axis=1)
numeric_test_df = non_category_test_df.drop(["Date"], axis=1)
del non_category_train_df, non_category_test_df

# Categorical data: one-hot encode the category columns of each frame.
categorical_train_df = train_df.select_dtypes(include="category")
categorical_train_encoded_df = pd.get_dummies(categorical_train_df)
# The test encoding must come from test_df; encoding train_df here would
# attach training rows to the test features when the frames are merged by index.
categorical_test_df = test_df.select_dtypes(include="category")
categorical_test_encoded_df = pd.get_dummies(categorical_test_df)

# Per-dummy summary statistics (index = dummy column name).
categorical_train_encoded_stats = pd.DataFrame()
categorical_train_encoded_stats["mean"] = categorical_train_encoded_df.mean()
categorical_train_encoded_stats["std"] = categorical_train_encoded_df.std()
categorical_train_encoded_stats["var"] = categorical_train_encoded_df.var()

categorical_test_encoded_stats = pd.DataFrame()
categorical_test_encoded_stats["mean"] = categorical_test_encoded_df.mean()
categorical_test_encoded_stats["std"] = categorical_test_encoded_df.std()
categorical_test_encoded_stats["var"] = categorical_test_encoded_df.var()

# Keep only dummies whose variance on the TRAINING set exceeds 0.20.
categorical_train_significant = categorical_train_encoded_stats[categorical_train_encoded_stats["var"] > .20].index
categorical_train_encoded_sig_df = categorical_train_encoded_df[categorical_train_significant]

# The test frame gets exactly the columns selected on the training set, in the
# same order, so train and test feature matrices line up. A selected dummy that
# does not occur in the test data is added as an all-zero column.
categorical_test_significant = categorical_train_significant
categorical_test_encoded_sig_df = categorical_test_encoded_df.reindex(
    columns=categorical_train_significant, fill_value=0
)

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

# Shift every numeric value into the positive range so np.log is defined.
# A single offset, taken from the smallest value across BOTH frames, is used:
# train and test must pass through the same transformation because the scaler
# fitted on the training data is applied to both of them below.
# NOTE: the inputs are reassigned, so re-running this cell shifts them again.
log_shift = np.abs(min(numeric_train_df.min().min(), numeric_test_df.min().min())) + 1
numeric_train_df = numeric_train_df + log_shift
numeric_test_df = numeric_test_df + log_shift

# Log to normalize
numeric_train_log_df = np.log(numeric_train_df)
numeric_test_log_df = np.log(numeric_test_df)

# Standardize the logged data; the scaler is fitted on the training set only
# and then applied unchanged to the test set.
scaler = StandardScaler()
scaler.fit(numeric_train_log_df)

numeric_train_log_sc = scaler.transform(numeric_train_log_df)
numeric_test_log_sc = scaler.transform(numeric_test_log_df)

numeric_train_log_sc_df = pd.DataFrame(numeric_train_log_sc, columns=numeric_train_log_df.columns)
numeric_test_log_sc_df = pd.DataFrame(numeric_test_log_sc, columns=numeric_test_log_df.columns)

# Gelman scaling: centre and divide by two standard deviations. The centre and
# spread come from the training set for both frames, matching the scaler above.
gelman_center = numeric_train_log_df.mean()
gelman_spread = 2 * numeric_train_log_df.std()
numeric_train_log_gel_df = (numeric_train_log_df - gelman_center) / gelman_spread
numeric_test_log_gel_df = (numeric_test_log_df - gelman_center) / gelman_spread

#Removing Outlier for training dataset
def display_outliers(data, feature, params=1.5):
    """Return the rows of ``data`` whose ``feature`` value lies outside the Tukey fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    feature : hashable
        Label of the column the fences are computed on.
    params : float, default 1.5
        Multiplier applied to the interquartile range to get the fence width.

    Returns
    -------
    pandas.DataFrame
        Rows with a value strictly below ``Q1 - params*IQR`` or strictly above
        ``Q3 + params*IQR``.
    """
    values = data[feature]
    lower_quartile, upper_quartile = np.percentile(values, 25), np.percentile(values, 75)
    fence_width = params * np.abs(lower_quartile - upper_quartile)

    too_low = values < lower_quartile - fence_width
    too_high = values > upper_quartile + fence_width
    return data[too_low | too_high]

from collections import Counter
def multiple_outliers(data, count = 2):
    """Return the index labels of rows that are outliers in several columns.

    Each column of ``data`` is checked with ``display_outliers`` (default
    fence multiplier); a row is reported when it is flagged in at least
    ``count`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all checked.
    count : int, default 2
        Minimum number of columns in which a row must be an outlier.

    Returns
    -------
    list
        Index labels meeting the threshold, in order of first detection.
    """
    flagged = Counter()
    for col in data:
        # Tally one hit per column in which the row falls outside the fences.
        flagged.update(display_outliers(data, col).index)
    # Honour the caller's threshold instead of a fixed value.
    return [label for label, hits in flagged.items() if hits >= count]

# Rows flagged as outliers in at least 6 standardized numeric columns.
# Computed once and reused, so every training frame and the target drop
# exactly the same rows and stay index-aligned.
outlier_index = multiple_outliers(numeric_train_log_sc_df, 6)

numeric_train_log_sc_out_rem_df = numeric_train_log_sc_df.drop(outlier_index)
numeric_train_log_gel_out_rem_df = numeric_train_log_gel_df.drop(outlier_index)
categorical_train_encoded_sig_out_rem_df = categorical_train_encoded_sig_df.drop(outlier_index)
target_out_rem = target.drop(outlier_index)

# PCA transformation. Fit on the training dataset only, so the components are
# not influenced by the test dataset; the test data is only transformed.
pca_log_sc_out_rem = PCA(3)
pca_log_gel_out_rem = PCA(3)

pca_log_sc_out_rem.fit(numeric_train_log_sc_out_rem_df)
pca_log_gel_out_rem.fit(numeric_train_log_gel_out_rem_df)

pca_columns = ["PC 1", "PC 2", "PC 3"]
numeric_train_log_sc_out_rem_pca_df = pd.DataFrame(pca_log_sc_out_rem.transform(numeric_train_log_sc_out_rem_df),
                                                     columns=pca_columns, index=numeric_train_log_sc_out_rem_df.index)
numeric_train_log_gel_out_rem_pca_df = pd.DataFrame(pca_log_gel_out_rem.transform(numeric_train_log_gel_out_rem_df),
                                                     columns=pca_columns, index=numeric_train_log_gel_out_rem_df.index)
numeric_test_log_sc_pca_df = pd.DataFrame(pca_log_sc_out_rem.transform(numeric_test_log_sc_df),
                                                     columns=pca_columns, index=numeric_test_log_sc_df.index)
numeric_test_log_gel_pca_df = pd.DataFrame(pca_log_gel_out_rem.transform(numeric_test_log_gel_df),
                                                     columns=pca_columns, index=numeric_test_log_gel_df.index)

# Merge dataset - Note: when you merge datasets, check that the index labels line up.
# Train and test are assembled in the same order (categorical, numeric, PCA)
# so both end up with the same column layout.
trainset_1 = pd.merge(categorical_train_encoded_sig_out_rem_df, numeric_train_log_sc_out_rem_df, left_index=True, right_index=True)
trainset_2 = pd.merge(trainset_1, numeric_train_log_sc_out_rem_pca_df, left_index=True, right_index=True)
testset_1 = pd.merge(categorical_test_encoded_sig_df, numeric_test_log_sc_df, left_index=True, right_index=True)
testset_2 = pd.merge(testset_1, numeric_test_log_sc_pca_df, left_index=True, right_index=True)
target_1 = target_out_rem
target_2 = target_out_rem

# Delete names that are no longer needed to free some memory.
del train_df
del test_df
del Counter
del display_outliers
del multiple_outliers