In [2]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (12.0, 10.0)

In [17]:
class Information:
    """
    Small helper that reports brief summary information about a dataset.
    """

    def __init__(self):
        print("Information object created")

    def _get_missing_values(self,data):
        """
        Count the missing values of every feature in the given data.

        :param data: object whose missing values are counted; it must
            support ``isnull().sum()`` (e.g. a pandas DataFrame)
        :return: Pandas Series object with one count per feature, ordered
            from the most missing values to the fewest
        """
        # Per-feature null counts, sorted in descending order (many -> few)
        return data.isnull().sum().sort_values(ascending=False)

In [47]:
class Preprocess:
    """
    Cleaning helpers for the sales training data.
    """

    def __init__(self):
        # This used to be spelled `__init_`, so Python never ran it as the
        # constructor and the message below was never printed.
        print("Preprocess object created")

    def remove_no_sales(self,train):
        """
        Drop the rows that recorded no sales.

        Also prints whether the data is free of rows that are flagged as
        closed (``Open == 0``) but still report a non-zero ``Sales`` value.

        :param train: DataFrame with ``Open`` and ``Sales`` columns; it is
            not modified
        :return: DataFrame holding only the rows with ``Sales > 0``
        """
        not_open = train[(train['Open'] == 0) & (train['Sales'] != 0)]
        print("No closed store with sales: " + str(not_open.size == 0))
        # Return the filtered frame: rebinding the local name `train` (what
        # the code did before) silently threw the result away.
        return train.loc[train['Sales'] > 0]

    def date_range(self,data):
        """
        Print the first and last date in ``data`` and build the daily range
        between them.

        :param data: DataFrame with a ``Date`` column holding datetimes (or
            values ``pd.to_datetime`` can parse)
        :return: numpy array with one datetime64 entry per calendar day from
            the earliest to the latest date, both included
        :raises IndexError: if ``data`` has no rows
        """
        # Read the `data` argument, not a global `train`. No explicit
        # `format` is passed: the old "%Y%m%d:%H:%M:%S.%f" pattern was
        # ignored for already-parsed datetimes and would reject plain
        # "YYYY-MM-DD" strings.
        dates = pd.to_datetime(data['Date']).sort_values().unique()
        start_date = dates[0]
        end_date = dates[-1]
        print("Start date: ", start_date)
        print("End Date: ", end_date)
        return pd.date_range(start_date, end_date).values

In [50]:
# Load the training data; column index 2 is parsed as dates on read.
train = pd.read_csv("train.csv", parse_dates=[2])
p1 = Preprocess()

# Print the closed-store sanity check. `train` is not reassigned here, so
# the following call still sees the frame exactly as it was loaded.
p1.remove_no_sales(train)
# Print the first and last date covered by the data.
p1.date_range(train)

No closed store with sales: True
('Start date: ', numpy.datetime64('2013-01-01T00:00:00.000000000'))
('End Date: ', numpy.datetime64('2015-07-17T00:00:00.000000000'))


In [84]:
class Datavisualisation:
    """
    Scatter-plot helpers for exploring the sales and store data.

    Every plotting method draws with matplotlib and finishes with
    ``plt.show()``.
    """

    def __init__(self):
        print ("DataVisualisation object created")

    def _scatter_by_category(self, frame, column, categories, x, y, xlabel, ylabel):
        """
        Draw one labelled scatter series per non-empty category of ``column``.

        :param frame: DataFrame containing ``column``, ``x`` and ``y``
        :param column: name of the column the rows are split on
        :param categories: values of ``column`` to plot, in drawing order
        :param x: name of the column used for the x axis
        :param y: name of the column used for the y axis
        :param xlabel: label for the x axis
        :param ylabel: label for the y axis
        """
        for category in categories:
            subset = frame[frame[column] == category]
            if len(subset) == 0:
                continue
            # The scatter call must stay inside the loop: when it sat after
            # the loop only the last category was ever drawn.
            plt.scatter(subset[x], subset[y], label=category)
        plt.legend()
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.show()

    def sales_per_day(self,train):
        """
        Plot Sales against Customers in seven stacked panels, one per
        ``DayOfWeek`` value 1..7; each panel is titled with its day number.

        :param train: DataFrame with ``DayOfWeek``, ``Customers``, ``Sales``
        """
        # The size is passed to subplots() directly. Setting
        # plt.rcParams['figure.figsize'] after the figure already exists
        # does not resize it and only changes the size of later figures.
        f, ax = plt.subplots(7, sharex=True, sharey=True, figsize=(10.0, 50.0))
        for i in range(1, 8):
            day_rows = train[train['DayOfWeek'] == i]
            panel = ax[i - 1]
            panel.set_title("Day {0}".format(i))
            panel.scatter(day_rows['Customers'], day_rows['Sales'], label=i)
            # Label every panel; plt.ylabel() only reaches the current axes.
            panel.set_ylabel('Sales')
        # The x axis is shared, so one label on the bottom panel is enough.
        ax[-1].set_xlabel('Customers')
        plt.show()

    def sales_per_customer(self,train):
        """
        Plot Sales against Customers, coloured by ``DayOfWeek``.

        :param train: DataFrame with ``DayOfWeek``, ``Customers``, ``Sales``
        """
        # Pass the colormap by name; plt.cm.get_cmap() is no longer
        # available in recent matplotlib releases.
        plt.scatter(train['Customers'], train['Sales'], c=train['DayOfWeek'], alpha=0.8, cmap='plasma')
        plt.colorbar()
        plt.xlabel('Customers')
        plt.ylabel('Sales')
        plt.show()

    def state_holiday(self,train):
        """
        Plot Sales against Customers, one series per ``StateHoliday`` value.

        :param train: DataFrame with ``StateHoliday``, ``Customers``, ``Sales``
        """
        self._scatter_by_category(train, 'StateHoliday', ["0", "a", "b", "c"],
                                  'Customers', 'Sales', 'Customers', 'Sales')

    def school_holiday(self,train):
        """
        Plot Sales against Customers, one series per ``SchoolHoliday`` value.

        :param train: DataFrame with ``SchoolHoliday``, ``Customers``, ``Sales``
        """
        self._scatter_by_category(train, 'SchoolHoliday', [0, 1],
                                  'Customers', 'Sales', 'Customers', 'Sales')

    def promo(self,train):
        """
        Plot Sales against Customers, one series per ``Promo`` value.

        :param train: DataFrame with ``Promo``, ``Customers``, ``Sales``
        """
        self._scatter_by_category(train, 'Promo', [0, 1],
                                  'Customers', 'Sales', 'Customers', 'Sales')

    def add_store(self,train,store):
        """
        Attach per-store aggregates of the training data to the store table.

        Note that ``train`` gains a ``SalesPerCustomer`` column in place, and
        that the ``Avg``-prefixed columns hold per-store medians.

        :param train: DataFrame with ``Store``, ``Sales``, ``Customers``
        :param store: DataFrame with a ``Store`` column to merge on
        :return: ``store`` merged with ``AvgSales``, ``AvgCustomers`` and
            ``AvgSalesPerCustomer``
        """
        train['SalesPerCustomer'] = train['Sales'] / train['Customers']
        avg_store = train.groupby('Store')[['Sales', 'Customers', 'SalesPerCustomer']].median()
        avg_store.rename(columns=lambda x: 'Avg' + x, inplace=True)
        store = pd.merge(avg_store.reset_index(), store, on='Store')
        return store

    def store_type(self,store):
        """
        Plot AvgSales against AvgCustomers, one series per ``StoreType``.

        :param store: DataFrame with ``StoreType``, ``AvgCustomers``, ``AvgSales``
        """
        self._scatter_by_category(store, 'StoreType', ['a', 'b', 'c', 'd'],
                                  'AvgCustomers', 'AvgSales',
                                  'Average Customers', 'Average Sales')

    def assortment(self,store):
        """
        Plot AvgSales against AvgCustomers, one series per ``Assortment``.

        :param store: DataFrame with ``Assortment``, ``AvgCustomers``, ``AvgSales``
        """
        self._scatter_by_category(store, 'Assortment', ['a', 'b', 'c'],
                                  'AvgCustomers', 'AvgSales',
                                  'Average Customers', 'Average Sales')

    def promo2(self,store):
        """
        Plot AvgSales against AvgCustomers, one series per ``Promo2`` value.

        :param store: DataFrame with ``Promo2``, ``AvgCustomers``, ``AvgSales``
        """
        self._scatter_by_category(store, 'Promo2', [0, 1],
                                  'AvgCustomers', 'AvgSales',
                                  'Average Customers', 'Average Sales')

    def fill_na_values(self,store):
        """
        Plot AvgSales against CompetitionDistance, drawing missing distances
        at -1.

        :param store: DataFrame with ``CompetitionDistance`` and ``AvgSales``;
            it is not modified
        """
        # fillna() returns a new Series, so its result has to be used. The
        # previous bare call discarded it and the plot still saw the NaNs.
        distance = store["CompetitionDistance"].fillna(-1)
        plt.scatter(distance, store['AvgSales'])

        plt.xlabel('CompetitionDistance')
        plt.ylabel('Average Sales')
        plt.show()

In [86]:
class Features:
    """
    Feature-engineering steps applied to the merged train/store table.

    The methods build on each other: ``Month_Competetions`` and
    ``Weeks_promo_open`` read the ``Year``/``Month`` columns added by
    ``remove_nan``, and ``to_int`` expects the columns those two create.
    """

    # Value written in place of NaN; also the marker for "no competition
    # date" / "no Promo2 date" when the duration features are computed.
    NAN_REPLACE = 0

    def __init__(self):
        print ("Features object created")

    @staticmethod
    def _week_of_year(dates):
        """Return the ISO week number of each entry of a datetime Series."""
        accessor = dates.dt
        if hasattr(accessor, "isocalendar"):
            # `.dt.week` / `.dt.weekofyear` no longer exist in pandas 2.x.
            weeks = accessor.isocalendar().week
            # isocalendar() yields a nullable UInt32 column; convert it to a
            # plain numpy dtype so later arithmetic behaves as it did before.
            if weeks.isna().any():
                return weeks.astype("float64")
            return weeks.astype("int64")
        # Older pandas without isocalendar(): fall back to the legacy accessor.
        return accessor.week

    def string_to_int(self,store,train):
        """
        Replace the string categories with integer codes and join the tables.

        ``store`` and ``train`` are modified in place.

        :param store: DataFrame with ``Store``, ``StoreType``, ``Assortment``
        :param train: DataFrame with ``Store`` and ``StateHoliday``
        :return: left merge of ``train`` with ``store`` on ``Store``
        """
        store['StoreType'] = store['StoreType'].astype('category').cat.codes
        store['Assortment'] = store['Assortment'].astype('category').cat.codes
        train["StateHoliday"] = train["StateHoliday"].astype('category').cat.codes
        merged = pd.merge(train, store, on='Store', how='left')
        return merged

    def remove_nan(self,merged):
        """
        Fill missing values with 0 and add calendar columns from ``Date``.

        ``merged`` is modified in place.

        :param merged: DataFrame with a datetime ``Date`` column
        :return: ``merged`` with ``Year``, ``Month``, ``Day``, ``Week`` added
        """
        merged.fillna(self.NAN_REPLACE, inplace=True)
        merged['Year'] = merged.Date.dt.year
        merged['Month'] = merged.Date.dt.month
        merged['Day'] = merged.Date.dt.day
        merged['Week'] = self._week_of_year(merged.Date)
        return merged

    def Month_Competetions(self,merged):
        """
        Add ``MonthsCompetitionOpen``: months the competition has existed for.

        Rows whose ``CompetitionOpenSinceYear`` equals the fill value get 0.

        :param merged: DataFrame with ``Year``, ``Month``,
            ``CompetitionOpenSinceYear``, ``CompetitionOpenSinceMonth``
        :return: ``merged`` with the new column (modified in place)
        """
        merged['MonthsCompetitionOpen'] = 12 * (merged['Year'] - merged['CompetitionOpenSinceYear']) + \
        (merged['Month'] - merged['CompetitionOpenSinceMonth'])
        merged.loc[merged['CompetitionOpenSinceYear'] == self.NAN_REPLACE, 'MonthsCompetitionOpen'] = self.NAN_REPLACE
        return merged

    def Weeks_promo_open(self,merged):
        """
        Add ``WeeksPromoOpen``: weeks the promotion has existed for.

        Rows whose ``Promo2SinceYear`` equals the fill value get 0.

        :param merged: DataFrame with ``Date``, ``Year``,
            ``Promo2SinceYear``, ``Promo2SinceWeek``
        :return: ``merged`` with the new column (modified in place)
        """
        # A year difference is worth 52 weeks; the factor 12 used previously
        # was the months-per-year constant from Month_Competetions.
        merged['WeeksPromoOpen'] = 52 * (merged['Year'] - merged['Promo2SinceYear']) + \
        (self._week_of_year(merged['Date']) - merged['Promo2SinceWeek'])
        merged.loc[merged['Promo2SinceYear'] == self.NAN_REPLACE, 'WeeksPromoOpen'] = self.NAN_REPLACE
        return merged

    def to_int(self,merged):
        """
        Cast the competition / promotion date and duration columns to int.

        :param merged: DataFrame containing the six columns listed below
        :return: ``merged`` with those columns converted (modified in place)
        """
        toInt = [
            'CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear',
            'Promo2SinceWeek', 
            'Promo2SinceYear', 
            'MonthsCompetitionOpen', 
            'WeeksPromoOpen'
        ]
        merged[toInt] = merged[toInt].astype(int)
        return merged