In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import shapiro

In [2]:
# Load the customers table from the local ./data directory.
customers = pd.read_csv(
    filepath_or_buffer='./data/olist_customers_dataset.csv',
)

In [3]:
# Load the geolocation table (zip-code prefix, latitude/longitude, city, state).
geolocation = pd.read_csv(
    filepath_or_buffer='./data/olist_geolocation_dataset.csv',
)

In [4]:
# Load the order-items table from the local ./data directory.
order_items = pd.read_csv(
    filepath_or_buffer='./data/olist_order_items_dataset.csv',
)

In [5]:
# Load the order-payments table from the local ./data directory.
order_payment = pd.read_csv(
    filepath_or_buffer='./data/olist_order_payments_dataset.csv',
)

In [6]:
# Load the order-reviews table from the local ./data directory.
order_review = pd.read_csv(
    filepath_or_buffer='./data/olist_order_reviews_dataset.csv',
)

In [7]:
# Load the orders table from the local ./data directory.
orders = pd.read_csv(
    filepath_or_buffer='./data/olist_orders_dataset.csv',
)

In [9]:
# Load the products table from the local ./data directory.
products = pd.read_csv(
    filepath_or_buffer='./data/olist_products_dataset.csv',
)

In [10]:
# Load the sellers table from the local ./data directory.
sellers = pd.read_csv(
    filepath_or_buffer='./data/olist_sellers_dataset.csv',
)

In [11]:
# Load the product category name translation table from the local ./data directory.
product_category_name = pd.read_csv(
    filepath_or_buffer='./data/product_category_name_translation.csv',
)

<b>Writing a class to check regression assumptions on the data.</b>

In [39]:
class OLIST:
    """Holds a DataFrame and runs regression diagnostics against it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric (and boolean) columns are used for the VIF and
        normality checks.
        NOTE(review): the VIF is only meaningful if these columns are the
        model's predictors (no response column) -- confirm against callers.
    test_type : optional
        Stored on the instance; no method in this class reads it.
    """

    def __init__(self,data,test_type=None):
        self.data = data
        self.test_type=test_type

    def check_regression_assumptions(self,model):
        '''Plot and print diagnostics for the classical OLS assumptions.

            input_params : model - fitted statsmodels OLS results object; only
                           ``model.resid`` and ``model.fittedvalues`` are read.

            return - None. Diagnostics are drawn as matplotlib figures and
                     printed to stdout.

            The assumptions checked are
            1. Linearity between the response and the predictors.
            2. Absence of correlation between error terms.
            3. Homogeneity of variance (homoscedasticity).
            4. Absence of multicollinearity among predictor variables.
            5. Absence of outliers.
            6. Normality of the residuals (QQ plots of each numeric column
               of ``self.data`` are also produced).
        '''
        residuals = model.resid

        # Assumptions 1, 3 and 5: residuals vs fitted values. A visible pattern
        # suggests non-linearity; a funnel shape suggests heteroscedasticity
        # (non-constant variance); isolated points far from zero are outliers.
        # A dedicated axes is created so the plot never lands on a stale figure.
        _, resid_ax = plt.subplots()
        sns.scatterplot(x=model.fittedvalues, y=residuals, ax=resid_ax)
        resid_ax.set_xlabel('Fitted Values')
        resid_ax.set_ylabel('Residual values')

        # Assumption 2: error terms should not be correlated.
        # plot_acf creates its own figure unless given an axes, so passing
        # ax= avoids the blank figure that a bare plt.figure() leaves behind.
        # The lag count is capped so short residual series do not request
        # more lags than there are observations.
        max_lags = min(20, len(residuals) - 1)
        if max_lags >= 1:
            _, acf_ax = plt.subplots()
            sm.graphics.tsa.plot_acf(residuals, lags=max_lags, zero=False, ax=acf_ax)

        # Durbin-Watson: a value between 1.5 and 2.5 is taken as no autocorrelation.
        print('Durbin Watson value = ',durbin_watson(residuals))

        # Assumption 4: absence of multicollinearity, measured with VIF.
        # select_dtypes + assign build a new frame, so self.data is never
        # modified and no SettingWithCopyWarning is triggered. The constant
        # column is required for the auxiliary regressions behind each VIF.
        design = self.data.select_dtypes(include=['number', 'bool']).assign(Intercept=1)
        design_values = design.to_numpy(dtype=float)
        vif = pd.DataFrame({
            'Variables': design.columns,
            'Vif': [
                variance_inflation_factor(design_values, position)
                for position in range(design_values.shape[1])
            ],
        })
        print(vif)

        # Assumption 6: normality. The residuals are what OLS inference assumes
        # to be normal, so they are checked first.
        _, qq_ax = plt.subplots()
        qqplot(residuals, line='s', ax=qq_ax)
        qq_ax.set_title('Residuals')
        print('Residuals', shapiro(residuals))

        # Per-column normality plots. The synthetic 'Intercept' column is a
        # constant, so a QQ plot / Shapiro test on it is meaningless; skip it.
        for column in design.columns:
            if column == 'Intercept':
                continue
            _, qq_ax = plt.subplots()
            qqplot(design[column], line='s', ax=qq_ax)
            qq_ax.set_title(str(column))
            print(column, shapiro(design[column]))

In [46]:
# Inner-join customers to geolocation rows sharing the same zip-code prefix.
# NOTE(review): geolocation holds several rows per zip-code prefix, so this
# join is many-to-many and fans out (the recorded output runs to ~15 million
# rows); consider collapsing geolocation to one row per prefix first.
pd.merge(
    customers,
    geolocation,
    how='inner',
    left_on=['customer_zip_code_prefix'],
    right_on=['geolocation_zip_code_prefix'],
)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,14409,-20.509897,-47.397866,franca,SP
1,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,14409,-20.497396,-47.399241,franca,SP
2,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,14409,-20.510459,-47.399553,franca,SP
3,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,14409,-20.480940,-47.394161,franca,SP
4,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,14409,-20.515413,-47.398194,franca,SP
...,...,...,...,...,...,...,...,...,...,...
15083450,d9110683c7a282144e9fc97660026a28,5cbfdb85ec130898108b32c50d619c39,74980,aparecida de goiania,GO,74980,-16.821866,-49.244027,aparecida de goiania,GO
15083451,d9110683c7a282144e9fc97660026a28,5cbfdb85ec130898108b32c50d619c39,74980,aparecida de goiania,GO,74980,-16.821866,-49.244027,aparecida de goiania,GO
15083452,d9110683c7a282144e9fc97660026a28,5cbfdb85ec130898108b32c50d619c39,74980,aparecida de goiania,GO,74980,-16.822945,-49.244615,aparecida de goiania,GO
15083453,6fb4f2354f36e554ac80141e9128f528,3cc6f2e1b9199837fabb35ff4bf24884,99043,passo fundo,RS,99043,-28.226596,-52.467505,passo fundo,RS


In [43]:
# Display the geolocation table for inspection. Zip-code prefixes repeat
# across rows (e.g. 1046 appears twice in the first rows of the output).
geolocation

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.644820,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS
