In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load ./data/olist_customers_dataset.csv into the `customers` DataFrame.
customers = pd.read_csv('./data/olist_customers_dataset.csv')

In [3]:
# Load ./data/olist_geolocation_dataset.csv into the `geolocation` DataFrame.
geolocation = pd.read_csv('./data/olist_geolocation_dataset.csv')

In [4]:
# Load ./data/olist_order_items_dataset.csv into the `order_items` DataFrame.
order_items = pd.read_csv('./data/olist_order_items_dataset.csv')

In [5]:
# Load ./data/olist_order_payments_dataset.csv into the `order_payment` DataFrame.
order_payment = pd.read_csv('./data/olist_order_payments_dataset.csv')

In [6]:
# Load ./data/olist_order_reviews_dataset.csv into the `order_review` DataFrame.
order_review = pd.read_csv('./data/olist_order_reviews_dataset.csv')

In [7]:
# Load ./data/olist_orders_dataset.csv into the `orders` DataFrame.
orders = pd.read_csv('./data/olist_orders_dataset.csv')

In [9]:
# Load ./data/olist_products_dataset.csv into the `products` DataFrame
# (its numeric columns feed the VIF check further down).
products = pd.read_csv('./data/olist_products_dataset.csv')

In [10]:
# Load ./data/olist_sellers_dataset.csv into the `sellers` DataFrame.
sellers = pd.read_csv('./data/olist_sellers_dataset.csv')

In [11]:
# Load ./data/product_category_name_translation.csv into the
# `product_category_name` DataFrame.
product_category_name = pd.read_csv('./data/product_category_name_translation.csv')

In [28]:
# Keep only the numeric columns of `products`. `select_dtypes` is the public
# replacement for the private `DataFrame._get_numeric_data()`.
prod_na = products.select_dtypes(include='number')

# Drop incomplete rows once, outside the comprehension: the original rebuilt
# this array for every column even though it never changes between iterations.
prod_complete_values = prod_na.dropna().values

# Variance inflation factor of each numeric column against all the others.
# NOTE(review): no constant column is appended to the design matrix here, so
# these factors are computed without an intercept term - confirm whether that
# is intended before comparing them against the usual VIF thresholds.
pd.Series(
    [
        variance_inflation_factor(prod_complete_values, col_idx)
        for col_idx in range(prod_na.shape[1])
    ],
    index=prod_na.columns,
)

product_name_lenght           8.403721
product_description_lenght    2.511064
product_photos_qty            2.646336
product_weight_g              2.374241
product_length_cm             6.471715
product_height_cm             3.536315
product_width_cm              7.399201
dtype: float64

In [32]:
# Column count of the numeric product frame once incomplete rows are dropped;
# this is the number of variables that a VIF value is computed for.
prod_na.dropna().shape[1]

7

In [None]:
class OLIST:
    '''Regression diagnostics helper for a tabular dataset.

    data      : object holding the modelling data. Its numeric columns are used
                for the multicollinearity check, so it must provide
                ``select_dtypes`` (e.g. a pandas DataFrame).
    test_type : optional value stored unchanged on the instance; none of the
                methods defined here read it.
    '''

    def __init__(self, data, test_type=None):
        # The original signature omitted `self`, so the assignments below
        # raised NameError and no instance could ever be created.
        self.data = data
        self.test_type = test_type

    def check_regression_assumptions(self, model):
        '''Run graphical and numerical checks of linear-regression assumptions.

            input_params : model - a fitted statsmodels OLS result (anything
                           exposing ``resid`` and ``fittedvalues``).

            return - DataFrame with one row per numeric column of ``self.data``
                     (plus the added 'Intercept' column) and its variance
                     inflation factor, in columns 'Variables' and 'Vif'.

            The assumptions of linear regression are
            1. Linearity of the relationship between response and predictors.
            2. Absence of correlation between error terms.
            3. Homogeneity of variance (homoscedasticity) of the error terms.
            4. Absence of multicollinearity among predictor variables.
            5. Absence of outliers.
            6. Normality of the error terms.

            This method covers assumptions 1-4: it draws a residual-vs-fitted
            scatter plot and an autocorrelation plot of the residuals, prints
            the Durbin-Watson statistic and prints the VIF table.
            Assumptions 5 and 6 are not checked here.
        '''

        # Assumptions 1 and 3: residuals vs fitted values. A visible pattern
        # suggests the relationship is not linear; a funnel shape (spread that
        # grows or shrinks with the fitted value) suggests heteroscedasticity.
        _, resid_ax = plt.subplots()
        sns.scatterplot(x=model.fittedvalues, y=model.resid, ax=resid_ax)
        resid_ax.set_xlabel('Fitted Values')
        resid_ax.set_ylabel('Residual values')

        # Assumption 2: error terms should not be correlated. Draw the ACF on
        # an explicit axes; calling plt.figure() first and letting plot_acf
        # open its own figure left an empty figure behind.
        _, acf_ax = plt.subplots()
        sm.graphics.tsa.plot_acf(model.resid, lags=20, zero=False, ax=acf_ax)

        # Durbin-Watson rule of thumb: a value between 1.5 and 2.5 indicates
        # no concerning autocorrelation.
        print('Durbin Watson value =', durbin_watson(model.resid))

        # Assumption 4: absence of multicollinearity, measured with VIF.
        # Work on an explicit copy so adding the constant column never touches
        # the caller's data, and drop incomplete rows because the auxiliary
        # regressions behind VIF cannot handle missing values.
        vif_check = self.data.select_dtypes(include='number').dropna().copy()
        vif_check['Intercept'] = 1

        design = vif_check.values  # built once; identical for every column
        vif = pd.DataFrame({
            'Variables': vif_check.columns,
            'Vif': [
                variance_inflation_factor(design, col_idx)
                for col_idx in range(design.shape[1])
            ],
        })
        print(vif)

        return vif

        
        
            
            
        
        
        
        
        
        
        