Helper functions for the Module-2 Final Project. The list of functions and their descriptions is given below.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import scipy.stats as stats
%matplotlib inline
import warnings

In [2]:
import seaborn as sns
import scipy.stats as stats

In [6]:
def corr_map(df, cutoff = 0.75):
    ''' Plot a masked heat map of absolute correlations and tabulate the column pairs.

    @params:
    df is a pd.DataFrame; df.corr() must be computable on it.
    cutoff (default = 0.75) is the absolute-correlation threshold. It is the lower
        end of the heat map colour scale and the limit used to select high_corr.

    @Output:
    df_corr is a DataFrame indexed by (column_a, column_b) tuples ('pair') with one
        column 'cc' (absolute correlation), sorted from highest to lowest. Every
        unordered pair of distinct columns appears exactly once.
    high_corr is the Index of the pairs whose 'cc' is strictly greater than cutoff.
    '''

    # Compute the absolute correlation matrix once; the plot and the pair table share it.
    corr = df.corr().abs()

    # Set a new fig and its size
    fig, ax = plt.subplots(figsize=(12, 12))

    # Mask the diagonal and the upper triangle so each pair is shown only once.
    # The builtin bool is used because the np.bool alias was removed from NumPy.
    mask = np.triu(np.ones(corr.shape, dtype=bool))

    # Heat map of the correlations, rounded to 3 decimals for readable annotations
    sns.heatmap(corr.round(3), annot=True, square=True, mask=mask, vmin=cutoff, cmap='Blues',
                ax=ax, linewidths=.5, cbar_kws={"shrink": .5}, cbar=True)

    # matplotlib 3.1.1 cuts the first and last heat map rows in half; resetting the
    # y-limits on the Axes restores them and is a no-op on unaffected versions.
    ax.set_ylim(len(corr), 0)

    # Build the pair table from the strict upper triangle. This keeps one entry per
    # unordered pair without de-duplicating on the correlation value, which would
    # also discard unrelated pairs that happen to share the same value.
    rows, cols = np.triu_indices(len(corr), k=1)
    names = corr.columns
    pairs = pd.Index(list(zip(names[rows], names[cols])), name='pair',
                     tupleize_cols=False)
    df_corr = pd.DataFrame({'cc': corr.values[rows, cols]}, index=pairs)
    df_corr = df_corr.sort_values('cc', ascending=False)

    print('Columns with correlation higher than {} are'.format(cutoff))
    # Select the pair labels (the index); the column labels are always just ['cc'].
    high_corr = df_corr.index[df_corr['cc'] > cutoff]
    print(high_corr)
    return df_corr, high_corr

In [None]:
def multi_linear_reg (df, drop_cols, target):
    ''' Fit an OLS multilinear model on a DataFrame, leaving out drop_cols.

    @params:
    df is a pd.DataFrame
    drop_cols is a list of columns to not include in the model fit
    target is the str() of our predicted columns name.

    @Output:
    Displays the model summary, a QQ-plot of the residuals (normality check)
    and a residuals-vs-target scatter plot (homoscedasticity check).

    Returns (cols, model):
    cols is the Index of predictor columns used in the formula
    model is the fitted statsmodels results object
    '''
    # Predictors are every remaining column; the target itself is removed so it can
    # never end up on both sides of the formula.
    cols = df.drop(drop_cols, axis=1).columns.drop(target, errors='ignore')

    # Build the formula string; fall back to an intercept-only model when no
    # predictor is left.
    str_cols = ' + '.join(cols) or '1'
    f = str(target) + '~' + str_cols

    # ols() only describes the model; fit() estimates it and returns the results
    model = smf.ols(f, df).fit()
    display(model.summary())

    fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(10,10))

    # Checking normality of our residual errors
    resids = model.resid
    sm.graphics.qqplot(resids, stats.norm, line='45', fit=True, ax=axes[0])

    # Checking for homoscedasticity. The response values are read from the fitted
    # model so they stay aligned with the residuals even if rows were dropped.
    xs = model.model.endog
    axes[1].scatter(xs, resids, alpha=0.5)
    axes[1].axhline(y=0, color='k')

    axes[1].set(ylabel='Residuals', title='Homoskedasticity Check',
                xlabel=str(target))

    return cols, model
    

    

In [3]:
def multi_scatterplot(df, cols, target, kind='scatter'):
    '''
     Plot each column in cols against the target column on a grid, 4 plots per row.

     Input:
     df : pandas.DataFrame()
     cols : list-like of column names used as the x axis, one subplot per column.
     target : name of the column used as the y axis in every subplot.
     kind (default = 'scatter') : plot kind passed on to DataFrame.plot.

     Nothing is drawn when cols is empty. Unused grid cells are hidden.
     '''
    cols = list(cols)
    if not cols:
        return

    per_row = 4
    # Ceiling division: enough rows to fit every column
    n_rows = (len(cols) + per_row - 1) // per_row

    # squeeze=False always gives a 2-D array of Axes, so flattening it yields
    # one Axes per grid cell whatever the grid shape is.
    fig, axes = plt.subplots(nrows=n_rows, ncols=per_row, figsize =(10,10),
                             squeeze=False)
    flat_axes = axes.flatten()

    for col, ax in zip(cols, flat_axes):
        df.plot(kind=kind, x=col, y=target, ax=ax, alpha=0.4, color='b')

    # Hide the cells of the last row that have no column to show
    for ax in flat_axes[len(cols):]:
        ax.set_visible(False)


In [4]:
def catagorical(df, value_range = (1,10)):
    '''
     Pick out the columns of a dataframe that look categorical, judged only by
     how many distinct values each column holds.

     Input:
     df : pandas.DataFrame()

     value_range (default = (1,10)) : (lower, upper) limits on the number of unique
                  values per column. Both limits are exclusive.

     Returns: numpy array with the names of the matching columns, ordered by
              ascending number of unique values.
     '''

    lower = value_range[0]
    upper = value_range[1]

    # Unique-value count per column, smallest first
    n_unique = df.nunique().sort_values()

    # Keep the columns whose count lies strictly between the two limits
    in_range = (n_unique > lower) & (n_unique < upper)
    return np.array(n_unique[in_range].index)

In [5]:
def one_hot_endcoding(df, catagoricals):
    '''
     Build one-hot encoded (dummy) columns for the columns listed in catagoricals.

     Input:
     df : pandas.DataFrame()
     catagoricals : list of column names to encode.

     Returns: df_ohe, holding the dummy columns only. The first level of each
     column is dropped (drop_first=True) and each dummy is prefixed with its
     source column name. Columns of df that are not in catagoricals are not
     carried over, and df itself is left untouched.
     '''

    # Work on a categorical-typed copy of just the requested columns
    cat_frame = df[catagoricals].astype('category')

    dummies = pd.get_dummies(cat_frame[catagoricals], prefix=catagoricals,
                             drop_first=True)

    # After the encoded columns are removed only the index is left
    leftover = cat_frame.drop(columns=catagoricals)

    return pd.concat([leftover, dummies], axis=1)

In [10]:
def analysis_plot(df,col,target='SalePrice'):
    '''
     Plot col against our target column on two side-by-side axes.
     First axes: scatter with a fitted regression line.
     Second axes: plain scatter plot.

    Inputs are the DataFrame (df), independent variable (col),
    dependent variable (target).
    Output is the fig and the array of the two axes. '''

    # create a fig with 2 axes; the title belongs to the figure, not to subplots()
    fig, axes = plt.subplots(ncols=2, figsize=(12, 5))
    fig.suptitle(f"{col} vs {target}")

    # First graph is a regression plot. sns.jointplot builds its own figure and
    # cannot draw into an existing Axes, so regplot (the axes-level counterpart of
    # jointplot's kind='reg' centre panel) is used instead.
    sns.regplot(data=df, x=col, y=target, ax=axes[0])
    axes[0].set(xlabel=col, ylabel=target)

    # second plot is a scatter plot, drawn explicitly on the second axes
    axes[1].scatter(col, target, data=df, alpha=0.5)
    axes[1].set(xlabel=col, ylabel=target)

    return fig, axes


# Uses the interactive widgets (ipywidgets) to draw a plot for a column selected from a column list
# from ipywidgets import interact

# @interact
def show_plot(col):
    '''
     Draw the plot for one column by calling plot_col(df, col).

     Meant to be used together with ipywidgets.interact, which supplies col.

     col : the column to plot; passed straight through to plot_col.

     NOTE(review): neither plot_col nor df is defined in the visible part of this
     file. df is read as a global and plot_col must exist at call time - confirm
     where both come from. '''

    plot_col(df,col)

In [1]:
def outliers_z(data, z_cutoff = 3):
    '''
     Flag the outliers of a dataframe with the z-score rule.

     Input:
     data : pandas.DataFrame() of numeric columns.
     z_cutoff (default = 3) : a value is an outlier when its absolute z-score
                  is strictly greater than this.

     Returns: boolean DataFrame with the same index and columns as data,
     True where the value is an outlier.
     '''
    # axis=0 standardises every column against its own mean and std. Scoring along
    # axis=1 would compare a value with the other columns of the same row, which
    # mixes unrelated variables.
    z_calc = np.abs(stats.zscore(data, axis=0))
    z_calc = pd.DataFrame(z_calc, index=data.index, columns=data.columns)

    # NaN scores compare as False, so those cells are never flagged
    return z_calc > z_cutoff

In [None]:
def norm_data():
    '''
     Placeholder for a data normalisation helper.

     No implementation exists yet; the intended inputs and outputs are not
     defined in this file. Raises NotImplementedError when called.
     '''
    raise NotImplementedError('norm_data is not implemented yet')

def scale_data():
    '''
     Placeholder for a data scaling helper.

     No implementation exists yet; the intended inputs and outputs are not
     defined in this file. Raises NotImplementedError when called.
     '''
    raise NotImplementedError('scale_data is not implemented yet')

