# User Defined Functions
***
This script contains reusable functions for data science (read the sub-section of each function for detailed information):
> 
**```Key Highlights ```**

    0. Loading basic libraries  
    1. Functions for 
        * Read dataset based on file extensions
        * Correlation Matrix for categorical features
        * Correlation Matrix for all features present in dataset

**``` Step0: Load basic libraries ```**

In [1]:
#To make cells interactive
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
import re
import random
import pandas as pd
pd.set_option("display.max_columns",100)
import numpy as np
import seaborn as sns
import sklearn as sk
import scipy.stats as ss
import itertools
import math

from datetime import datetime, timedelta
from pandas import ExcelWriter

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

**``` Function 1: Read Dataset ```**

In [2]:
def read_pandas(file_name):
    """Read a DataFrame, choosing the pandas reader from the file extension.

    This function is meant for files in a standard format. Supported extensions:
    .csv, .json, .jsonl, .dta, .tsv, .xls, .xlsx, .hdf, .h5, .sas7bdat, .xpt,
    .parquet, .pkl, .pickle. Any other extension triggers a warning and the
    file is read as CSV.

    Args:
        file_name: the file to read (string or path-like object)

    Returns:
        DataFrame
    Eg: type1-
        path= "C:/users/data input/"
        filename= "Sample_data.csv"
        df = read_pandas(f'{path}{filename}')

        type2-
        filename= "C:/users/data input/Sample_data.csv"
        df = read_pandas(f'{filename}')

        type3-
        filename= "C:/users/data input/Sample_data.csv"
        df = read_pandas(filename)

    Notes:
        This function is based on pandas IO tools:
        https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
        https://pandas.pydata.org/pandas-docs/stable/reference/io.html

        This function is not intended to be flexible or complete. The main use case is to be able to read files
        without user input. For more advanced use cases, the user should load the DataFrame in code.
    """
    import warnings
    from pathlib import Path

    import pandas as pd

    source = str(file_name)
    extension = Path(source).suffix.lower()

    if extension == ".json":
        return pd.read_json(source)
    if extension == ".jsonl":
        return pd.read_json(source, lines=True)
    if extension == ".dta":
        return pd.read_stata(source)
    if extension == ".tsv":
        return pd.read_csv(source, sep="\t")
    if extension in (".xls", ".xlsx"):
        return pd.read_excel(source)
    if extension in (".hdf", ".h5"):
        return pd.read_hdf(source)
    if extension in (".sas7bdat", ".xpt"):
        return pd.read_sas(source)
    if extension == ".parquet":
        return pd.read_parquet(source)
    if extension in (".pkl", ".pickle"):
        # SECURITY: unpickling can execute arbitrary code; only load pickle files from trusted sources.
        return pd.read_pickle(source)

    if extension != ".csv":
        # Unknown extension: tell the user we are guessing CSV instead of failing.
        warnings.warn(
            f"Unrecognised file extension '{extension}'; attempting to read the file as CSV."
        )
    return pd.read_csv(source)

**``` Function 2: Reorder Columns in PD DataFrame ```**

In [3]:
def reorderColumns(df,neworder):
    """ Move the requested columns to the front of a DataFrame.
    args:
        df- dataframe
        neworder- list/vector of column names that should come first
    Returns:
        DataFrame with the columns of `neworder` first, followed by the
        remaining columns in their existing order
    E.g.:
        df= reorderColumns(df,['Index','TableID'])
    """
    leading = list(neworder)
    # Everything not explicitly requested keeps its current relative position.
    trailing = [name for name in df.columns.values if name not in leading]
    return df[leading + trailing]

**``` Function 3: Correlation Matrix only for categorical features ```**

In [4]:
#Function to calculate correlation between categorical variables

def cramers_v1(confusion_matrix):
    """ Cramers V statistic for categorical-categorical association.

        Applies the bias correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328

    args:
        confusion_matrix - 2-D contingency table of counts (must provide
                           `.sum()` and `.shape`, e.g. a NumPy array)
    Returns:
        the bias-corrected Cramers V value
    """
    chi_square = ss.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum()
    n_rows, n_cols = confusion_matrix.shape

    # Bias-corrected phi squared, floored at zero.
    bias = ((n_cols - 1) * (n_rows - 1)) / (total - 1)
    phi_sq_adj = max(0, chi_square / total - bias)

    # Bias-corrected table dimensions.
    rows_adj = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_adj = n_cols - ((n_cols - 1) ** 2) / (total - 1)

    smallest_dim = min((cols_adj - 1), (rows_adj - 1))
    return np.sqrt(phi_sq_adj / smallest_dim)


#Generate correlation matrix using cramers_v1 function
def cat_correl_matrix(df, col_list = None):
    """ Build a correlation matrix of categorical features using the Cramers V statistic (cramers_v1).

    Only the lower triangle (row index > column index) is filled, with values
    expressed as percentages rounded to 2 decimals; all other cells stay ''.

    args:
        df - dataframe
        col_list - variable list for which correlation is to be calculated;
                   when None, every column of dtype 'category' is used
    Returns:
        DataFrame with the pairwise values, or None (after printing a hint)
        when there are no columns to process
    E.g.
        cat_correl_matrix(data,['var1','var2'])
                        or
        cat_correl_matrix(data)

    """
    if col_list is None:
        # select features for which correlations need to be calculated
        cat_col = df.select_dtypes(['category']).columns
    else:
        cat_col = col_list

    if len(cat_col) == 0:
        print('* Categoical columns are not present in input dataset.'+ str('\n')+
              '* Please change datatypes to categorical for required features')
        return None

    # Object dtype so that float results can be stored next to the '' placeholders.
    correl_mat = pd.DataFrame(data='', index=cat_col, columns=cat_col, dtype=object)

    # calculating correlation matrix (lower triangle only)
    for i in range(len(cat_col)):
        for j in range(i):
            # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
            confusion_matrix = pd.crosstab(df[cat_col[i]], df[cat_col[j]]).values
            correl_mat.iloc[i, j] = round(100*cramers_v1(confusion_matrix), 2)

    # Output
    print("Correlation Matrix of categorical variables are:-")
    return correl_mat

**``` Function 4: Correlation Matrix and heatmap for all features in dataset ```**

In [5]:
#Correlation Matrix and heatmap

def convert(data, to):
    """Convert `data` to the container type named by `to`.

    Supported conversions:
        to='array'     - from NumPy ndarray, Pandas Series, list or Pandas DataFrame
        to='list'      - from list, Pandas Series or NumPy ndarray
        to='dataframe' - from Pandas DataFrame or NumPy ndarray

    :param data: the object to convert
    :param to: string, one of 'array', 'list', 'dataframe'
    :return: the converted object (the input itself when it already has the target type)
    :raises ValueError: if `to` is not one of the supported targets
    :raises TypeError: if `data` cannot be converted to the requested target
    """
    converted = None
    if to == 'array':
        if isinstance(data, np.ndarray):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
        elif isinstance(data, pd.DataFrame):
            # DataFrame.as_matrix() was removed in pandas 1.0; `.values` is the equivalent.
            converted = data.values
    elif to == 'list':
        if isinstance(data, list):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == 'dataframe':
        if isinstance(data, pd.DataFrame):
            converted = data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))

    if converted is None:
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data),to))
    return converted
    
def conditional_entropy(x, y):
    """
    Calculates the conditional entropy of x given y: S(x|y)
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
    The natural logarithm is used.
    :param x: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :return: float
    """
    # Imported locally: the file's import cell does not import Counter.
    from collections import Counter

    y_counter = Counter(y)
    xy_counter = Counter(zip(x, y))
    total_occurrences = sum(y_counter.values())

    entropy = 0.0
    for (_, y_value), joint_count in xy_counter.items():
        p_xy = joint_count / total_occurrences
        p_y = y_counter[y_value] / total_occurrences
        entropy += p_xy * math.log(p_y / p_xy)
    return entropy

def cramers_v(x, y):
    """
    Calculates the bias-corrected Cramer's V statistic between two
    categorical sequences, from their cross-tabulation.
    :param x: sequence of categorical measurements accepted by pd.crosstab
    :param y: sequence of categorical measurements accepted by pd.crosstab
    :return: float
    """
    contingency = pd.crosstab(x, y)
    chi_square = ss.chi2_contingency(contingency)[0]
    total = contingency.sum().sum()
    n_rows, n_cols = contingency.shape

    # Bias-corrected phi squared, floored at zero.
    bias = ((n_cols - 1) * (n_rows - 1)) / (total - 1)
    phi_sq_adj = max(0, chi_square / total - bias)

    # Bias-corrected table dimensions.
    rows_adj = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_adj = n_cols - ((n_cols - 1) ** 2) / (total - 1)

    smallest_dim = min((cols_adj - 1), (rows_adj - 1))
    return np.sqrt(phi_sq_adj / smallest_dim)

def theils_u(x, y):
    """
    Calculates Theil's U of x given y: (S(x) - S(x|y)) / S(x), where S(x) is
    the entropy of x and S(x|y) comes from conditional_entropy(x, y).
    The measure is asymmetric: theils_u(x, y) and theils_u(y, x) generally differ.
    :param x: list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    :return: float (1 when x has zero entropy)
    """
    # Imported locally: the file's import cell does not import Counter.
    from collections import Counter

    s_xy = conditional_entropy(x, y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = [count / total_occurrences for count in x_counter.values()]
    s_x = ss.entropy(p_x)
    if s_x == 0:
        # x takes a single value: return 1 rather than dividing by zero.
        return 1
    return (s_x - s_xy) / s_x

def correlation_ratio(categories, measurements):
    """
    Calculates the association between a categorical and a continuous
    sequence as the between-category sum of squares divided by the total
    sum of squares. No square root is applied to that ratio.
    :param categories: sequence of categorical measurements accepted by pd.factorize
    :param measurements: list / NumPy ndarray / Pandas Series
        A sequence of continuous measurements, aligned by position with `categories`
    :return: float (0.0 when the between-category sum of squares is zero)
    """
    fcat, _ = pd.factorize(categories)
    # Use a plain array so the selection below is positional; indexing a Series
    # with integer positions is label-based and breaks on a non-default index.
    values = np.asarray(measurements)

    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(cat_num):
        cat_measures = values[fcat == i]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)

    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(values, y_total_avg), 2))
    if numerator == 0:
        # Also avoids 0/0 when every measurement is identical.
        return 0.0
    return numerator / denominator

def associations(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                          return_results = False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in data-set with both categorical and
    continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases
    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
        columns are categorical, or None (default) to state none are categorical. Any other string is treated as
        the name of a single categorical column
    :param mark_columns: Boolean (default: False)
        if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their type (nominal or
        continuous), as provided by nominal_columns
    :param theil_u: Boolean (default: False)
        In the case of categorical-categorical features, use Theil's U instead of Cramer's V. Theil's U is
        asymmetric: the cell at row r, column c holds theils_u(r, c)
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed associations
    :param kwargs:
        Optional plotting arguments: 'figsize' (default (20, 20)), 'annot' (default True), 'fmt' (default '.2f')
    :return: Pandas DataFrame (only when return_results is True)
        A DataFrame of the correlation/strength-of-association between all features
    :E.g.
    results = associations(df,nominal_columns='all',return_results=True)
    """
    dataset = convert(dataset, 'dataframe')
    columns = dataset.columns

    if nominal_columns is None:
        nominal_columns = []
    elif isinstance(nominal_columns, str):
        # Test for str first: comparing a list/ndarray to 'all' with == is ambiguous.
        nominal_columns = columns if nominal_columns == 'all' else [nominal_columns]
    nominal = set(nominal_columns)

    # Float frame filled through .loc; chained `corr[a][b] = v` assignment does
    # not reliably write into the frame (no-op under pandas copy-on-write).
    corr = pd.DataFrame(index=columns, columns=columns, dtype=float)
    for i, col_i in enumerate(columns):
        corr.loc[col_i, col_i] = 1.0
        for col_j in columns[i + 1:]:
            i_nominal = col_i in nominal
            j_nominal = col_j in nominal
            if i_nominal and j_nominal:
                if theil_u:
                    # Asymmetric measure: fill the two mirrored cells separately.
                    corr.loc[col_i, col_j] = theils_u(dataset[col_i], dataset[col_j])
                    corr.loc[col_j, col_i] = theils_u(dataset[col_j], dataset[col_i])
                    continue
                cell = cramers_v(dataset[col_i], dataset[col_j])
            elif i_nominal:
                cell = correlation_ratio(dataset[col_i], dataset[col_j])
            elif j_nominal:
                cell = correlation_ratio(dataset[col_j], dataset[col_i])
            else:
                cell, _ = ss.pearsonr(dataset[col_i], dataset[col_j])
            corr.loc[col_i, col_j] = cell
            corr.loc[col_j, col_i] = cell

    if mark_columns:
        marked_columns = ['{} (nom)'.format(col) if col in nominal else '{} (con)'.format(col) for col in columns]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        plt.figure(figsize=kwargs.get('figsize', (20, 20)))
        sns.heatmap(corr, annot=kwargs.get('annot',True), fmt=kwargs.get('fmt','.2f'), cmap='coolwarm')
        plt.show()
    if return_results:
        return corr


**``` Function 5: Change integer into ordinal format ```**

In [6]:
#Ordinal numbers replacement

import math


def ordinal(n):
    """
    Convert an integer into its English ordinal string.
    --args: integer number
    Eg:
        syntax- ordinal(10)
        output- '10th'
        or
        syntax- print([ordinal(n) for n in range(1,5)])
        output- ['1st', '2nd', '3rd', '4th']
    """
    last_digit = n % 10
    # Tens digit of 1 (11, 12, 13, 111, ...) always takes 'th'. Integer division is
    # used because float division loses precision for very large integers.
    is_teen = (n // 10) % 10 == 1
    offset = 0 if is_teen or last_digit >= 4 else last_digit
    # "tsnrhtdd" interleaves the suffixes: every 4th character starting at
    # offset 0/1/2/3 spells 'th'/'st'/'nd'/'rd'.
    return "%d%s" % (n, "tsnrhtdd"[offset::4])

"\nThis function convert integer into it's ordinal value\n--dependencie import math\n--args: integer number\nEg: \n    syntax- ordinal(10)\n    output- '10th'\n    or\n    syntax- print([ordinal(n) for n in range(1,5)]) #for \n    output- ['1st', '2nd', '3rd', '4th']\n"

**``` Function 6: Generate Date for a period ```**

In [7]:
def dateGenerator (startDate,endDate,freq='D',missingDays=None):
    """
    Generate the dates of a period, optionally leaving out given weekdays.
    args:
        startDate- first date of the period
        endDate- last date of the period
        freq- interval between generated dates, default 'D' for daily
        missingDays - list of weekday names (as produced by strftime('%A'),
                      e.g. 'Saturday') to drop from the result
    Returns:
        the result of pd.date_range for the period, without the dates that
        fall on one of `missingDays`
    eg:
        dateGenerator('1-1-2017','12-31-2018',missingDays=['Saturday'])
                                        or
        dateGenerator('1-1-2017','12-31-2018')
    """
    all_dates = pd.date_range(start=startDate, end=endDate, freq=freq)

    if missingDays is not None:
        weekday_names = all_dates.strftime('%A')
        unwanted = weekday_names.isin(missingDays)
        return all_dates[~unwanted]

    return all_dates

**``` Function 7: Basic Distribution Bar Graph ```**

In [8]:
def barGraph(df,path):
    """
    Creates one bar graph per column of a dataset, showing the share of rows
    taken by each distinct value, and saves it as '<column>.png'.
    args:
        df - dataFrame
        path- directory where the graphs will be saved
    """
    for column in df:
        shares = df[column].value_counts() / len(df)
        # Evenly spaced bar positions so ticks and bars line up for any value type.
        positions = np.arange(len(shares))

        # create a figure and axis
        fig, ax = plt.subplots(figsize=(8, 4))
        bars = ax.bar(positions, shares.values, width=0.4, color='blue')
        ax.set_title(str(column), y=0.9)
        ax.set_ylim([0, 1])
        ax.set_ylabel('Distribution %')
        ax.set_xticks(positions)
        ax.set_xticklabels([str(label) for label in shares.index])

        # Annotate each bar with its percentage (reuse the bars already drawn).
        for rect in bars:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width()/2.0, height, f'{np.round(100*height,1)}%',
                    ha='center', va='bottom', fontsize=10)

        # os.path.join keeps the output location portable across operating systems.
        fig.savefig(os.path.join(path, f'{column}.png'), dpi=300, format='png', bbox_inches='tight')
        # Close the figure so that many columns do not accumulate open figures.
        plt.close(fig)

**``` Function 8: Paste equivalent of R ```**

In [9]:
def paste(List, sep=''):
    """
    Creates a str object by joining the string form of every element.
    args:
        List - list, range or any other iterable
        sep- separator placed between consecutive elements
    Returns:
        str ('' for an empty input)
    e.g:
        Input- l=['a','b','c','d','e']
        Syntax- paste(l)
        Output- 'abcde'

    """
    # f-strings keep the same formatting as before for elements and separator.
    return f'{sep}'.join(f'{item}' for item in List)

**``` Function 9: Data Frame columns types and Null % view ```**

In [13]:
def NullSummary(df):
    """
    Gives info on column types and number of null values.
    Prints the dataframe dimensions as a side effect.
    args:
        dataFrame
    Returns:
        DataFrame with one column per input column and three rows:
        'column type', 'missing values' and 'null values (%)'
    """
    print('Dataframe dimensions:', df.shape)
    null_counts = df.isnull().sum()

    column_types = pd.DataFrame(df.dtypes).T.rename(index={0:'column type'})
    missing = pd.DataFrame(null_counts).T.rename(index={0:'missing values'})
    missing_pct = pd.DataFrame(null_counts/df.shape[0]*100).T.rename(index={0:'null values (%)'})

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([column_types, missing, missing_pct])

**``` Function 10: summary of groupby object ```**

In [None]:
def getGrpStats(group):
    """
    Collect basic statistics (min, max, count, mean, sum) of a group.
    args:
        group - object exposing min(), max(), count(), mean() and sum(),
                e.g. one group of a groupby
    Returns:
        dict keyed by 'min', 'max', 'count', 'mean', 'sum'
    syntax:
        global_stats = df['AMOUNT'].groupby(df['ORG_NAME']).apply(getGrpStats).unstack()
    """
    stat_names = ('min', 'max', 'count', 'mean', 'sum')
    return {stat: getattr(group, stat)() for stat in stat_names}

**``` Function 11: Descriptive analysis (profile report) of a DataFrame ```**

In [None]:
# Descriptive analysis of a DataFrame using pandas_profiling.
# NOTE(review): `df` is not defined anywhere in the visible file; presumably it has to be
# created beforehand (e.g. with read_pandas) - confirm before running this cell.
import pandas_profiling
# Build a profile report with the 'full_width' style option.
df.profile_report(style={'full_width':True}) #output in notebook
# Build a second report and write it to dfProfileReport.html.
df.profile_report().to_file(output_file="dfProfileReport.html")