## Graph Function ##

*This function lets a user generate graphs from a data file without writing any plotting code.*

In [6]:
def graph(data, column=None, directory=None):
    """Save descriptive plots for the columns of a CSV file.

    For every numerical column (``float64``/``int64``) a histogram
    (``<column>_hist.png``) and a notched box plot (``<column>_box.png``) are
    written. For every other column a bar chart of value counts
    (``<column>_bar.png``) is written.

    Parameters
    ----------
    data : str
        Path of the CSV file to read.
    column : str or list of str, optional
        Column label(s) to plot. When omitted, every column is used.
    directory : str, optional
        Existing folder that receives the .png files. Defaults to the current
        working directory. The working directory itself is never changed.

    Raises
    ------
    FileNotFoundError
        If ``directory`` is given but is not an existing folder.
    """
    import os
    import pandas as pd
    import matplotlib.pyplot as plt

    frame = pd.read_csv(data)
    if column is not None:
        # A bare string would select a Series, which has no select_dtypes().
        if isinstance(column, str):
            column = [column]
        frame = frame[list(column)]

    if directory is None:
        directory = os.getcwd()
    elif not os.path.isdir(directory):
        raise FileNotFoundError("Output directory does not exist: " + str(directory))

    def save_and_close(fig, name, suffix):
        # Write with an explicit path instead of os.chdir() so the caller's
        # working directory is left alone, then release the figure's memory.
        fig.savefig(os.path.join(directory, str(name) + suffix))
        plt.close(fig)

    numeric = frame.select_dtypes(include=['float64', 'int64'])
    categorical = frame.select_dtypes(exclude=['float64', 'int64'])

    for name in numeric.columns:
        label = str(name)

        # Each plot gets its own figure: DataFrame.boxplot draws on the
        # current axes, so without this it would be painted over the histogram.
        fig, ax = plt.subplots()
        numeric.hist(column=name, grid=False, ax=ax)
        ax.set_xlabel(label, fontsize=12)
        ax.set_title("Histogram of " + label, fontsize=16)
        save_and_close(fig, name, '_hist.png')

        fig, ax = plt.subplots()
        numeric.boxplot(column=name, notch=True, grid=False, ax=ax)
        ax.set_xlabel(label, fontsize=12)
        ax.set_title("Box Plot of " + label, fontsize=16)
        save_and_close(fig, name, '_box.png')

    for name in categorical.columns:
        # Number of rows per distinct value of the column.
        counts = categorical.groupby(name).size()
        fig, ax = plt.subplots()
        counts.plot(kind='bar', title=str(name) + ' Distribution', ax=ax)
        save_and_close(fig, name, '_bar.png')

**Improvements**

*This version adds parameters that let the user choose which graphs to plot, creates a new directory in which all the graphs are saved, accepts the column selection as a list, adds a text box of summary statistics to each histogram, and draws correlation plots of the numerical variables with respect to a target variable.*

In [1]:
def _graph_reader_name(path):
    """Return the name of the pandas reader for *path*, or None if unsupported."""
    import os

    readers = {
        'csv': 'read_csv',
        'json': 'read_json',
        'xls': 'read_excel',
        'xlsx': 'read_excel',
        'excel': 'read_excel',   # kept for callers that relied on the old '.excel' check
    }
    # Scan suffixes right to left so names such as 'cars.csv.gz' still resolve.
    suffixes = os.path.basename(path).lower().split('.')[1:]
    for suffix in reversed(suffixes):
        if suffix in readers:
            return readers[suffix]
    return None


def _graph_flag_enabled(value):
    """Interpret an optional yes/no switch; None means 'enabled'."""
    if value is None:
        return True
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.strip().lower() in ('y', 'yes')
    return False


def graph(data, column=None, directory=None, hist=None, box=None, target=None):
    """Save descriptive plots for the columns of a CSV, JSON or Excel file.

    All images are written to a ``Graphs`` folder inside ``directory`` (created
    when missing). The current working directory is never changed.

    Parameters
    ----------
    data : str or path-like
        File to read. The format is chosen from the file extension
        (.csv, .json, .xls/.xlsx). A relative path that does not exist from
        the current working directory is also looked up inside ``directory``.
    column : str or list of str, optional
        Column label(s) to plot. When omitted, every column is used.
    directory : str, optional
        Existing folder that will contain the ``Graphs`` folder. Defaults to
        the current working directory.
    hist, box : optional
        Switches for the histogram and the box plot of each numerical column.
        ``None``, ``True``, ``'y'`` and ``'yes'`` (any letter case) enable the
        plot; any other value disables it.
    target : optional
        Label of a numerical column. When given and present, scatter plots of
        the target against the other numerical columns are saved, five
        columns per image.

    Returns
    -------
    str or None
        ``'File format not supported'`` when the extension is not recognised,
        otherwise ``None``.

    Raises
    ------
    FileNotFoundError
        If the base output directory does not exist.
    """
    import os

    source = os.fspath(data)
    reader_name = _graph_reader_name(source)
    if reader_name is None:
        return('File format not supported')

    # Third-party imports are deferred until the input has been validated.
    import pandas as pd

    if (directory is not None and not os.path.isabs(source)
            and not os.path.exists(source)):
        candidate = os.path.join(directory, source)
        if os.path.exists(candidate):
            source = candidate

    frame = getattr(pd, reader_name)(source)
    if column is not None:
        # A bare string would select a Series, which has no select_dtypes().
        if isinstance(column, str):
            column = [column]
        frame = frame.loc[:, list(column)]

    base_dir = os.getcwd() if directory is None else directory
    if not os.path.isdir(base_dir):
        raise FileNotFoundError("Output directory does not exist: " + str(base_dir))
    out_dir = os.path.join(base_dir, 'Graphs')
    os.makedirs(out_dir, exist_ok=True)

    import matplotlib.pyplot as plt

    def save_and_close(fig, filename):
        # Explicit output path instead of os.chdir(); close to free memory.
        fig.savefig(os.path.join(out_dir, filename))
        plt.close(fig)

    numeric = frame.select_dtypes(include=['float64', 'int64'])
    categorical = frame.select_dtypes(exclude=['float64', 'int64'])
    want_hist = _graph_flag_enabled(hist)
    want_box = _graph_flag_enabled(box)

    for name in numeric.columns:
        label = str(name)
        if want_hist:
            summary = numeric.loc[:, name].describe()
            fig, ax = plt.subplots(figsize=(6, 6))
            numeric.hist(column=name, grid=False, color='coral', ax=ax)
            # Axes-fraction coordinates keep the summary box inside the plot
            # whatever the range of the data is.
            ax.text(0.98, 0.98, str(summary), transform=ax.transAxes,
                    horizontalalignment='right', verticalalignment='top',
                    fontsize=10)
            ax.set_xlabel(label, fontsize=12)
            ax.set_title("Histogram of " + label, fontsize=16)
            save_and_close(fig, label + '_hist.png')
        if want_box:
            # Own figure: DataFrame.boxplot would otherwise draw on the
            # current axes, i.e. on top of the histogram.
            fig, ax = plt.subplots()
            numeric.boxplot(column=name, notch=True, grid=False, ax=ax)
            ax.set_xlabel(label, fontsize=12)
            ax.set_title("Box Plot of " + label, fontsize=16)
            save_and_close(fig, label + '_box.png')

    for name in categorical.columns:
        # Number of rows per distinct value of the column.
        counts = categorical.groupby(name).size()
        fig, ax = plt.subplots()
        counts.plot(kind='bar', title=str(name) + ' Distribution', ax=ax)
        save_and_close(fig, str(name) + '_bar.png')

    if target is not None and target in numeric.columns:
        import seaborn as sns

        predictors = [name for name in numeric.columns if name != target]
        chunks = [predictors[start:start + 5]
                  for start in range(0, len(predictors), 5)]
        for index, chunk in enumerate(chunks, start=1):
            grid = sns.pairplot(numeric, y_vars=[target], x_vars=chunk)
            # One image keeps the historical file name; several images are
            # numbered so that they no longer overwrite each other.
            if len(chunks) == 1:
                filename = 'Correlation plots.png'
            else:
                filename = 'Correlation plots_' + str(index) + '.png'
            grid.savefig(os.path.join(out_dir, filename))
            # Older seaborn releases expose the figure as .fig only.
            plt.close(grid.figure if hasattr(grid, 'figure') else grid.fig)

In [2]:
# Example run: chart the cars data set, using MPG as the target variable.
graph(
    'cars.csv',
    directory="C:\\Users\\Sree Soundarya\\Desktop\\machine learning",
    target='MPG',
)