# Statsplots is a notebook for figures

[Formatting Data](#Formatting-Data)

[CDF Plot](#CDF-Plot)

[One vs All Plot](#One-vs-All-Plot)

[Bokeh Plot](#Bokeh-Plot)

[Model Validation](#Model-Validation)

## Formatting Data

<a id='Formatting Data'></a>

In [1]:
import numpy as np  # Math library
import pandas as pd  # Table library
import matplotlib.pyplot as plt  # Plotting library
import pandas as pd  # Repeats the pandas import above; redundant but harmless
import warnings
import seaborn as sns  # Plotting library
warnings.filterwarnings('ignore')  # Silences ALL Python warnings (including deprecation notices) from here on
%matplotlib nbagg

## One vs All Plot

<a id='One vs All Plot'></a>

In [None]:
def onevsallPlot(dataf, target):
    '''Scatter-plot every numeric feature of a dataframe against one column.

    This is like the seaborn pairgrid plot (all vs all) except that it
    only plots the features against a single feature (eg. target column).

    Parameters
    ----------
    dataf : pandas.DataFrame
        Table holding the features and the target column.
    target : column label
        Column drawn on the y-axis of every subplot. It must survive the
        numeric filtering step below.

    Returns
    -------
    None. A new 10x6 matplotlib figure is created with one subplot per
    remaining feature, laid out on a grid that is 5 columns wide.

    Raises
    ------
    KeyError
        If `target` is not one of the numeric columns of `dataf`.
    '''
    #We will first remove non-numeric columns from our dataframe
    dataNum= dataf._get_numeric_data()

    #Fail early with a readable message instead of a bare KeyError
    #from deep inside the plotting loop
    if target not in dataNum.columns:
        raise KeyError('target column {!r} is not a numeric column of the '
                       'dataframe'.format(target))

    #Get remaining column names (boolean mask on the column index;
    #DataFrame.ix was removed in pandas 1.0)
    feats= dataNum.columns[dataNum.columns != target]

    #Get # of features to specify # of plots to generate
    numFeats= len(feats)

    #Using 5 columns since that's max visible in cell at 10x6 dimension
    gridCols= 5

    #Ceiling division: just enough rows to hold every plot, never an
    #extra empty row, and at least one row so the grid is always valid
    gridRows= max(1, -(-numFeats // gridCols))

    #Create an empty figure; subplots are added one by one below so no
    #stray full-size axes is left underneath the grid
    fig= plt.figure(figsize=(10,6))

    #Iterate through each feature. Subplot positions are 1-based, the
    #features themselves are not, so enumerate from 1 to pair them up
    #without skipping the first feature
    for position, feat in enumerate(feats, start=1):
        #Specify index of subplot in gridRows x gridCols grid
        ax= fig.add_subplot(gridRows, gridCols, position)

        #Plot feature vs target
        ax.scatter(dataNum[feat], dataNum[target],
                   marker= '.', color= '#0099ff')

        #Set title (format() also copes with non-string column labels)
        ax.set_title('{} vs {}'.format(feat, target),
                     fontsize= 10, fontweight= 'bold')

        #Set axes, limit ticks
        ax.locator_params(axis='x', nbins=4)
        ax.locator_params(axis='y', nbins=4)

    #Adjust spacing once, after all subplots exist
    if numFeats:
        fig.tight_layout()

## CDF Plot

<a id='CDF Plot'></a>

In [2]:
def cdfPlot(numArray, title='Change Title', xlabel='Your Label Here',
            ylabel='Your Label Here'):
    '''Takes in an array and creates a cumulative
    distribution function (CDF) plot.

    The x-axis carries the cumulative fraction of observations
    (0, 1/n, ..., (n-1)/n) and the y-axis carries the sorted values.

    Parameters
    ----------
    numArray : sized iterable of sortable numbers
        Observations to plot. Must not be empty.
    title, xlabel, ylabel : str, optional
        Figure title and axis labels. The defaults are the placeholder
        texts this function has always drawn.

    Returns
    -------
    None. A new 5x5 matplotlib figure is created.

    Raises
    ------
    ValueError
        If `numArray` is empty.
    '''
    #Sort array, create regular intervals for indices
    sortArray= sorted(numArray)
    numPoints= len(sortArray)
    if numPoints == 0:
        raise ValueError('cdfPlot needs at least one value to plot')

    #Integer range divided by n always has exactly n entries; a float
    #step in np.arange can round to n+1 entries (e.g. n=49), which made
    #the plot call fail on mismatched x/y lengths
    scaledArray= np.arange(numPoints) / float(numPoints)

    #Specify fig dimensions and style, then plot. The style is applied
    #before the figure is created so it takes full effect, and inside a
    #context manager so it does not leak into every later plot
    with plt.style.context('fivethirtyeight'):
        fig, ax = plt.subplots(1,1, figsize=(5,5))
        ax.plot(scaledArray, sortArray, color= '#0099ff')

        #Customize appearance of plot
        ax.set_title(title, fontsize= 21, fontweight= 'bold')

        ax.set_xlabel(xlabel, fontsize=17, fontweight= 'bold')
        ax.set_ylabel(ylabel, fontsize=17, fontweight= 'bold')

        #Adjust size and color of x,y tick labels
        ax.tick_params(axis='both', which='major', labelsize=13,
                       labelcolor= '#7c7e82')

## Bokeh Plot
<a id='Bokeh Plot'></a>

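The cell below builds an interactive Bokeh scatter plot. It expects `pcaFeats`, `cluster0`–`cluster2`, `indexCluster0`–`indexCluster2`, `total_goals_for`, `datanumLabels` and `team_names` to already be defined in the kernel; without them it stops with a `NameError`.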

In [3]:
from bokeh.plotting import figure, output_notebook, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models import NumeralTickFormatter

#Specify that output will be in a cell and not in a separate html file
output_notebook()

#Prerequisites: pcaFeats, cluster0..cluster2, indexCluster0..indexCluster2,
#total_goals_for, datanumLabels and team_names must already exist in the
#kernel. None of them is created in this part of the notebook, and the
#cell stops with a NameError if any is missing.

#Specify where the plot will fetch data from when it plots/hovers
#You can change this as you need to with as many key:value pairs
#in data dict as you would like
#NOTE(review): every column lives in one ColumnDataSource. If the
#per-cluster arrays are shorter than pcaFeats, Bokeh warns about
#inconsistent column lengths, and '@title' is presumably looked up by
#each glyph's own row position -- confirm hover labels show the right team.
source= ColumnDataSource(data= dict(
        x= pcaFeats[:,0], y= pcaFeats[:,1],
        x0= cluster0[:,0], y0= cluster0[:,1],
        x1= cluster1[:,0], y1= cluster1[:,1],
        x2= cluster2[:,0], y2= cluster2[:,1],
        marker_size0= total_goals_for[indexCluster0]/1.5,
        marker_size1= total_goals_for[indexCluster1]/1.5,
        marker_size2= total_goals_for[indexCluster2]/1.5,
        goals= datanumLabels['total_goals_for'].values,
        title= team_names))

#Specify the Bokeh tools you'll use, here we use HoverTool
#and tooltips to specify 'Team' label and source key it
#refers to
hover= HoverTool(tooltips= [("Team", " @title")])

#Initialize figure object along with attributes and tools that will be available
p = figure(width=800, height=600, tools=[hover, 'wheel_zoom', 'pan', 'reset'],
           title= "Clustering of FIFA Soccer Clubs (Scaled by Goals Scored)")

#Title styling is set on the Title object; the old title_text_* shortcuts
#on the figure were removed from Bokeh
p.title.text_font_size = '18pt'
p.title.text_font_style = "bold"

#Specify plots and plot attributes. Notice that x,y refer back to
#the source['variable'] that we specified earlier
p.circle('x0', 'y0', line_color= '#f2a02e', fill_color= '#f2a02e', fill_alpha= 0.3, source=source,
                           line_width= 4, size= 'marker_size0')
p.circle('x1', 'y1', line_color= '#28eded', fill_color= '#28eded', fill_alpha= 0.3, source=source,
                           line_width= 4, size='marker_size1')
p.circle('x2', 'y2', line_color= '#54d127', fill_color= '#54d127', fill_alpha= 0.3, source=source,
                           line_width= 4, size='marker_size2')

#Another overlayed plot and additional style attributes
p.circle(pcaFeats[:,0], pcaFeats[:,1], color= 'blue', size=3)

#Show axis ticks as whole numbers
p.xaxis[0].formatter = NumeralTickFormatter(format="0")
p.yaxis[0].formatter = NumeralTickFormatter(format="0")

p.xaxis.axis_label = "Principal Component 1"
p.yaxis.axis_label = "Principal Component 2"
p.xaxis.axis_label_text_font_size = "12pt"
p.yaxis.axis_label_text_font_size = "12pt"

show(p)

NameError: name 'pcaFeats' is not defined

## Formatting Data

<a id='Formatting Data'></a>