# **Import Libraries**

## Add Libraries

In [None]:
# Import OS System
import os

# import MySQl Libaries
import mysql.connector

# Import Data Libaries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

#Magic Commands
%matplotlib inline


> Add more imports above depending on the nature of the analysis that needs to be done.


---



### Magic Functions

- Use these functions with or throughout your data as needed.
- These are not dunder methods.
- For the full list of magic commands, visit this link:
https://ipython.readthedocs.io/en/stable/interactive/magics.html


In [None]:

# Define an alias for a system command.
%alias
%alias_magic

#Make functions callable without having to type parentheses.
%autocall

# Make magic functions callable without having to type the initial %.
%automagic

# Load numpy and matplotlib to work interactively.
%pylab

# Set environment variables. Assumptions are that either “val” is a name in the 
# user namespace, or val is something that evaluates to a string.
%set_env

# Print the last traceback.
%tb

Time execution of a Python statement or expression.
%time

# **Connecting Data**

- Import data from different sources
- Table data into a dataframe
- CSV data
- Text File Data
- JSON Data
- Excel Data
- MySQL Data
- XML Data

> *Expand section to use for analysis; Only run what you need*

---



### **Connect CSV file**
- Use this method to handle raw csv files.

**Import CSV With Pandas**

In [None]:
# Read a CSV file into a pandas DataFrame - only needed when your source data is a CSV file.
# Replace 'YOUR_FILE' with the path to the CSV file.
df=pd.read_csv(r'YOUR_FILE')

**Import CSV with Python Array**

In [None]:
# Read a CSV file row by row with the standard library.
# Each row is printed as a dict keyed by the names in the header line.
import csv

# The csv module expects a text-mode file opened with newline='' in Python 3;
# the context manager closes the file even if reading fails part-way.
with open('YOUR_FILE', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row)


### **Connect Text File**
- Use the method below to handle raw text files.

In [None]:
# Connect a raw text file (replace the file path info below)
base_path = "/path/to/directory/holding/file/"
filename = "YOUR_FILE"
path_to_file = os.path.join(base_path, filename)

# Read the content inside a context manager so the file handle is always closed.
# `fd` is the (closed) file object afterwards; `text_data` holds the file content.
with open(path_to_file , 'r') as fd:
    text_data = fd.read()

### **Connect to JSON Data**

**Import JSON With Pandas**
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html

In [None]:
# Read a JSON file into a pandas DataFrame; replace 'YOUR_FILE' with the path to the JSON file.
df=pd.read_json(r'YOUR_FILE')

**Import JSON With Python**

In [None]:
# Load a JSON document with the standard library and print each top-level item.
import json

# The context manager guarantees the file is closed after parsing.
with open('data-text.json') as json_file:
    json_data = json_file.read()
data = json.loads(json_data)
for item in data:
    print(item)

## **Connect Excel Data**

**Import Excel With Pandas**
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

In [None]:
# Read an Excel file into a DataFrame; index_col=0 uses the first column as the row index.
df = pd.read_excel('YOUR_FILE', index_col=0)  

**Import Excel With Python & Numpy**

In [None]:
# Open the workbook with openpyxl; read_only=True is passed to load_workbook.
from openpyxl import load_workbook
wb = load_workbook('YOUR_FILE', read_only=True)

In [None]:
# Convert one worksheet column to a NumPy array.
# Index the workbook by sheet name; Workbook.get_sheet_by_name() is deprecated in openpyxl.
ws = wb['Sheet1']
use_col = 0
# column index from each row to get value of
x2 = np.array([r[use_col].value for r in ws.iter_rows()])

## **Connect XML Data**

**Import XML with Pandas**

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the file directly: ET.parse opens and closes the file itself,
# so no file handle is left open.
root = ET.parse('properties.xml').getroot()

# For every child of the root, collect the text of its sub-elements (one list per child)
# and remember the child's tag to use as the column name.
data = []
cols = []
for child in root:
    data.append([subchild.text for subchild in child])
    cols.append(child.tag)

df = pd.DataFrame(data).T  # Write in DF and transpose it so each child becomes a column
df.columns = cols  # Update column names
print(df) # Check contents of Data frame

**Import XML Data With Python & Numpy**

In [None]:
# Parse the XML file and locate the <Data> element whose children are processed in the next cell.
from xml.etree import ElementTree as ET
tree = ET.parse('YOUR_XML_FILE')
root = tree.getroot()
data = root.find('Data')
# Accumulator for the per-observation dictionaries built in the next cell.
all_data = []

**Transform the XML tree to a dictionary**

In [None]:
# Flatten every observation element into a dict and collect the dicts in all_data.
for observation in data:
    record = {}
    for item in observation:

        # dict views are not subscriptable in Python 3, so take the first
        # attribute name through an iterator instead of keys()[0].
        lookup_key = next(iter(item.attrib))

        if lookup_key == 'Numeric':
            rec_key = 'NUMERIC'
            rec_value = item.attrib['Numeric']
        else:
            rec_key = item.attrib[lookup_key]
            rec_value = item.attrib['Code']

        record[rec_key] = rec_value

    all_data.append(record)
print(all_data)

## **Connecting to MySQL DB**


*Connect to either MySQL Workbench or Azure MSSM platform to work with data sets directly on the cloud*

In [None]:
# Import a MySQL table into a dataframe
import os
import mysql.connector as sql
import pandas as pd

# Connection settings are read from environment variables so that no password
# is stored in the notebook. MYSQL_PASSWORD must be set before running this cell.
db_connection = sql.connect(
    host=os.environ.get('MYSQL_HOST', 'localhost'),
    database=os.environ.get('MYSQL_DATABASE', 'DATABASE_NAME_URI'),
    user=os.environ.get('MYSQL_USER', 'root'),
    password=os.environ['MYSQL_PASSWORD'],
)

# Option 1: run the query through a cursor and build the DataFrame manually.
db_cursor = db_connection.cursor()

db_cursor.execute('SELECT * FROM mod3')

table_rows = db_cursor.fetchall()

# Use the cursor's column names so the frame is not labelled 0..n-1.
df = pd.DataFrame(table_rows, columns=db_cursor.column_names)
db_cursor.close()

# Option 2: let pandas run the query (this replaces the df built above).
df = pd.read_sql('SELECT * FROM mod3', con=db_connection)

- Imports - import the libraries needed for data processing
- Connection - connect to a local or hosted server
- Cursor - The MySQLCursor class instantiates objects that can execute operations such as SQL statements. 
- Execute Cursor - execute SQL code to access DB schemas/tables
- FetchAll() - get all rows of the current result set
- DataFrame - add the MySQL table rows to a Python dataframe
- Read_sql - select a database table directly into a dataframe for inferences

-- https://dev.mysql.com/doc/connector-python/en/connector-python-api-mysqlconnection.html

## **Check Dataframe**

In [None]:
# Print the first 5 rows of the imported data. Increase the number to show more rows.

print(df.head(5))

# **Data Check**

- Use these tools to check and verify your dataframe is stable
- Describe data
- Check data type
- Test data frame

## **Describe Data**

In [None]:
# Quick summary statistics for the columns of the dataframe
df.describe()

In [None]:
# Describe a single column (replace YOUR_COLUMN with the name of the column to summarize)
print(df.YOUR_COLUMN.describe())

## **Check Data Types**

In [None]:
# Check the data type of every column
df.dtypes

In [None]:
# Check the type of the object itself (e.g. a pandas DataFrame);
# note this does not list the per-column dtypes - use df.dtypes for that.
print(type(df))

**Pandas dtype mapping:**

| Pandas dtype | Python type | NumPy type | Usage |
| --- | --- | --- | --- |
| object | str or mixed | string\_, unicode\_, mixed types | Text or mixed numeric and non-numeric values |
| int64 | int | int\_, int8, int16, int32, int64, uint8, uint16, uint32, uint64 | Integer numbers |
| float64 | float | float\_, float16, float32, float64 | Floating point numbers |
| bool | bool | bool\_ | True/False values |
| datetime64 | NA | datetime64[ns] | Date and time values |
| timedelta64[ns] | NA | NA | Differences between two datetimes |
| category | NA | NA | Finite list of text values |

## **Test Dataframe**

In [None]:
# Get summary info on the core dataframe to see what you're dealing with
df.info()

## **Split Screen Comparison**
- Un-comment the code below and run it to show a side-by-side analysis of 2 dataframes, calculations, or data...
- Run the test cell before using it to see the effect
- Use 2 dataframes or 2 parts of a dataframe to compare.

In [None]:

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Shows both outputs - but only when InteractiveShell.ast_node_interactivity
# has been set to "all" (see the commented-out cell above); otherwise only
# the value of the last expression is displayed.
print("Example:")
10*5
45-51


Example:


50

-6

# **Pre-processing**

- Find null & missing values
- Drop missing values
- Fill in missing values
- Replace values
- Replace duplicate values
- Pre-process for modeling and test set (advanced)
- Imputation (advanced)

## **Find NULL and Missing Values**

In [None]:
# Count all null values in the whole dataframe (summed over every column and row)
print(df.isnull().sum().sum())


In [None]:
# Count the null values in each column
print(df.isnull().sum())

In [None]:
# Show a boolean frame that is True wherever a value is null
print(df.isnull())


**Resources to convert and work with datatypes**

- **Overview of Pandas Data Types:**  https://pbpython.com/pandas_dtypes.html
- **Overview of Numpy Datatypes:** https://numpy.org/devdocs/user/basics.types.html

## **Drop Missing Values**

In [None]:
# Drop any row containing a missing value.
# The result is only displayed here; assign it to a variable to keep it.
df.dropna()

**Filter & Find NON-Null Values**

In [None]:
# Create a frame that contains only the rows with NO NULL values.
# (Indexing with df.notnull() directly keeps every row and merely re-masks the
# missing cells, so the row-wise .all(axis=1) mask is needed to filter rows.)
notNull = df[df.notnull().all(axis=1)]
print(notNull)

In [None]:
# Passing how='all' only drops rows in which every value is NA.
df.dropna(how='all')

In [None]:
# To drop columns in the same way, pass axis=1.
# Example setup: create a column made up entirely of missing values
# (use a column name of your own). np.nan is the missing-value marker;
# a bare `NA` is not defined and raised a NameError.
df['Your_Column'] = np.nan
# Drop every column whose values are all NULL
df.dropna(axis=1, how='all')


## **Fill In Missing Values**

**Fill in missing values with 0**

In [None]:
# Display a copy of the dataframe with every missing value replaced by 0
df.fillna(0)

**Fill in missing values by a single column**

In [None]:
# Fill the missing values of ONE column only.
# Replace 'Your_Column' with the target column and 0 with the fill value;
# all other columns are left untouched.
df['Your_Column'] = df['Your_Column'].fillna(0)

**Fill in multiple select columns with predfined values**

In [None]:
# Map each target column name to the value that should replace its missing entries.
# fillna accepts a {column: value} dict; passing two lists raises a TypeError.
df.fillna({'Column_Name1': 'Values_to_Replace', 'Column_Name2': 'Values_to_Replace'}, inplace=True)

**Fill in null numaric values with MEAN AVERAGE**

In [None]:
# Replace missing values with the mean of their column.
# ONLY numerical columns are affected: numeric_only=True keeps df.mean()
# from failing on text columns, which are left unchanged.
df.fillna(df.mean(numeric_only=True))

## **Replace Values**

**Replace values in list**

In [None]:
# Replace values pairwise: -999 becomes NaN and -1000 becomes 0
# first list = target values, second list = replacement values (matched by position)

df.replace([-999, -1000], [np.nan, 0])

**Replace Null vlaues With regEx**

In [None]:
# Replace empty or whitespace-only strings (regex ^\s*$) with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Replace empty strings with NaN.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
df.replace(r'', np.nan)

**Replace Values in a single column with predefined values**

In [None]:
# Replace values in a single target column.
# Change Column_Name to the target column and change both lists to the values you want
# (first list = values to find, second list = their replacements).
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# column selection is chained assignment and is not guaranteed to update df.
df['Column_Name'] = df['Column_Name'].replace(['Value_Name','Value_Name'],[1,0])

**Replace Values with MEAN average value**

In [None]:
# Replace a target value in a column with the mean of that column.
# Change Column_Name and Value_to_replace to fit your data.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# column selection is chained assignment and is not guaranteed to update df.
df['Column_Name'] = df['Column_Name'].replace('Value_to_replace', df['Column_Name'].mean())

## ***Replace Duplicate Values***

**Identify Duplicates**

*Duplicated returns a boolean Series indicating whether each
row is a duplicate (has been observed in a previous row) or not*

In [None]:
# Check for duplicate rows (returns a boolean Series, one entry per row)
df.duplicated()

**Drop Duplicates**

In [None]:
# Drop duplicate rows; by default (keep='first') the first occurrence is kept.
# Add keep='last' to keep the last occurrence instead.

df.drop_duplicates()

**Filter Duplicates By Column**

In [None]:
# Drop rows that are duplicates in the given column(s); keep='last' retains the last occurrence.
# Use commas [column1, column2, ...] to separate a list of target column names.

df.drop_duplicates(['Your_Column_Name'], keep='last')

In [None]:
# Show the unique values of the target column (duplicates removed from that column's Series)

df["YOUR_COLUMN"].drop_duplicates()

**Drop Duplicates by Row & Sort Values**

In [None]:
# Order the rows by YOUR_VAL from largest to smallest, then keep only the first
# (i.e. greatest) row for each YOUR_COLUMN value. Use ascending=True to keep the
# least value instead.
df = (
    df.sort_values('YOUR_VAL', ascending=False)
      .drop_duplicates(subset='YOUR_COLUMN', keep='first')
)
print(df) # Check data

##  **Splitting Data For Models** (Advanced)

- For building test and classification models
- 2D-3D data analysis with X & Y variables (scatter charts, machine learning models)

In [None]:
# Separate data by position with iloc[rows, columns]:
# X takes column 1 and y takes column 2 (all rows), each as a 2-D NumPy array.
X = df.iloc[:, 1:2].values
y = df.iloc[:, 2:3].values

## **Imputation**(Advanced)

> In this section, use the code below pertaining to the dataframe of interest to replace missing values, delete rows, impute data, and encode.

* In statistics, imputation is the process of replacing missing data with substituted values. When substituting for a data point, it is known as "unit imputation"; when substituting for a component of a data point, it is known as "feature imputation".

### **Replace NULL and Missing Values Categorical Data**

 - Best used on categorical data
 - Replace numerical data with MEAN values
 
 <hr>
 
 CHANGE DATAFRAME TO FIT YOUR NEEDS

In [None]:
# Show every row that contains at least one missing value.
# MAKE SURE TO CHANGE obj_df TO YOUR DATAFRAME (e.g. obj_dfS - OR - obj_dfN)
obj_df[obj_df.isnull().any(axis=1)]

In [None]:
# Pre-process data: count how often each value occurs in YOUR_COLUMN
obj_df["YOUR_COLUMN"].value_counts()

In [None]:
# Fill in NULL values per column with a {column name: replacement value} mapping

obj_df = obj_df.fillna({"YOUR_COLUMN": "REPLACE_VALUE"})

In [None]:
#....copy code above for each column you need to even out

### **Replace NULL & Missing Features W/ MEAN Values - Numerical Data**

> The `axis=0` argument calculates the column-wise mean of the dataframe, so the result has one value per column; `axis=1` calculates the row-wise mean, so the result has one value per row.

- Specify the column of data you want to work with.
- Even if we do not specify axis = 0, 
- the method will return the mean over the index axis by default

In [None]:
# Calculate the MEAN value of each column of the full dataframe (axis=0)

num_df.mean(axis = 0)

In [None]:
# Row-wise mean (axis=1); skipna=True skips the NULL values while computing each mean
num_df.mean(axis = 1, skipna = True)

In [None]:
# Calculate the MEAN/average of a single column
num_df["YOUR_COLUMN"].mean()

In [None]:
num_df['YOUR_COLUMN'].max()  # Maximum value of the column

In [None]:
 num_df['YOUR_COLUMN'].min()  # Minimum value of the column

In [None]:
# Summarize the dataframe's columns with descriptive statistics
print (num_df.describe())

In [None]:
# Get both sets of stats back.
# A cell only echoes the value of its last expression by default, so each
# result is printed explicitly to make sure both are shown.
print(num_df['YOUR_COLUMN'].mean())
print(num_df.describe())

# **Proccessing Data**
- Use these tools to proccess data
- Encode categorical data
- Implement numarical data



## **Dummy Varibles**

In [None]:
# Use the pandas get_dummies function to encode categorical data as indicator (dummy) columns

dum = pd.get_dummies(df['Your_Column_Name'])
print(dum)

**Adding Dummy Variable Prefix Column**

In [None]:
# You may want to add a prefix to the columns of the indicator DataFrame,
# which can then be merged with the other data. get_dummies has a prefix
# argument for doing this:
dummies = pd.get_dummies(df['Your_Column_Name'], prefix='key')
# Join the indicator columns onto the selected column(s) of the original frame.
df_with_dummy = df[['Your_New_Column_Name_Here']].join(dummies)
df_with_dummy


## **Process Categorical Data**

> Use this section to work with only categorical data types, pre-processing them and collecting them into a new dataframe to work with. 

***The pandas `object` dtype is used for string (text) columns.***

| Pandas dtype | Code variable | Dataframe variable |
| --- | --- | --- |
| string | = 'object' | = obj_df |


### Add Categorical Data  Into a Seperate Dataframe

In [None]:
# ADD CATEGORICAL DATA TO A NEW DATAFRAME
# Select all columns with the 'object' dtype (strings) into obj_df, the name
# that the encoding and plotting cells of this notebook use.
obj_df = df.select_dtypes(include=['object']).copy()
obj_dfs = obj_df  # alias kept so code written against the old name still works
obj_df.head()

In [None]:
# Do more work here

### Encoding Categorical Data

This section includes: 
- Encoding categorical data variables as integer codes with cat.codes
- Dummy variables
- One-hot encoding


In [None]:
# Count the column's values to make sure they are evenly processed
obj_df["YOUR_COLUMN"].value_counts()

In [None]:
# Double-check the data types
obj_df.dtypes

In [None]:
# Encode a categorical column as integer codes with the cat.codes accessor.
# YOUR_COLUMN = source column; YOUR_DISC = name of the NEW column that holds the encoded data
# (COPY THIS CODE BLOCK AND CHANGE THE COLUMN NAMES FOR EACH COLUMN TO ENCODE)
obj_df["YOUR_DISC"] = obj_df["YOUR_COLUMN"].astype('category').cat.codes


In [None]:
# Check that the new column with the encoded values was added to the dataframe
obj_df.head()

In [None]:
# Describe the encoded data
obj_df.describe()

In [None]:
# obj_df CAN NOW BE USED WITH NUMARICAL DATA STATISITICS AND INFERENCES BELOW
# Sperate values into X and Y values for stats
# Use advanced grouping technuiqes below.

## **Process Numarical Data Types**

>  Use this section to proccess numerical data by adding it to a new dataframe.

 
 <hr>

 **Working With Numerical Data:**
 
| Pandas dtype | Code variable | Dataframe variable |
| --- | --- | --- |
| int64 | = 'int' | = num_df |
| float64 | = 'float' | = flt_df |
| datetime64 | = 'datetime' | = dat_df |
| bool | = 'bool' | = bol_df |

***Only use the types you need, and copy the cell as many times as needed to fit your data.***

In [None]:
# Select all columns matched by the 'int' dtype selector - num_df = integer numbers
num_df = df.select_dtypes(include=['int']).copy()
num_df.head()

In [None]:
# Select all columns matched by the 'float' dtype selector - flt_df = floating point numbers
flt_df = df.select_dtypes(include=['float']).copy()
flt_df.head()

In [None]:
# Select all columns matched by the 'datetime' dtype selector - dat_df = dates and times
dat_df = df.select_dtypes(include=['datetime']).copy()
dat_df.head()

In [None]:
# Select all columns matched by the 'bool' dtype selector - bol_df = True/False values
bol_df = df.select_dtypes(include=['bool']).copy()
bol_df.head()

In [None]:
# Do more work here

## **Data Transformation**

**Transform to Uppercase**

In [None]:
# Display the dataframe with its column names converted to uppercase.
# Only the columns are mapped: applying str.title to the row index raises a
# TypeError when the index is not made of strings (e.g. the default integer index).
df.rename(columns=str.upper)

**Detecting and Filtering Outliers**


In [None]:
# Filtering or transforming outliers is largely a matter of applying array operations.
# Consider a DataFrame with some normally distributed data.
# NOTE: this cell replaces `df` with synthetic example data.

rng = np.random.default_rng(42)  # fixed seed so the example is reproducible on re-run
df = pd.DataFrame(rng.standard_normal((1000, 4)))
df.describe()


In [None]:
# Suppose you wanted to find values in one of the columns exceeding 3 in absolute
# value:
# change the column label below, or use a column name as a string: df["column_name"]
col = df[2]
col[np.abs(col) > 3]

In [None]:
# To select all rows having a value exceeding 3 or -3, you can use the any method on a
# boolean DataFrame. The mask is built from df itself, and axis is passed by
# keyword because the positional form any(1) is not accepted by current pandas.

df[(np.abs(df) > 3).any(axis=1)]

**Permutation and Random Sampling**


In [None]:
# Permuting (randomly reordering) a Series or the rows in a DataFrame.
# NOTE: this cell replaces `df` with a small example frame.
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
# A random ordering of the row positions 0..4 (seeded so re-runs give the same order).
sampler = np.random.default_rng(42).permutation(5)
# Apply the ordering: take() returns the rows in the order given by sampler.
df.take(sampler)

## **Grouping Data**

**Grouping by Independant and dependant Columns**

In [None]:
# Create a new dataframe grouped by the specified column, counting the non-null values per group
# This COLUMN must be the independent variable

df_group = df.groupby(['Your_Column_Name_Ind']).count()

In [None]:
# Check the new grouped dataframe (first 20 rows)
df_group.head(20)

In [None]:
# Sort the grouped frame by the dependent-variable column (descending) and keep the top 3 rows
# This COLUMN must be the dependent variable

grp_sort = df_group.sort_values(by='Your_Column_name_Dep', ascending=False)[:3]

In [None]:
# Check the grp_sort result
print(grp_sort)

In [None]:
# Relabel the data used for plots from df to dfc (or whatever you decide to name it!)
# EX: dfc =... just remember to change the name when plotting
# dfc holds only the dependent column selected from grp_sort.
dfc = grp_sort['Your_Column_name_Dep']

In [None]:
# Plot example:
# dfc is already the dependent column selected from grp_sort, so plot it directly;
# indexing it again with the column name would look the name up among the
# group labels and raise a KeyError.
dfc.plot(kind='bar', stacked=False, figsize=[16,6], colormap='winter')

# **Plotting Data Visualization**

## Univariate Plotting With Pandas
<hr>

_Plot methods accept arguments that alter the size and style of the visualization, such as **figsize=(6, 6)**,
which gives the plot a width of 6 inches and a height of 6 inches. These can be changed._
 
See details of how to manipulate data with pandas methods here:

- Series data = https://pandas.pydata.org/pandas-docs/stable/reference/series.html
- Dataframes = https://pandas.pydata.org/pandas-docs/stable/reference/frame.html


### Bar charts and categorical data

> Bar charts are arguably the simplest data visualization. They map categories to numbers: 

- Replace df with your dataframe unless you are still using the original dataframe
- Use with both categorical and numerical data


In [None]:
# Bar chart of the 10 most frequent values of a categorical column
df['YOUR_CLOUMN'].value_counts().head(10).plot.bar()

In [None]:
# Convert the Y axis to a share of all rows (count divided by len(df))
(df['YOUR_CLOUMN'].value_counts().head(10) / len(df)).plot.bar()

**Bar charts are very flexible: The height can represent anything, as long as it is a number. And each bar can represent anything, as long as it is a category.**

- Nominal categorical variables include things like countries, ZIP codes, types of cheese, and lunar landers. The other kind are ordinal categories:
- ordinal categories: things that do make sense to compare, like earthquake magnitudes, housing complexes with certain numbers of apartments, and the sizes of bags of chips at your local deli.

In [None]:
# Using nominal or ordinal numerical data (encoded or dummy data can be used here);
# sort_index orders the bars by value rather than by frequency.
df['YOUR_CLOUMN'].value_counts().sort_index().plot.bar()

### Line charts

**What would we do if the magazine rated things 0-100? We'd have 100 different categories; simply too many to fit a bar in for each one! In that case, instead of a bar chart, we could use a line chart:**

- Use a line chart to measure unique values
- Use a line chart to visualize excessive numbers of data categories

> Unlike bar charts, they're not appropriate for nominal categorical data. While bar charts distinguish between every "type" of point, line charts mush them together. So a line chart asserts an order to the values on the horizontal axis, and the order won't make sense with some data. 

<hr>

**Suppose that we're interested in counting the following variables:**

- The number of tubs of ice cream purchased by flavor, given that there are 5 different flavors. (NOMINAL) = BAR CHART
- The average number of cars purchased from American car manufacturers in Michigan.              (NOMINAL) = BAR CHART   
- Test scores given to students by teachers at a college, on a 0-100 scale.                      (ORDINAL) = LINE CHART
- The number of restaurants located on the street by the name of the street in Lower Manhattan.  (NOMINAL - street names have no natural order, and there are many of them) = BAR CHART of the top values, or a table

In [None]:
# Line chart of value frequencies, ordered by value along the horizontal axis
df['YOUR_CLOUMN'].value_counts().sort_index().plot.line()

### Area charts

> Area charts are just line charts, but with the bottom shaded in. That's it!

In [None]:
# Use numerical ordinal data.
# plot.area must be called (with parentheses) for the chart to be drawn.
df['YOUR_CLOUMN'].value_counts().sort_index().plot.area()

### Interval data

Interval variables are the wind speed in a hurricane, shear strength in concrete, and the temperature of the sun. An interval variable goes beyond an ordinal categorical variable: it has a meaningful order, in the sense that we can quantify what the difference between two entries is itself an interval variable.

For example, if I say that this sample of water is -20 degrees Celcius, and this other sample is 120 degrees Celcius, then I can quantify the difference between them: 140 degrees "worth" of heat, or such-and-such many joules of energy.

- Line charts work well for interval data. Bar charts don't—unless your ability to measure it is very limited, interval data will naturally vary by quite a lot.

- Use  histogram for interval variable in our dataset, price (we'll cut price off at 200$ a bottle; more on why shortly).


### Histograms

> Histogram is special kind of bar plot that splits your data into even intervals and displays how many rows are in each interval with bars. The only analytical difference is that instead of each bar representing a single value, it represents a range of values.

In [None]:
# Histogram of the selected column, restricted to the rows where
# its value meets the condition (< 100 here; adjust as needed)
df[df['YOUR_CLOUMN'] < 100]['YOUR_CLOUMN'].plot.hist()

**Use code below to test Histogram**

In [None]:

# Histogram of the whole column.
# Not good for skewed data (will only show one column)
# Only use if the data is numerical and not skewed!!!
df['YOUR_CLOUMN'].plot.hist()

In [None]:

# Show the rows whose column value exceeds the threshold (adjust > 2000 to any number that fits your set)
df[df['YOUR_CLOUMN'] > 2000]

### Frequency Graphs

In [None]:
# Create an explicit figure/axes pair and draw the value frequencies on it as a bar chart
fig, ax = plt.subplots()
df['YOUR_CLOUMN'].value_counts().plot(ax=ax, kind='bar')

### Pie Charts

In [None]:
# Pie charts of the entire dataframe: one pie per column.
# A DataFrame pie plot needs either y=<column> or subplots=True;
# without one of them pandas raises a ValueError.
df.plot.pie(subplots=True, figsize=(6, 6))

In [None]:
# Pie chart of a single column
df['YOUR_CLOUMN'].plot.pie(figsize=(6, 6))

### Bivariate Plotting With Pandas

### Scatter plot
>  A bivariate plot is essentially a scatter plot. A simple scatter plot maps each pair of values (X, Y) of interest to a point in two-dimensional space. 

- The plot shows, and should be used to show, correlations
- Must use numerical data
- Category data must be encoded to numerical data and separated into X and Y variables
- For best results the data needs to be clean and even
- Best used with smaller datasets!!!

In [None]:
# Select the rows that meet the condition (< 100 here; adjust as needed), then plot a
# random sample of at most 100 of them.
# x= and y= must be the names of existing columns in the dataframe - they select
# the data to plot; they are not free-text axis labels.
subset = df[df['YOUR_COLUMN_x'] < 100]
# sample() raises if asked for more rows than exist, so cap the sample size.
subset.sample(min(100, len(subset))).plot.scatter(x='YOUR_COLUMN_x', y='YOUR_COLUMN_Y')

### Hexplot
> A hex plot aggregates points in space into hexagons, and then colors those hexagons based on the values within them:

In [None]:
# Select the rows that meet the condition (< 100 here; adjust as needed) and plot them as a hex plot.
# x= and y= must be the names of existing columns in the dataframe.
# The filter is built from df itself (the previous `reviews` frame is not defined in this notebook).
df[df['YOUR_COLUMN_x'] < 100].plot.hexbin(x='YOUR_COLUMN_x', y='YOUR_COLUMN_Y', gridsize=15)

### Stacked plots
>  A stacked chart is one which plots the variables one on top of the other. Many pandas multivariate plots expect input data to be in this format, with one categorical variable in the columns, one categorical variable in the rows, and counts of their intersections in the entries.

- It is best to make a smaller sample dataframe for calculations and visualizations

In [None]:
# Plot the entire dataframe as a stacked bar chart
df.plot.bar(stacked=True)

### Area Plot
> Same principles as "Stacked Plot" mixed with a line chart instead of a bar chart

Like single-variable area charts, multivariate area charts are meant for nominal categorical or interval variables.

Stacked plots are visually very pretty. However, they have two major limitations.

The first limitation is that the second variable in a stacked plot must be a variable with a very limited number of possible values (probably an ordinal categorical, as here). Five different types of wine is a good number because it keeps the result interpretable; eight is sometimes mentioned as a suggested upper bound. Many dataset fields will not fit this critereon naturally, so you have to "make do", as here, by selecting a group of interest.

The second limitation is one of interpretability. As easy as they are to make, and as pretty as they look, stacked plots make it really hard to distinguish concrete values. For example, looking at the plots above, can you tell which wine got a score of 87 more often: Red Blends (in purple), Pinot Noir (in red), or Chardonnay (in green)? It's actually really hard to tell!

In [None]:
# Plot the entire dataframe as an area chart
df.plot.area()

### Bivariate Line Chart
> The bivariate line chart is highly effective: because the line in this chart takes up so little visual space, it's really easy and effective to overplot multiple lines on the same chart.

In [None]:
# Plot the entire dataframe as a line chart
df.plot.line()

### SeaBorn Plots

In [None]:
# Import Seaborn agian for good measmeasure 
import seaborn as sns


In [None]:
# Set the seaborn theme and the default matplotlib figure size (width, height in inches)
sns.set_theme(style="darkgrid")
plt.rcParams['figure.figsize'] = [6, 2]

**Load DataFrame into SeaBorn**

> You must load the dataframe and column vectors into seaborn variables to process them

**Join Plots**
> URL: http://seaborn.pydata.org/generated/seaborn.jointplot.html?highlight=join%20plot

- requires x, y vectors or keys in data values
- More Seaborn plots and tutorials: http://seaborn.pydata.org/tutorial.html


In [None]:
# Purple joint plot (color='m')
# USE SEABORN TO PLOT A JOINTGRID PLOT
# Customize the inside plot: options are: "scatter" | "reg" | "resid" | "kde" | "hex"
# The figure size is controlled by `height`; jointplot creates its own figure, so the
# earlier call on a `fig` left over from another cell was dropped (it depended on
# hidden state and did not affect this plot).
sns.jointplot(x=obj_df["YOUR_COLUMN_X"], y=obj_df["YOUR_COLUMN_Y"], kind='reg', truncate=True, xlim=(0, 60), ylim=(0, 5), color="m", height=10,)


In [None]:
# Blue joint plot (edgecolor="skyblue")
# USE SEABORN TO PLOT A JOINTGRID PLOT
# Customize the inside plot: options are: "scatter" | "reg" | "resid" | "kde" | "hex"
# The figure size is controlled by `height`; jointplot creates its own figure, so the
# earlier call on a `fig` left over from another cell was dropped (it depended on
# hidden state and did not affect this plot).
sns.jointplot(x=obj_df["YOUR_COLUMN_X"], y=obj_df["YOUR_COLUMN_Y"], kind='scatter', height=15,linewidth=2, edgecolor="skyblue")


# **Advanced Statistics and Visualization**

## Multiple Regression Histogram

In [None]:
# Multiple histograms: one distribution plot per column.
# NOTE: this cell re-indexes and overwrites `df`.

df = df.set_index(['YOUR_COLUMN'])
df.iloc[0]=0        # zero the first row so the running total starts from 0
df=df.cumsum()      # cumulative sum down each column

# Loop through the columns and draw one plot per column in its own figure.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is
# seaborn's documented replacement for its default histogram-plus-KDE output.
for column in df.columns:
    plt.figure()
    sns.histplot(df[column], kde=True, stat="density")

# **Resources**

**These are some articles I found useful for handling data quality:**

- https://towardsdatascience.com/all-about-missing-data-handling-b94b8b5d2184
- https://www.researchgate.net/publication/220579612_Missing_Data_Imputation_Techniques
- https://pubmed.ncbi.nlm.nih.gov/21133556/
- https://www.theanalysisfactor.com/multiple-imputation-in-a-nutshell/
- https://machinelearningmastery.com/how-to-prepare-categorical-data-for-deep-learning-in-python/
- https://pbpython.com/categorical-encoding.html
- https://numpy.org/devdocs/user/basics.types.html
- https://towardsdatascience.com/feature-engineering-for-machine-learning-3a5e293a5114
- https://ipfs.io/ipfs/bafykbzacedenbpx4yzjvefqeeqe7oh5wn4kjsosqjt7zoij2xju7ihncalct2?filename=Jacqueline%20Kazil%2C%20Katharine%20Jarmul%20-%20Data%20Wrangling%20with%20Python_%20Tips%20and%20Tools%20to%20Make%20Your%20Life%20Easier-O%27Reilly%20Media%20%282016%29.pdf