# Types of functions 

In [1]:
# Seaborn has 2 level of functions (i.e figure level and axes level function) to draw each plot

# Axes-level function:

In [2]:
# 1.Axes-level functions operate at a lower level, creating individual plots (axes) within a larger figure.
# 2.These functions typically return a single plot (or subplot) and require an existing matplotlib Axes object to plot onto.
# 3.Examples of axes-level functions in Seaborn include sns.scatterplot(), sns.barplot(), sns.boxplot(), etc.
# 4.These functions are suitable when you need to create and customize individual plots separately.

# Figure-level functions:

In [3]:
# 1.Figure-level functions operate at a higher level, creating entire figures with multiple subplots automatically.
# 2.These functions typically create complex visualizations that involve multiple plots or facets (subplots).
# 3.Examples of figure-level functions in Seaborn include sns.relplot(), sns.catplot(), sns.pairplot(), etc.
# 4.Figure-level functions are more convenient for creating complex visualizations quickly, as they handle the creation 
#     and organization of subplots automatically.

# classification of plots

In [4]:
#     1. Relational Plots: These plots are used to visualize the relationship between variables. (numerical columns)
#         Examples include scatter plots (scatterplot()), line plots (lineplot()), and joint plots (jointplot()).

#     2. Categorical Plots: These plots are used to visualize the distribution of categorical variables or the relationship
#         between categorical and numerical variables. Examples include bar plots (barplot()), count plots (countplot()), 
#         box plots (boxplot()),violin plot.

#     3. Distribution Plots: These plots are used to visualize the distribution of univariate or bivariate data.
#         Examples include histogram (histplot()), kernel density estimation (kdeplot()), and rug plot (rugplot()).

#     4. Regression Plots: These plots are used to visualize linear relationships between variables. Examples include
#         i)regplot (axes level)            ii)lmplot(figure level)

#     5. Matrix Plots: These plots are used to visualize matrices or 2D data. Examples include heatmap (heatmap()), 
#         clustermap (clustermap()), and pairplot (pairplot()).

#     6. Multi-plot Grids: These functions allow you to create multiple plots in a grid arrangement. Examples include
#         FacetGrid (FacetGrid()), PairGrid (PairGrid()), and JointGrid (JointGrid()).

#      may be derpeciated but not sure:
#     7. Timeseries Plots: These functions are specifically tailored for visualizing timeseries data. Examples include 
#         tsplot() and lineplot().

# Seaborn library Data set Documentation:

In [5]:
# https://seaborn.pydata.org/api.html

### Utility Functions -> To see the builtint available datasets in seaborn

In [6]:
sns.get_dataset_names()

# and to load the dataset the utility function is: sns.loaddataset('name_of_dataset')

NameError: name 'sns' is not defined

# Univariate and Bivariate Analysis

In [None]:
# Univariate analysis focuses on analyzing a single variable at a time.It involves examining the distribution and
# properties of that variable independently.

# Bivariate analysis involves the simultaneous analysis of two variables to determine the relationship between them.
# It helps in understanding how one variable changes concerning the other.
# Bivariate analysis can reveal associations, correlations, or dependencies between the two variables.

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px                     #Library to load data-sets

In [None]:
tips = sns.load_dataset('tips')

In [None]:
tips.head()

# 1.Relational Plot
### .) to see the statistical relation between 2 or more variables.
### .) Bivariate Analysis

# a) Scatter plot
### scatter plots are primarily used for visualizing the relationship between two numerical variables (columns)
### They help in identifying patterns, clusters, trends, correlations, or outliers in the data.

In [None]:
# Axes level function
sns.scatterplot(data=tips,x='total_bill',y='tip',hue='sex',size='sex',style='sex')

## hue for different colors(hue is used fo categorical columns) , style = for different visuals and size = for difference in size

In [None]:
# Figure level function
sns.relplot(data=tips,x='total_bill',y='tip',kind='scatter',style='sex',hue='sex',size = 'time')

# b) Line Plot

### line plots are used for showing trends or changes in data over an ordered dimension, such as time.
###  Line plots are particularly useful for visualizing trends or changes in data over time or any other ordered dimension. They are commonly used in time series analysis to track changes in variables over successive time periods.

In [None]:
gap = px.data.gapminder()
gap

In [None]:
sns.relplot(data=gap,x='year',y='lifeExp',kind='line',hue='continent',ci=None)

In [None]:
#To draw line plot for india 
ind=gap[gap['country']=='India']
sns.relplot(data=ind,x='year',y='lifeExp',kind='line')

In [None]:
# countries_df = gap[(gap['country']=="India") | (gap['country']=='Pakistan') | (gap['country']=='China')]
countries_df = gap[gap['country'].isin(['India','Brazil','Germany'])]
countries_df

In [None]:
sns.relplot(data=countries_df,x='year',y='lifeExp',kind='line',hue='country',style='continent')

# Facet plot(faceting using col and row) -> only works with figure level function 
 #### A facet plot, often referred to as a faceted plot or a small multiples plot, is a visualization technique used to create multiple plots, each representing a subset of the data, arranged in a grid-like layout.

#### Each subplot typically represents a subset of the data based on one or more categorical variables. For example, if you have a dataset with a categorical variable like "species" and a continuous variable like "sepal length," you might create a facet plot with separate subplots for each species, where each subplot shows the distribution of sepal lengths for that species.

In [None]:
sns.relplot(data=tips,x='total_bill',y='tip',kind='scatter',row='day',col='sex',hue='sex')

In [None]:
# col wrap-> In a row put only (col_wrap=3) 3 column ...Aru column aurko row ma

In [None]:
sns.relplot(data=gap,x='lifeExp',y='gdpPercap',kind='scatter',col='year',col_wrap=3)

# 2) Distribution Plots
### used of univariate analysis
### used to find out the distribution
### Range of the observation
### central tendency
### is the data bimodal ?
### Are there outliers ?

# a) Histogram (histplot)

In [None]:
sns.displot(data=tips,x='total_bill',kind='hist')

In [None]:
titanic_df = sns.load_dataset('titanic')
titanic_df

In [None]:
#Faceting using col and row
sns.displot(data=titanic_df,x='age',kind='hist',edgecolor='white',color='red',kde=True,col='sex')

# b) Kernal density estimation (kde) Plot

### Kde is smothen histplot

In [None]:
tips

In [None]:
sns.displot(data=tips,x='total_bill',kind='kde',hue='sex',fill=True,aspect=2)
# Here kde gives the density of probability(not exactly but roughly) in y axis (for eg the chances of being total_bill 20 
# is 0.05*100 = 5%)

In [None]:
sns.displot(data=titanic_df,x='age',kind='hist',edgecolor='white',color='red',kde=True)

# c) Rug Plot

### This plot isnot generally used alone,its used with other plots to support them .

In [None]:
sns.displot(data=tips,x='total_bill',kind='kde',fill=True,rug=True)


In [None]:
# Rug plot with scatter plot
sns.relplot(data=tips,x='total_bill',y='tip',kind='scatter',hue='sex')
sns.rugplot(data=tips,x='total_bill',y='tip',hue='sex')

# 3) Matrix plot

# a)Heat map 
### heatmap is only a axes level function .SO there is no any figure level function.

#### A heatmap plot is a graphical representation of data where values in a matrix are represented as colors. Heatmaps are particularly useful for visualizing the magnitude of relationships between two categorical variables or two numerical variables by assigning colors to different values.


In [None]:
temp=gap.pivot(index='country',columns='year',values='lifeExp')
temp


In [None]:
plt.figure(figsize=(15,10))
sns.heatmap(temp)

In [None]:
# Heat map of European countries life expentency in different years
plt.figure(figsize=(15,10))
europe_temp=gap[gap['continent']=='Europe'].pivot(columns="year",index='country',values='lifeExp')
sns.heatmap(europe_temp,annot=True,linewidths=0.2,cmap='viridis')

## annot for showing numbers in cells
## linewidth for gap between cells
## cmap for colors(other options for cmap are ['summer','crest','viridis'])

# b) Cluster map(

### ->Similar to heat map
### ->cluster map internally makes clustering on data(similar columns lai group and dissimilar lai seperate)
### ->internally a machine learning algorithm hierarcharical algomorative clustering is applied to create a cluster of similar columns

In [None]:
iris= sns.load_dataset('iris')
iris

In [None]:
sns.clustermap(iris.iloc[:,[0,1,2,3]])

In [None]:
# corr()->used to calculate the correlation coefficient between two variables in a dataset. The correlation coefficient 
# measures the strength and direction of the linear relationship between two variables. It ranges from -1 to 1, where:

# 1 indicates a perfect positive linear relationship.
# -1 indicates a perfect negative linear relationship.
# 0 indicates no linear relationship.
selected=['sepal_length','sepal_width','petal_length']
c=iris[selected]
co = c.corr()
sns.heatmap(co,annot=True)

# Assignment/Task
##### https://colab.research.google.com/drive/1ssQKshkqJvIKnphx0JQfClJqt_vS67rb?usp=sharing#scrollTo=2BLv8ixLiehx -->

In [None]:
df = pd.read_csv(r"C:\Users\Prabal Kuinkel\Desktop\Data Analyst\PYTHON JUPYTERNOOTBOOK\Data sets and notes\Data sets\List of most-polluted cities by particulate matter concentration - List of most-polluted cities by particulate matter concentration(1).csv")
df.head()

In [None]:
# Problem-1: Draw a line plot of which, the x-axis is the "Year" and the y-axis is sum of "PM2.5" of two countries Iran 
#     and China.

In [None]:
t_df=df.query("(Country=='Iran') | (Country=='China')").groupby(["Country",'Year']).agg({'PM2.5':'sum'})
sns.relplot(kind='line',data=t_df,x='Year',y='PM2.5',hue='Country',aspect=2)

In [None]:
# Problem-2: Draw a histogram of the column "PM10" of which the y-axis represents the probability 
#     (see the seaborn documentation how to draw the probability).
sns.displot(data=df,x='PM10',stat='density',kind='hist')

In [None]:
# Problem-3: Draw a scatter plot where x-axis represents "PM2.5" and y-axis represents "PM10" for two countries Poland 
#         and Chile.

t2_df=df[(df['Country']=='Poland' )| (df['Country']=='Chile')]
sns.relplot(kind='scatter',data=t2_df,x='PM2.5',y='PM10',hue='Country')

In [None]:
# Problem-4: Draw a pie chart of top 5 most frequent countries.
t3_df=df['Country'].value_counts().head()
plt.pie(x=t3_df, autopct='%.1f%%',labels=t3_df.index)

In [None]:
#Problem 6-10

In [None]:
df1 = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vTJh6X4_mqixWsfK9mgkllGQkKYW9Wj9kOIMGY2uYsWeS8n5np87DO-SDGQWJ1HXEnxiOVFVzYFYEcR/pub?gid=558678488&single=true&output=csv')
df1


In [None]:
# Problem-6

# Show a line plot of Total Profit for each month with below styling.

#     Dotted Line
#     Line Color Blue
#     Show Legend at top left
#     Circle Marker


In [None]:
t_df=df1.groupby('month_number').agg({'total_profit':'sum'}).reset_index()

In [None]:
sns.relplot(data=t_df,x='month_number',y='total_profit',kind='line',linestyle='dotted',marker='o')

In [None]:
# Problem-7

# Show sales of each product in march month as pie chart.

#     Show Percentage value
#     Give Title "Sales in March"
#     Explode ToothPaste with shadow
df1

In [None]:
df1.query('month_number==3').groupby(['facecream','facewash']).agg({'total_profit':'sum'}).reset_index()

# Assigm=nemnt/Task-2
####### https://colab.research.google.com/drive/1uWh7VoNYpBDzkeOhOfWiDT_fvYGxHN8B?usp=sharing

In [None]:
#  Using Gapminder Data

#     Create a scatter plot of 'gdpPercap' against 'lifeExp' for the year 2007, with the size
#     of the markers determined by 'pop' and the color determined by 'continent'.
t=gap[gap['year']==2007]
t

In [None]:
gap.query('year==2007')

In [None]:
sns.relplot(data=t,x="gdpPercap",y='lifeExp',hue='continent')

In [None]:
# problem:
# Using flights dataset of seaborn.

#     Using the "flights" dataset that comes with seaborn, create a heatmap that shows the average
#     number of passengers per month for each year.
plt.figure(figsize=(12,10))
flights_df=sns.load_dataset('flights')
t3=flights_df.groupby(['year','month']).agg({'passengers':'mean'}).reset_index()
t4=t3.pivot(columns='month',index='year',values='passengers')
sns.heatmap(data=t4,annot=True,fmt='.0F',linewidths=0.5)


In [None]:
# Problem
# Using the seaborn's flight dataset, create a clustermap to visualize the relationship between the number of passengers,
# months, and year.
sns.clustermap(data=t4)

In [None]:
insurance = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vQVpcVtdYdZU4zU4-lqxt-iPHkyndDWs_aqEDUu9ZodlJ48Dku0PFgdXlj2N5RCmwXJrNtZLsI_wEVf/pub?gid=220677750&single=true&output=csv')
insurance

In [None]:
# Q-4: Draw a scatter plot based on the below conditions:

#     x-axis should be "age" and y-axis should be "bmi".
#     For hue, size and style parameters use "diabetic", "gender" and "smoker" column respectively.
#     Add title to your chart.
#     Age should be less than 70 percentiles.
#     BMI should be greater than the average value of the filtered age dataset.

insurance_temp=insurance[insurance["age"]>=0]
insurance_temp

In [None]:
percentile_70=np.percentile(insurance_temp['age'],70)
percentile_70

In [None]:
insurance_temp=insurance_temp[(insurance_temp['age']<percentile_70) | (insurance_temp['bmi']>insurance['bmi'].mean())]
insurance_temp

In [None]:
plt.figure(figsize=(10,10))
sns.scatterplot(data=insurance,x='age',y='bmi',hue='diabetic',size='gender',style='smoker')


In [None]:
# Q-5: Draw a line plot by using the below informations

#     bloodpressure vs children
#     Blood-pressure values should be between 90 and 100. The upper and lower limit are included.
#     Show the details of "smoker".
insurance

In [None]:
insurance1=insurance.query('bloodpressure>=90 & bloodpressure <=100')
insurance1
sns.relplot(kind='line',data=insurance1,y='children',x='bloodpressure',hue='smoker',err_style=None)

In [None]:
# Draw a histogram using displot

#     based on "age" column.
#     Show details of "smoker" (hue).
#     Create 2 separate charts for the above 2 conditions based on "gender" side-by-side.
insurance
sns.displot(data=insurance,kind='hist',x='age',hue='smoker',col='smoker',row='gender',)


In [None]:
# Q-7: Draw a kde plot between "age" and "bloodpressure"
sns.displot(data=insurance,kind='kde',y='age',x='bloodpressure')

In [None]:
# Q-8: Draw a clustermap between between "age", "bmi" and "bloodpressure".
insurance

# Categorical plot
### ->Plots used for categorical data

# a) Categorical Scatter plot (one categorical data and one numerical data i.e bivariate analysis)
# i)stripplot()
# ii) Swarmplot()

# b)Categorical distribution plots
# i)boxplot()
# ii)violinplot()


# b)Categorical Estimate plots(for central tendency)
# i)barplot()
# ii)pointplot()
# iii)countplot()

In [None]:
tips

# i)stripplot()
### Similar to scatter plot but scatter plot is used for numerical columns but stripplot is used as scatter plot between a categorical data and numerical data

In [None]:
sns.catplot(data=tips,x='day',y='total_bill',kind='strip',hue='day',jitter=True)
# by default jitter is true
# we can also alter the value of jitter as (jitter=0.2)



# ii)swarmplot()
### similar to stripplot() but swarmplot() internally runs an algorithm to also show the distribution of data
### It is suitable for small size data but stripplot() is suitable for all size of data

In [None]:
sns.catplot(data=tips,x='day',y='total_bill',kind='swarm',hue='day')


In [None]:
# b)Categorical distribution plots (are for univariate analysis)
# i)boxplot()
# ii)violinplot()
tips

# i)Boxplot()

In [None]:
sns.catplot(kind='box',data=tips,x='total_bill',y='sex',hue='smoker')

In [None]:
tips.query('(total_bill>33) & (sex=="Female") &(smoker=="No")').count()

In [None]:
sns.boxplot(data=tips,x='total_bill')

# ii) violinplot (Boxplot + KDEplot)

In [None]:
sns.catplot(data=tips,kind='violin',x='total_bill',y='day',hue='sex')

In [None]:
sns.catplot(data=tips,kind='violin',x='total_bill',y='day',hue='sex',split=True)

# i) Barplot()

In [None]:
sns.barplot(data=tips,x='sex',y='total_bill',estimator='std',errorbar=None)

In [None]:
sns.catplot(data=tips,kind='bar',x='sex',y='total_bill',errorbar=None,estimator='min',hue='smoker')

# ii)Point plots
### similar to barplot . Point plot compares the difference between the bars of barplot

In [None]:
sns.catplot(data=tips,kind='point',x='sex',y='total_bill',hue='smoker')

In [None]:
sns.catplot(data=tips,kind='point',x='sex',y='total_bill',hue='smoker',errorbar=None)

# iii) countplot()

### shows the number of observations in each categories rather than computing a statistics for a second variable

In [None]:
sns.catplot(data=tips,kind='count',x='sex')

In [None]:
tips[tips['sex']=='Female'].value_counts().sum()

In [None]:
tips.info()

In [None]:
#Faceting using catplot

In [None]:
sns.catplot(data=tips,kind='box',x='total_bill',y='sex',col='smoker',row='time')

# 4. Regression Plots: These plots are used to visualize linear relationships between variables. Examples include
## i)regplot (axes level)            ii)lmplot(figure level)
### In simplest invocation,both functions draw a scattterplot of 2 variables x and y and then fit the regression model y ~ x and plot the resukting regression line and a 95 % confidence interval for that regression

In [None]:
tips

In [None]:
# hue paramenter is not available in regplot but is availabe in lmplot
sns.regplot(data=tips,x='total_bill',y='tip',ci=None)

In [None]:
sns.lmplot(data=tips,x='total_bill',y='tip',ci=None,hue='sex')

In [None]:
sns.residplot(data=tips,x='total_bill',y='tip')

# Multi plot grid

(grid means it returns a object with multiple grid/cell)

# 1) Facet grids (facetgrid())

-> allowing you to display multiple plots (usually of the same type) based on the subsets of your data ..We can do same thing using facet columns (i.e col and row) but facet grid is more flexible

# 2) Pair grids(pairplot(),pairgrid())
->Ploting Pairwise Relationship

# 3)Joint grids (jointplot(),jointgrid())

# 1) Facet grid

In [None]:
# using facet col and row
sns.catplot(data=tips,x='sex',y='total_bill',kind='violin',col='day',col_wrap=2)

In [None]:
#using facet grid
g=sns.FacetGrid(data=tips,col='day',row='time',hue='sex')
g.map(sns.violinplot,'sex','total_bill')

In [None]:
#Another Example using Facet grid
g=sns.FacetGrid(data=tips,col='sex',row='time',hue='sex')
g.map(sns.scatterplot,"total_bill","tip")

# 2) Pair grid
->first of all it automatically detects all the numerical columns in the data set and make all the possible pair(group of 2)       between these numerical column and draws scatter plot between them and incase of same column(same column in both x and y) it     draws a histogram between them


# a)Pair plot

In [None]:
iris

In [None]:
sns.pairplot(iris)
# draws a scatter plot for different columns and histogram for same columns

In [None]:
sns.pairplot(iris,hue='species')

# a)Pair grid

In [None]:
g=sns.PairGrid(iris)
g.map_offdiag(sns.scatterplot)  # Draw a scatter plot for all the non diagonal grid
g.map_diag(sns.histplot)      # Draw a histogram plot for all the diagonal grid

In [None]:
g=sns.PairGrid(iris)
g.map_offdiag(sns.scatterplot)  # Draw a scatter plot for all the non diagonal grid
g.map_diag(sns.boxplot)      # Draw a box plot for all the diagonal grid

In [None]:
#vars parameter-> to select only desired numerical column for creating pairs
g=sns.PairGrid(data=iris,vars=['sepal_length','petal_length'])
g.map_offdiag(sns.scatterplot)
g.map_diag(sns.histplot)

#g.map_upper(sns.kdeplot) -> Draw a kde plot for upper diagnoal grid
#g.map_lower(sns.lineplot) -> Draw a line plot for lower diagnoal grid

In [None]:
#Key findings and why to use pairgrid plot instead of pairplot
# ->In pairgrid plot we can change the plot that we want to draw for diagonal and non diagonal grid which provides more flexibility 
#      but in parplot the default scatter plot and histplot is drawn

# 3) joint plot

In [None]:
sns.jointplot(data=tips,x='total_bill',y='tip')