## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat


## Read the CSV file

In [None]:
# Load the raw pollution CSV into a DataFrame; the path is relative to the notebook's working directory.
POLLUTION_CSV = "../input/uspollution/pollution_us_2000_2016.csv"
poll = pd.read_csv(POLLUTION_CSV)

## Information about the sheet

In [None]:
# Call info() (with parentheses) so pandas prints the summary: index range,
# column names, non-null counts, dtypes and memory usage. Without the call the
# cell only shows the bound-method repr.
poll.info()


### Preview the first rows with `.head()` (this complements the column summary from `.info()`)

In [None]:
# Call head() (with parentheses) to display the first five rows; a bare
# `poll.head` only shows the bound-method repr.
poll.head()

## To obtain data types of the columns

In [None]:
# Show the dtype pandas inferred for each column of the loaded frame.
poll.dtypes

## Counting the null values in each column

In [None]:
# isna() flags each missing cell; summing the flags gives the number of missing values per column.
poll.isna().sum()

## Count of non-null records in each column

In [None]:
# Number of non-NA values in each column (the complement of the null counts).
poll.count()

## Removing all the unnecessary columns

In [None]:
# Discard the columns this notebook treats as unnecessary (row index, numeric
# location codes, address and unit labels), then preview the slimmer frame.
unused_columns = [
    'Unnamed: 0', 'State Code', 'County Code', 'Site Num', 'Address',
    'NO2 Units', 'O3 Units', 'SO2 Units', 'CO Units',
]
poll = poll.drop(columns=unused_columns)
poll.head()

Some entries have several values for the same observation date. Since the dataset gives no explanation for these duplicates, and related questions on the forum remain unanswered, I'll take the mean of the values for each date and location (state, in the case below).

In [None]:
## Prepare all 4 AQIs against state and date
# Build the per-state daily AQI table in one method chain. Using .assign()
# instead of assigning a column onto a boolean-filtered slice avoids pandas'
# SettingWithCopyWarning while producing the same result.
aqi_columns = ['NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI']
pollSt = (
    poll[['State', 'Date Local'] + aqi_columns]
    .dropna(axis='rows')  # Delete rows with NAs
    .loc[lambda df: df['State'] != 'Country Of Mexico']  # Delete Mexico
    # Change date from string to date value
    .assign(**{'Date Local': lambda df: pd.to_datetime(df['Date Local'], format='%Y-%m-%d')})
    .groupby(['State', 'Date Local'])
    .mean()  # Take mean values if there are duplicated entries
)
# Group the (State, Date Local)-indexed frame by its first index level (State).
pollStGrouped = pollSt.groupby(level=0)

In [None]:
# Call info() (with parentheses) to print the index, column, non-null count and
# dtype summary of the aggregated frame; a bare `pollSt.info` only shows the
# bound-method repr.
pollSt.info()


## Lists bottom 5 records

In [None]:
# Display the last five rows of the aggregated frame.
pollSt.tail(5)

## Lists top 15 records

In [None]:
# Display the first fifteen rows of the aggregated frame.
pollSt.head(15)

## Renaming the column names


In [None]:
# Swap the space in each AQI column name for an underscore, then preview the result.
aqi_renames = {
    name: name.replace(' ', '_')
    for name in ('NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI')
}
pollSt = pollSt.rename(columns=aqi_renames)
pollSt.head(5)

## Creating a correlation matrix

In [None]:
# Pairwise correlation between the columns of pollSt (pandas' default method is Pearson).
pollSt.corr()

## creating a heatmap for correlation

In [None]:
# Visualise how the AQI columns relate to one another: draw the correlation
# matrix as an annotated heatmap on an explicit Axes, then show the matrix itself.
fig, ax = plt.subplots(figsize=(20, 10))
c = pollSt.corr()
sns.heatmap(c, cmap='BrBG', annot=True, ax=ax)
c

## Scatterplots

In [None]:
# Scatter plot of the CO_AQI column (x axis) against the NO2_AQI column (y axis).
x_col, y_col = 'CO_AQI', 'NO2_AQI'
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(pollSt[x_col], pollSt[y_col])
ax.set(xlabel=x_col, ylabel=y_col)
plt.show()