# Exploratory Data Analysis

In [None]:
#load the libraries
import pandas as pd
import numpy as np


In [None]:
# Read data_clean.csv, using its first column as the row index.
data1 = pd.read_csv(filepath_or_buffer="data_clean.csv", index_col=0)

In [None]:
# Show the loaded DataFrame (a notebook echoes a cell's last expression).
data1

In [None]:
# Show the last 10 rows.
data1.iloc[-10:]

In [None]:
# Display data1 again for reference.
data1

In [None]:
# Data structure: show the object's type together with its (rows, columns)
# shape. A notebook only echoes a cell's final expression, so both values are
# combined into one tuple; a bare type(...) on its own earlier line would be
# evaluated and silently discarded.
type(data1), data1.shape

In [None]:
# Data types: list the dtype pandas assigned to each column.
data1.dtypes

# Data type conversion

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
data1.info()

In [None]:
# Display data1 before any type conversion is applied.
data1

In [None]:
# NOTE: plain assignment does not copy anything; data2 is just a second name
# for the same DataFrame object as data1.
data2=data1

In [None]:
# Display data2.
data2

In [None]:
# Work on an independent deep copy so that edits made to `data` are not
# reflected in the DataFrame that data2 refers to.
data = data2.copy(deep=True)

In [None]:
# Force Month and Temp C to numeric dtypes; errors='coerce' turns any value
# that cannot be parsed as a number into NaN instead of raising.
for numeric_col in ('Month', 'Temp C'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

# Store Weather as a pandas categorical.
# (Wind is left as loaded; an int64 cast for it is deliberately not applied.)
data['Weather'] = data['Weather'].astype('category')

In [None]:
# Print the summary again to check the dtypes after conversion.
data.info()

# Duplicates

In [None]:
# Shape (rows, columns) of the subset of rows that repeat an earlier row;
# the first element is the number of duplicated rows.
data.loc[data.duplicated()].shape

In [None]:
# Display the working copy.
data

In [None]:
# Show the rows flagged as duplicates of an earlier row.
data.loc[data.duplicated()]

In [None]:
# Drop repeated rows, keeping the first occurrence of each.
data_cleaned1 = data.drop_duplicates(keep='first')

In [None]:
# (rows, columns) after removing duplicates.
data_cleaned1.shape

# Drop columns

In [None]:
# Remove the 'Temp C' column.
data_cleaned2 = data_cleaned1.drop(columns=['Temp C'])

In [None]:
# Display the frame after dropping the column.
data_cleaned2

# Rename the columns

In [None]:
# Rename the 'Solar.R' column to 'Solar'.
data_cleaned3 = data_cleaned2.rename(columns={'Solar.R': 'Solar'})

In [None]:
# Display the frame after the rename.
data_cleaned3

# Outlier Detection

In [None]:
# Histogram of the Ozone values, to inspect their distribution.
data_cleaned3['Ozone'].hist()

In [None]:
# Box plot of the Ozone column, to look for outlying values.
data_cleaned3.boxplot(column=['Ozone'])

In [None]:
# Descriptive statistics for Ozone (count, mean, std, min, quartiles, max).
data_cleaned3['Ozone'].describe()

In [None]:
# Display data_cleaned3 again for reference.
data_cleaned3

In [None]:
# Bar plot of how often each Weather category occurs. Counts are taken from
# data_cleaned3 (the de-duplicated frame used by the other plots in this
# section) rather than the raw `data`, so repeated rows do not inflate the
# bars.
data_cleaned3['Weather'].value_counts().plot.bar()

# Missing Values and Imputation

In [None]:
import seaborn as sns

# Two-colour map for the missing-value heatmap: blue = present, yellow = missing.
colours = ['#000099', '#ffff00']
cols = data_cleaned3.columns
missing_mask = data_cleaned3[cols].isnull()
sns.heatmap(missing_mask, cmap=sns.color_palette(colours))

In [None]:
# Preview the first five rows that have at least one missing value.
data_cleaned3.loc[data_cleaned3.isnull().any(axis=1)].head(5)

In [None]:
# Number of missing values in each column.
data_cleaned3.isnull().sum()

In [None]:
# Mean imputation, step 1: compute the mean of Ozone
# (Series.mean skips NaN values by default) and print it.
mean = data_cleaned3['Ozone'].mean()
print(mean)

In [None]:
# Replace the missing Ozone values with the value stored in `mean`.
data_cleaned3['Ozone'] = data_cleaned3['Ozone'].fillna(mean)

In [None]:
# Display the frame after filling Ozone.
data_cleaned3

In [None]:
# Missing value imputation for the categorical value.
# Pull 'Weather' out as a one-column DataFrame (double brackets keep it a
# DataFrame rather than a Series) so it can be imputed separately.
obj_columns=data_cleaned3[['Weather']]

In [None]:
# Missing values in the extracted column before imputation.
obj_columns.isnull().sum()

In [None]:
# Missing value imputation for the categorical value: fill NaN with the mode.
# mode() returns a DataFrame of modal values; .iloc[0] takes its first row,
# i.e. one most-frequent value per column.
obj_columns=obj_columns.fillna(obj_columns.mode().iloc[0])

In [None]:
# Missing values in the extracted column after imputation.
obj_columns.isnull().sum()

In [None]:
# (rows, columns) of the main frame, to compare with obj_columns before joining.
data_cleaned3.shape

In [None]:
# (rows, columns) of the imputed column frame.
obj_columns.shape

In [None]:
# Join the data set with the imputed object dataset.
# data_cleaned3 still carries its own un-imputed 'Weather' column, so
# concatenating it as-is would yield two 'Weather' columns and leave the NaNs
# in place. Drop the stale column(s) first so only the imputed copy remains.
data_cleaned4 = pd.concat(
    [data_cleaned3.drop(columns=obj_columns.columns), obj_columns],
    axis=1,
)

In [None]:
# Missing values per column after the join.
data_cleaned4.isnull().sum()

# Scatter plot and Correlation analysis

In [None]:
# Seaborn visualization library (repeating an import is harmless in Python).
import seaborn as sns
# Create the default pairplot: a grid of pairwise plots of the frame's variables.
sns.pairplot(data_cleaned3)

In [None]:
# Correlation matrix of the numeric columns.
# 'Weather' is categorical, and pandas >= 2.0 no longer drops non-numeric
# columns from DataFrame.corr() automatically (it raises instead), so the
# numeric columns are selected explicitly. On older pandas this gives the
# same matrix that the implicit drop produced.
data_cleaned3.select_dtypes(include='number').corr()

# Transformations

#### Dummy Variable

In [None]:
# Display the joined frame before encoding.
data_cleaned4

In [None]:
# Create dummy variables for the Weather column.
# Encode the cleaned data (data_cleaned3 with its 'Weather' replaced by the
# imputed obj_columns) instead of the raw `data` frame: encoding `data` would
# silently discard the de-duplication, column drop/rename and imputation
# steps. The input is rebuilt from data_cleaned3 and obj_columns so the cell
# stays safe to re-run after data_cleaned4 has been overwritten.
cleaned_with_weather = pd.concat(
    [data_cleaned3.drop(columns=obj_columns.columns), obj_columns],
    axis=1,
)
data_cleaned4 = pd.get_dummies(cleaned_with_weather, columns=['Weather'])

In [None]:
# Display the frame after dummy encoding.
data_cleaned4

In [None]:
# Discard every row that still contains at least one missing value.
data_cleaned4 = data_cleaned4.dropna(axis=0, how='any')

#### Normalization of the data

In [None]:
#Normalization of the data
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Show the frame's values as a NumPy array (the form passed to the scalers).
data_cleaned4.values

In [None]:
# Rescale every column to the [0, 1] range.
array = data_cleaned4.values
scaler = MinMaxScaler(feature_range=(0, 1)).fit(array)
rescaledX = scaler.transform(array)

# Show the first five transformed rows, printed with two decimals.
set_printoptions(precision=2)
print(rescaledX[:5])


In [None]:
# Standardize data (0 mean, 1 stdev)
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize each column (zero mean, unit standard deviation).
array = data_cleaned4.values
scaler = StandardScaler()
rescaledX = scaler.fit_transform(array)

# Show the first five transformed rows, printed with two decimals.
set_printoptions(precision=2)
print(rescaledX[:5])

# Speed up the EDA process

In [None]:
!pip install pandas-profiling==2.7.1
!pip install sweetviz

In [None]:
import pandas_profiling as pp
import sweetviz as sv

In [None]:
# Build a pandas-profiling report for data1 (the frame as originally loaded)
# and write it to report.html.
EDA_report= pp.ProfileReport(data1)
EDA_report.to_file(output_file='report.html')

In [None]:
# Analyze `data` (the type-converted copy) with sweetviz and write the report
# to weather_report.html.
sweet_report = sv.analyze(data)
sweet_report.show_html('weather_report.html')