# Exploratory Data Analysis

In [None]:
# Troubleshooting note: notebook does not render on GitHub
#
# Problem: after downloading a notebook from Colab and loading it into a GitHub repository,
# GitHub shows:
#   Invalid Notebook
#   There was an error rendering your Notebook: the 'state' key is missing from
#   'metadata.widgets'. Add 'state' to each, or remove 'metadata.widgets'.
#   (Using nbformat v5.10.4 and nbconvert v7.16.6.)
#
# Cause: this means the notebook file contains corrupted widget metadata -- common in Colab
# or Jupyter when using interactive widgets like sliders or dropdowns.
#
# Solution (where to execute the commands):
#   1. Open the Anaconda command prompt.
#   2. Type the following commands:
#
#        pip install nbstripout nbconvert
#
#        nbstripout C:\Users\excel\Downloads\EDA_1_New.ipynb
#
# NOTE: this text is kept as comments on purpose. Inside a normal (non-raw) string literal the
# Windows path above contains "\U", which Python treats as the start of a unicode escape and
# rejects with a SyntaxError.

In [None]:
#load the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (10,5) # RuntimeConfiguration Parameters: size of graph, 10:width, 5:height
plt.rcParams['figure.dpi'] = 300 # Resolution dots per inches
%matplotlib inline
# after plotting graph, many times depending on version of working library like matplotlib graph will not be displayed in output screen below
# For that we have to write everytime plt.show(). So if you write '%matplotlib inline' we don't need to write show() method.
import warnings # any library will give you future warnings regarding updates in functions. To ignore it write this line
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv("/content/data_clean1.csv") # No index column will be displayed
df

In [None]:
df.head(10)

In [None]:
df.tail(10)

In [None]:
df.describe()
# Positive Skewness is there in Ozone as mean is greater than median i.e.41.58>30.50 so outliers are present on upper extreme
# null values from count
# Year value is same for all values so Year column is not important here. Check variance of column. Year's std is also 0, so it's not important column
# You can check it by applying unique() function on Year column, you will get only one value 2010


In [None]:
df.describe(include=object)
# There are 8 columns but in the above code we got descriptive statistics for only the numeric columns
# But if you check the Month and Weather columns, Month is also numeric. Maybe because of a single text value, the data type of the entire column has become object.
# in Weather 3 missing values are there
# top: the value having the highest frequency in that column, S is used 59 times in Weather col., 9 is used 34 times in Month col.

In [None]:
#data types
df.dtypes # check data types of all variables
# Month column's data type is wrong

In [None]:
df.info() # find missing values

# Data type conversion

In [None]:
df['Month']

In [None]:
df['Month'].unique()
# 'May' is present so the data type is object. We can replace it with 5 as May is the 5th month. If you can't guess the value, replace it with NaN

In [None]:
df['Month'].value_counts()

In [None]:
# Replace the text value 'May' with '5'. Only the value changes here, not the column's datatype.
# The result is assigned back to the column: calling replace(..., inplace=True) on df['Month']
# works on a selection of the DataFrame and, with pandas Copy-on-Write, would leave df unchanged.
df['Month'] = df['Month'].replace('May','5')

In [None]:
df['Month'].unique()

In [None]:
df.dtypes

In [None]:
df['Month'].astype(int) # no permanent changes, inplace=True is not allowed here

In [None]:
df.dtypes

In [None]:
df['Month'] = df['Month'].astype(int)

In [None]:
df.dtypes # now Month's datatype is changed

# **Duplicates**

In [None]:
df

In [None]:
df.duplicated()
# If duplicate values are there your model will not learn any new thing from it.

In [None]:
df.duplicated().sum() # index 157 is duplicate record of other record

In [None]:
# print duplicated values. keep can be 'first', 'last' or False. keep='first' treats the 1st occurrence as original and the rest as duplicates
# default for keep is 'first'. keep=False will treat all similar records as duplicates
df[df.duplicated()]

In [None]:
df[df.duplicated(keep = False)]

In [None]:
# Drop Duplicated records

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

In [None]:
df.duplicated().sum()

# **Drop Columns**

In [None]:
df.head()

In [None]:
df.tail(10)

In [None]:
df['Year'].value_counts()

In [None]:
df.drop(columns=['Year'],inplace=True)

In [None]:
df.head()

In [None]:
# To drop multiple columns: will drop only for display purpose
df.drop(columns=['Ozone','Month'])# as inplace=True not written, changes are temporary

In [None]:
df # Ozone and Month colums are present in dataframe

In [None]:
# to drop a record or row
df.drop(index=[1]) # Temporary changes

In [None]:
# drop multiple rows
df.drop(index=[1,3])

# **Rename the Column**

In [None]:
df.head()

In [None]:
df

In [None]:
#rename the Solar column, Pass parameter in dictionary form
df.rename(columns={'Solar.R': 'Solar','Temp':'Temperature'}, inplace=True)

In [None]:
df

# **Missing Value Imputation**
Reasons of missing values:
*   Data entry errors
*   Issues with machines
*   Improper data handling, etc.
## **Treatment**
*   0 to 5 % missing values in a column: drop rows
*   6 to 45 % missing values in a column: replace the missing values
*   About 50% or more missing values in a column: drop the column





In [None]:
df.head()

In [None]:
df.isna() # or isnull(), True: missing value

In [None]:
df.isna().sum()

In [None]:
df.isnull().sum()

In [None]:
# visualize missing values
sns.heatmap(df.isna()) # white horizontal lines are missing values

In [None]:
df.isna().sum()

In [None]:
len(df)

In [None]:
# % of missing values per column
# isna().mean() is the fraction of missing entries in each column; the result stays labelled
# with the column names, so each percentage can be matched to its column.
(df.isna().mean() * 100).round(2)
# Ozone: replace missing values as missing values are in the range of 6 to 45%
# Solar, Weather: drop missing values
# but we will treat missing values
# Ozone and Solar are numeric columns. If outliers are present in that column, replace missing values with median,
# if no outliers in column replace missing value with mean
# if categorical column replace missing value with mode.

# 0 to 5 % missing values: drop rows
# 6 to 45 %: replace the missing values
# 50%: drop column

In [None]:
df.head()

In [None]:
df.hist()
plt.tight_layout() # avoid overlapping of labels of graphs
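# Skewness (commonly used rule of thumb): less than -1 or greater than +1: high skewness, e.g. Ozone
# between 0.5 and 1 in magnitude: moderate skewness
# -0.5 to +0.5: approximately symmetric, e.g. Solar
# 0: perfectly symmetric (Normal)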

In [None]:
df.boxplot()
# Ozone: outliers are present so replace missing values with median
# In Solar: No outliers so replace missing values with mean

In [None]:
sns.boxplot(x = df['Wind']) # for vertical boxplot: sns.boxplot(y = df['Wind'])

In [None]:
ozone_median = df['Ozone'].median()
ozone_median

In [None]:
# Fill missing Ozone values with the median computed in the previous cell.
# Assign the result back instead of using inplace=True on df['Ozone']: the in-place call acts on
# a selection of the DataFrame and, with pandas Copy-on-Write, would not update df.
df['Ozone'] = df['Ozone'].fillna(ozone_median)

In [None]:
df.isna().sum()

In [None]:
df['Solar'].mean()

In [None]:
# Fill missing Solar values with the column mean.
# Assign the result back instead of using inplace=True on df['Solar']: the in-place call acts on
# a selection of the DataFrame and, with pandas Copy-on-Write, would not update df.
df['Solar'] = df['Solar'].fillna(df['Solar'].mean())

In [None]:
df.isna().sum()

In [None]:
df['Weather'].value_counts()

In [None]:
df['Weather'].mode()

In [None]:
df['Weather'].mode()[0] # for multimodal values select index 0 mode

In [None]:
# Fill missing Weather values with the mode (index 0 is used when there are several modes).
# Assign the result back instead of using inplace=True on df['Weather']: the in-place call acts
# on a selection of the DataFrame and, with pandas Copy-on-Write, would not update df.
df['Weather'] = df['Weather'].fillna(df['Weather'].mode()[0])

In [None]:
df.isna().sum()

In [None]:
df['Weather'].value_counts()

# **Outlier Detection**
*   Histogram
*   Boxplot
*   Descriptive Statistics



In [None]:
df.describe()
# getting idea that Ozone and Wind columns have outliers as huge gap in between 75% and max value

In [None]:
df.hist()
plt.tight_layout()
# same outlier information is visible here. Confirm with boxplot

In [None]:
df.boxplot()

In [None]:
sns.boxplot(x = df['Ozone'])

# **Outlier Treatment**

*   Capping: Replacing outlier values is called capping
*   In Capping all outlier values will be replaced by upper extreme



In [None]:
def UserFunction():
  """Print a two-line greeting. Takes no arguments and returns None."""
  for message in ("Hello", "Function Introduction"):
    print(message)

In [None]:
UserFunction()

In [None]:
# Deliberate error demo: UserFunction() is defined without parameters, so passing an argument
# raises TypeError. The exception is caught and printed so that running all cells from a fresh
# kernel does not stop here.
try:
  UserFunction(10)
except TypeError as err:
  print("TypeError:", err)

In [None]:
# Deliberate error demo: the built-in len() needs exactly one argument, so calling it with
# none raises TypeError. The exception is caught and printed so the notebook keeps running.
try:
  len()
except TypeError as err:
  print("TypeError:", err)

In [None]:
len(df)

In [None]:
def add(num1,num2):
  """Print the result of ``num1 + num2``. Nothing is returned."""
  total = num1 + num2
  print(total)

In [None]:
# Deliberate error demo: add() requires two arguments, so calling it with one raises TypeError.
# The exception is caught and printed so that running all cells from a fresh kernel does not
# stop here.
try:
  add(10)
except TypeError as err:
  print("TypeError:", err)

In [None]:
add(110,20)

In [None]:
def arithmatic(num1,num2):
  """Print the sum, difference, product and quotient of two numbers.

  Parameters:
    num1: left operand.
    num2: right operand (the divisor for the division line).

  Returns:
    None. The four results are printed, one per line.

  If dividing by num2 raises ZeroDivisionError, the division line reports that the result is
  undefined instead of aborting after the first three lines were already printed.
  """
  print("Addition:",num1 + num2)
  print("Subtraction:",num1 - num2)
  print("Multiplication:",num1 * num2)
  try:
    print("Division:",num1 / num2)
  except ZeroDivisionError:
    print("Division: undefined (division by zero)")

In [None]:
arithmatic(40,4)

In [None]:
"Data" + "Science"

In [None]:
"Data" + " " + "Science"

In [None]:
def addText(text1,text2):
  """Print text1 and text2 concatenated with a single space between them."""
  combined = text1 + " " + text2
  print(combined)

In [None]:
addText("String","Concatenation")

In [None]:
df['Ozone'].quantile(0.25) # lower quartile: q1

In [None]:
df['Ozone'].quantile(0.75) # upper quartile: q3

In [None]:
# Outlier Detection: User defined Function to calculate Upper Extreme and Lower Extreme value
def outlier_detection(data,colname,factor=1.5):
  """Compute the IQR-based outlier bounds (boxplot whisker limits) of one column.

  Parameters:
    data: DataFrame holding the column.
    colname: name of the numeric column to analyse.
    factor: multiplier applied to the IQR when computing the bounds. The default 1.5 is the
      value this function used before the parameter existed.

  Returns:
    Tuple (lower_extreme, upper_extreme, q1, q3) where
    lower_extreme = q1 - factor * IQR and upper_extreme = q3 + factor * IQR.
  """
  q1 = data[colname].quantile(0.25)
  q3 = data[colname].quantile(0.75)
  iqr = q3 - q1

  upper_extreme = q3 + (factor * iqr)
  lower_extreme = q1 - (factor * iqr)

  return lower_extreme, upper_extreme,q1,q3

In [None]:
outlier_detection(df,'Ozone')

In [None]:
outlier_detection(df,'Wind') # try for 'Solar' or other columns

In [None]:
# Additional Code
import plotly.express as pe

In [None]:
# Additional Code
pe.scatter(x = df['Ozone'], y = df['Temperature'])

In [None]:
# extract all values of Ozone column which are greater than upper extreme i.e 81
df[df['Ozone']>81.0]

In [None]:
df[df['Ozone']>81.0].shape[0]

In [None]:
# OR
len(df[df['Ozone']>81.0])

In [None]:
# Now all these 16 outlier values of Ozone column should be replaced by upper extreme
df.loc[df['Ozone']>81.0,'Ozone']

In [None]:
# Capping Outliers of Ozone column
# The upper extreme is taken from outlier_detection() instead of being typed in by hand
# (it is 81.0 according to the earlier cells), so the cap follows the data if the data changes.
_, ozone_upper, _, _ = outlier_detection(df,'Ozone')
df.loc[df['Ozone']>ozone_upper,'Ozone'] = ozone_upper

In [None]:
df[df['Ozone'] > 81.0]
# No values are displayed.. as all outliers are replaced by 81.0

In [None]:
df[df['Ozone'] == 81.0]
# confirm the index values for which outliers were replaced - 21,61,68....
# compare with the index values shown by the earlier cells that listed the rows with Ozone > 81.0

In [None]:
# Do it for 'Wind' column also
outlier_detection(df,'Wind')

In [None]:
sns.boxplot(x = df['Wind'])

In [None]:
# Upper extreme value for Wind column is 17.65 so replace all values of Wind column which are greater than 17.65
df[df['Wind'] > 17.65]

In [None]:
# Capping Outliers of Wind column
# The upper extreme is taken from outlier_detection() instead of being typed in by hand
# (it is 17.65 according to the earlier cells), so the cap follows the data if the data changes.
_, wind_upper, _, _ = outlier_detection(df,'Wind')
df.loc[df['Wind']>wind_upper,'Wind'] = wind_upper

In [None]:
df.boxplot()
# Now there are no outliers in any column

In [None]:
df.shape

# **Scatter Plot and Correlation**

In [None]:
df.head()

In [None]:
sns.pairplot(df) # relationship between all nemerical columns
# check graph of month(x axis) vs Temperature(y axis)
# We got 5 vertical lines in it because datatype of Month is Discrete numerical value. It has only 5 unique discrete numerical values.
# It shows no relationship between month and temperature

In [None]:
df['Weather'].value_counts()

In [None]:
# Correlation Coefficient
df.corr(numeric_only=True)
# Ozone vs Ozone corr is 1.0 so ignore diagonal values
# Ozone vs Temperature corr is 0.66 i.e. positive moderate correlation
# For large no. of columns go for heatmap visualization

In [None]:
sns.heatmap(df.corr(numeric_only=True))
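# See the colour bar at the side: it maps colours to correlation values. By default it spans the smallest to the largest value in the matrix; pass vmin=-1, vmax=1 to heatmap() to show the full -1 to +1 range
# On the colour bar a light colour indicates a strong positive relationship.
# A dark (black) colour indicates the lowest values, i.e. the strongest negative relationship present
# For clear understanding pass parameters to heatmap()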

In [None]:
sns.heatmap(df.corr(numeric_only=True),annot=True)
# colors are representing correlation value also

In [None]:
# we can change color combination also
sns.heatmap(df.corr(numeric_only=True),annot=True,cmap='viridis')

In [None]:
sns.heatmap(df.corr(numeric_only=True),annot=True,cmap='rainbow') # cmap='crest'

# **Transformations**


*   Convert Text data to numeric before model building



In [None]:
df.head()

In [None]:
# Encoding using Pandas dummy function
df = pd.get_dummies(data = df, columns = ['Weather'])

In [None]:
df

# **Scaling the data**

*   Bring down all columns to same scale

*   Normalization: Scales value within the range 0 to 1

*   Standardization: Uses Z score for scaling. Scale values in such a way that the mean=0 and std=1.

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Standardization
sc = StandardScaler() # ZScore=(x-x bar)/std

In [None]:
# sc.fit(): learns the mean and std of every column (e.g. Ozone). It only stores these statistics; no value is changed yet.
# sc.transform(): applies z-score = (x - mean) / std using the stored mean and std and returns a new scaled array.
#           The original table is not modified. e.g. 41.0 in Ozone column becomes its z-score 0.1901 in the returned array
# sc.fit_transform(): does both of the above tasks in one call.

In [None]:
sc.fit_transform(df)

In [None]:
pd.DataFrame(sc.fit_transform(df)) # got table but column names are not visible. Index is there.

In [None]:
df.columns # display all column names in sequence

In [None]:
# convert scaled array into dataframe. For this standardized data mean=0 and std=1
scaled_data = pd.DataFrame(sc.fit_transform(df),columns=df.columns)
scaled_data

In [None]:
# check if mean=0 and std=1
scaled_data.describe()

# **MinMaxScaler()**

In [None]:
mn = MinMaxScaler()

In [None]:
df.columns

In [None]:
minmax_scaled = pd.DataFrame(mn.fit_transform(df), columns=df.columns)
minmax_scaled # converted value is in range of 0 and 1

In [None]:
minmax_scaled.describe() # check min and max values as 0 and 1 respectively

# **Filling special symbols (which are not identified with isnull()) with NaN**

In [None]:
import pandas as pd
import numpy as np

# Sample dataset with "?" as missing values
data = {
    'Name': ['Amit', 'Neha', 'Chirag', '?'],
    'Age': [25, '?', 30, 28],
    'Score': [88, 92, '?', 95]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
df

In [None]:
df.isnull().sum()

In [None]:
df.replace("?", np.nan, inplace=True)
# if we want to replace multiple symbols at a time
# df.replace(["?", "#", "N/A"], np.nan, inplace=True)

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
# Now you can impute missing values

In [None]:
!pip install pandas_profiling==3.1.0
!pip install -U ydata-profiling
!pip install sweetviz

In [None]:
import pandas_profiling as pp
import sweetviz as sv

In [None]:
df = pd.read_csv('data_clean.csv',index_col=0)
df.head(2)



*  For a variable with fewer unique values than a given threshold (say 5), pandas_profiling assumes that the variable is categorical instead of numerical



In [None]:
EDA_report=pp.ProfileReport(df)
EDA_report

In [None]:
data = pd.read_csv("/content/data_clean.csv",index_col=0) # No index column will be displayed

In [None]:
# In your dataset the Year column is not at all important. Its datatype is Integer and it has only 1 unique value, i.e. 2010.
#

EDA_report= pp.ProfileReport(data,vars={"num":{"low_categorical_threshold":0}})
# vars is a parameter of ProfileReport(). Pass a dictionary to vars
# where, for all the numerical columns, low_categorical_threshold is set to 0 (zero),
# i.e. the number of unique values below which a numerical column is treated as categorical is kept as zero.
# So even if a numerical column has very few unique values, it is not converted into a categorical variable; it is kept as numeric


In [None]:
EDA_report

In [None]:
sweet_report = sv.analyze(data)
sweet_report.show_html('weather_report.html')