## Data Cleansing for California Environmental Data

In [2]:
import pandas as pd
from ydata_profiling import ProfileReport

In [6]:
#!pip install liac-arff
import arff

# Parse the ARFF file (liac-arff returns a dict) and turn it into a DataFrame.
with open('CA_enviromental', 'r') as arff_file:
    data = arff.load(arff_file)

# The first element of every attribute entry is used as the column name.
column_names = [attribute[0] for attribute in data['attributes']]
df = pd.DataFrame(data['data'], columns=column_names)

In [7]:
# Quick look at the loaded frame: first rows, then column dtypes / non-null counts.
display(df.head(3))
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the summary.
df.info()

Unnamed: 0,Stn_Id,Stn_Name,CIMIS_Region,Date,ETo_(in),Precip_(in),Sol_Rad_(Ly/day),Avg_Vap_Pres_(mBars),Max_Air_Temp_(F),Min_Air_Temp_(F),Avg_Air_Temp_(F),Max_Rel_Hum_(%),Min_Rel_Hum_(%),Avg_Rel_Hum_(%),Dew_Point_(F),Avg_Wind_Speed_(mph),Wind_Run_(miles),Avg_Soil_Temp_(F),Target
0,2,FivePoints,San Joaquin Valley,1/1/2018,0.06,0.0,219.0,7.3,63.4,35.3,47.8,82.0,46.0,65.0,36.6,3.3,78.3,51.1,0
1,2,FivePoints,San Joaquin Valley,1/2/2018,0.04,0.0,127.0,7.4,59.8,37.7,47.2,80.0,52.0,67.0,36.7,3.1,74.5,51.3,0
2,2,FivePoints,San Joaquin Valley,1/3/2018,0.04,0.0,125.0,8.4,61.1,37.3,49.9,79.0,49.0,68.0,39.9,4.5,107.5,51.3,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128125 entries, 0 to 128124
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Stn_Id                128125 non-null  int64  
 1   Stn_Name              128125 non-null  object 
 2   CIMIS_Region          128125 non-null  object 
 3   Date                  128125 non-null  object 
 4   ETo_(in)              128042 non-null  float64
 5   Precip_(in)           128125 non-null  float64
 6   Sol_Rad_(Ly/day)      128125 non-null  float64
 7   Avg_Vap_Pres_(mBars)  128125 non-null  float64
 8   Max_Air_Temp_(F)      128122 non-null  float64
 9   Min_Air_Temp_(F)      128124 non-null  float64
 10  Avg_Air_Temp_(F)      128120 non-null  float64
 11  Max_Rel_Hum_(%)       128125 non-null  float64
 12  Min_Rel_Hum_(%)       128125 non-null  float64
 13  Avg_Rel_Hum_(%)       128112 non-null  float64
 14  Dew_Point_(F)         128112 non-null  float64
 15  

In [11]:
# Per-column null summary: absolute count next to the share of rows (in percent).
null_mask = df.isna()
missing_percentage = null_mask.mean() * 100
missing_info = pd.DataFrame({
    'Missing Values': null_mask.sum(),
    'Missing Percentage': missing_percentage
})
print(missing_info)
# Observation: no missing data for the response, and very few missing values for 7 features.

                      Missing Values  Missing Percentage
Stn_Id                             0            0.000000
Stn_Name                           0            0.000000
CIMIS_Region                       0            0.000000
Date                               0            0.000000
ETo_(in)                          83            0.064780
Precip_(in)                        0            0.000000
Sol_Rad_(Ly/day)                   0            0.000000
Avg_Vap_Pres_(mBars)               0            0.000000
Max_Air_Temp_(F)                   3            0.002341
Min_Air_Temp_(F)                   1            0.000780
Avg_Air_Temp_(F)                   5            0.003902
Max_Rel_Hum_(%)                    0            0.000000
Min_Rel_Hum_(%)                    0            0.000000
Avg_Rel_Hum_(%)                   13            0.010146
Dew_Point_(F)                     13            0.010146
Avg_Wind_Speed_(mph)               0            0.000000
Wind_Run_(miles)               

In [17]:
# Clean-up in one pass:
#   1. drop every row containing a missing value (these make up less than 0.1%
#      of the data, so removing them is preferred over imputing),
#   2. drop exact duplicate rows.
df = (
    df
    .dropna()
    .drop_duplicates()
)

In [18]:
# Generate a basic profiling report for the cleaned frame and export it as HTML.
report_title = "CA_Environmental_Profiling_Report"
report_path = "CA_Environmental_Profiling_Report.html"

profile = ProfileReport(df, title=report_title)
profile.to_file(report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Persist the cleaned data as CSV; the DataFrame index is not written out.
cleaned_csv_path = 'Cleaned_CA_Env_Data.csv'
df.to_csv(cleaned_csv_path, index=False)