In [2]:
import pandas as pd

air = pd.read_csv('airquality.csv')
air.head()

Unnamed: 0.1,Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,1,41.0,190.0,7.4,67,5,1
1,2,36.0,118.0,8.0,72,5,2
2,3,12.0,149.0,12.6,74,5,3
3,4,18.0,313.0,11.5,62,5,4
4,5,,,14.3,56,5,5


In [3]:
ozone_median = air['Ozone'].median()
print("Ozone Median is:", ozone_median)

ozone_std = air['Ozone'].std()
print("Ozone standard deviation is:", ozone_std)


print()
print("For Solar.R: ")
solar_median = air['Solar.R'].median()
print("Solar Median is:", solar_median)

solar_std = air['Solar.R'].std()
print("Solar standard deviation is:", solar_std)

Ozone Median is: 31.5
Ozone standard deviation is: 32.98788451443395

For Solar.R: 
Solar Median is: 205.0
Solar standard deviation is: 90.05842222838167


In [4]:
na_values = air.isnull().sum()
print("NA values in each column are:", na_values)


NA values in each column are: Unnamed: 0     0
Ozone         37
Solar.R        7
Wind           0
Temp           0
Month          0
Day            0
dtype: int64


In [5]:
# Filling null values with mean, mode and median

# air["Solar.R"].fillna(solar_mean, inplace = True)
# air["Solar.R"].fillna(solar_median, inplace = True)
air["Solar.R"].fillna(solar_mode[1], inplace = True)

print("Filling na values in solar column with its mode.")
print(air)
print("Null values:", air["Solar.R"].isnull().sum())

NameError: name 'solar_mode' is not defined

In [6]:
# Data Transformation
air['Solar.Danger'] = air['Solar.R'].apply(lambda x: "Yes" if x > 100 else "No")
print(air.head())
print(air.tail())

   Unnamed: 0  Ozone  Solar.R  Wind  Temp  Month  Day Solar.Danger
0           1   41.0    190.0   7.4    67      5    1          Yes
1           2   36.0    118.0   8.0    72      5    2          Yes
2           3   12.0    149.0  12.6    74      5    3          Yes
3           4   18.0    313.0  11.5    62      5    4          Yes
4           5    NaN      NaN  14.3    56      5    5           No
     Unnamed: 0  Ozone  Solar.R  Wind  Temp  Month  Day Solar.Danger
148         149   30.0    193.0   6.9    70      9   26          Yes
149         150    NaN    145.0  13.2    77      9   27          Yes
150         151   14.0    191.0  14.3    75      9   28          Yes
151         152   18.0    131.0   8.0    76      9   29          Yes
152         153   20.0    223.0  11.5    68      9   30          Yes


In [8]:
bins = [x for x in range(0, 399, 50)]
print(bins)

labels = [f"{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)]
# labels = ['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350']
print(labels)
air['Solar.R.Interval'] = pd.cut(air['Solar.R'], bins = bins, labels = labels, include_lowest = True)
print(air.head())

[0, 50, 100, 150, 200, 250, 300, 350]
['0-50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350']
   Unnamed: 0  Ozone  Solar.R  Wind  Temp  Month  Day Solar.Danger  \
0           1   41.0    190.0   7.4    67      5    1          Yes   
1           2   36.0    118.0   8.0    72      5    2          Yes   
2           3   12.0    149.0  12.6    74      5    3          Yes   
3           4   18.0    313.0  11.5    62      5    4          Yes   
4           5    NaN      NaN  14.3    56      5    5           No   

  Solar.R.Interval  
0          150-200  
1          100-150  
2          100-150  
3          300-350  
4              NaN  


In [9]:
# Replace Month number with month name

month_mapping = {5: 'may', 6: 'june', 7: 'july', 8: 'august', 9: 'september'}
air['Month'] = air['Month'].map(month_mapping)
print(air.head())
print(air.tail())

   Unnamed: 0  Ozone  Solar.R  Wind  Temp Month  Day Solar.Danger  \
0           1   41.0    190.0   7.4    67   may    1          Yes   
1           2   36.0    118.0   8.0    72   may    2          Yes   
2           3   12.0    149.0  12.6    74   may    3          Yes   
3           4   18.0    313.0  11.5    62   may    4          Yes   
4           5    NaN      NaN  14.3    56   may    5           No   

  Solar.R.Interval  
0          150-200  
1          100-150  
2          100-150  
3          300-350  
4              NaN  
     Unnamed: 0  Ozone  Solar.R  Wind  Temp      Month  Day Solar.Danger  \
148         149   30.0    193.0   6.9    70  september   26          Yes   
149         150    NaN    145.0  13.2    77  september   27          Yes   
150         151   14.0    191.0  14.3    75  september   28          Yes   
151         152   18.0    131.0   8.0    76  september   29          Yes   
152         153   20.0    223.0  11.5    68  september   30          Yes   

  

In [11]:
# Replace Month number with month name using for loop
air_loop = air.copy()
for index, row in air_loop.iterrows():
    air_loop.at[index, 'Month'] = month_mapping.get(row['Month'])
print(air.head())
print(air.tail())

   Unnamed: 0  Ozone  Solar.R  Wind  Temp Month  Day Solar.Danger  \
0           1   41.0    190.0   7.4    67   may    1          Yes   
1           2   36.0    118.0   8.0    72   may    2          Yes   
2           3   12.0    149.0  12.6    74   may    3          Yes   
3           4   18.0    313.0  11.5    62   may    4          Yes   
4           5    NaN      NaN  14.3    56   may    5           No   

  Solar.R.Interval  
0          150-200  
1          100-150  
2          100-150  
3          300-350  
4              NaN  
     Unnamed: 0  Ozone  Solar.R  Wind  Temp      Month  Day Solar.Danger  \
148         149   30.0    193.0   6.9    70  september   26          Yes   
149         150    NaN    145.0  13.2    77  september   27          Yes   
150         151   14.0    191.0  14.3    75  september   28          Yes   
151         152   18.0    131.0   8.0    76  september   29          Yes   
152         153   20.0    223.0  11.5    68  september   30          Yes   

  

In [None]:
# Remove duplicate rows

air_no_duplicates = air.drop_duplicates()

print("Number of duplicated rows:", air.duplicated().sum())
print("number of rows after removing duplicates:", len(air_no_duplicates))

In [None]:
# change data type

print(air.dtypes)
print(air.head())
air['Wind'] = air['Wind'].astype(int)

print("After data type conversion: ")
print(air.head())
print(air.dtypes)

In [None]:
# Normalize the 'ozone' and 'solar.r' columns

from sklearn.preprocessing import MinMaxScaler

print(air[['Ozone', 'Solar.R']].head())
scaler = MinMaxScaler()
air[['Ozone', 'Solar.R']] = scaler.fit_transform(air[['Ozone', 'Solar.R']])

print("\nAfter Scaling: ")
print(air[['Ozone', 'Solar.R']].head())

In [None]:
# Fix Data Inconsistencies

print(air['Month'].unique())

air['Month'] = air['Month'].str.capitalize()

print(air['Month'].unique())

In [None]:
air_cleaned = air.drop(['Month'], axis = 1)

print(air_cleaned.columns)

In [None]:
# inspecting "Ozone"
print("For Ozone: ")
ozone_mean = air['Ozone'].mean()
print("Mean:", ozone_mean)

ozone_median = air['Ozone'].median()
print("Median:", ozone_median)

ozone_mode = air['Ozone'].mode()
print("Mode:", ozone_mode)

ozone_min = air['Ozone'].min()
ozone_max = air['Ozone'].max()
print("Ozone minimum and maximum values are:", ozone_min, "and", ozone_max)

ozone_std = air['Ozone'].std()
print("Ozone standard deviation is:", ozone_std)


print()
print("For Solar.R: ")
solar_mean = air['Solar.R'].mean()
print("Mean:", solar_mean)

solar_median = air['Solar.R'].median()
print("Median:", solar_median)

solar_mode = air['Solar.R'].mode()
print("Mode:", solar_mode)

solar_min = air['Solar.R'].min()
solar_max = air['Solar.R'].max()
print("Solar.R minimum and maximum values are:", solar_min, "and", solar_max)

solar_std = air['Solar.R'].std()
print("Solar radiation standard deviation is:", solar_std)


In [None]:
# Data cleaning
air_cleaned = air.dropna(subset=['Ozone'])
print("After cleaning: ")
na_values = air_cleaned.isnull().sum()
print("NA values in each column are:", na_values)

summary_air_cleaned = air_cleaned['Ozone'].info()
print("Summary:\n", summary_air_cleaned)

In [None]:
# Data cleaning
air_cleaned_notna = air

for index in range(len(air)):
    if (pd.isna(air_cleaned_notna.loc[index, 'Ozone'])):
        air_cleaned_notna.drop(index, inplace = True)


na_values = air_cleaned_notna.isnull().sum()
print("After cleaning: ")
print("NA values in each column are:", na_values)


In [None]:
# Data cleaning
air_cleaned = air.dropna(subset=['Solar.R'])
print("After cleaning: ")
na_values = air_cleaned.isnull().sum()
print("NA values in each column are:", na_values)
print()

summary_air_cleaned = air_cleaned['Solar.R'].info()
print("Summary:\n", summary_air_cleaned)
print("First five: \n", air_cleaned.head())