In [None]:
import numpy as np
import pandas as pd

### AirQualityUCI Dataset

In [None]:
air_qual_data = pd.read_csv('./raw_data/AirQualityUCI_fixed.csv')

In [None]:
air_qual_data

In [None]:
air_qual_data.isnull().values.any()

In [None]:
# Calculating % of missing data for each feature
# As per the data set info at https://archive.ics.uci.edu/ml/datasets/air+quality missing values are tagged with -200. 

total_rows = air_qual_data.shape[0]

for each in range(0, air_qual_data.shape[1]):
    print(each, air_qual_data.columns[each] + ":  ", (air_qual_data.iloc[:, each] == -200).sum() / total_rows)

In [None]:
# From above we can see that several feature have above 10% missing data so we will go ahead and discard those feature because they are not reliable enough to use for modeling. 

air_qual_data = air_qual_data.drop(air_qual_data.iloc[:, [2, 4, 7, 9]], axis=1)

In [None]:
# Now since the number of row for the remaining feature is fairly large we will just drop the rows that have -200

air_qual_data = air_qual_data.replace([-200], np.nan)
air_qual_data = air_qual_data.dropna()

In [None]:
# Checking to make sure all -200 values rows are removed

total_rows = air_qual_data.shape[0]

for each in range(0, air_qual_data.shape[1]):
    print(each, air_qual_data.columns[each] + ":  ", (air_qual_data.iloc[:, each] == -200).sum() / total_rows)

In [None]:
# Checking to make sure all NaN values rows are removed

total_rows = air_qual_data.shape[0]

for each in range(0, air_qual_data.shape[1]):
    print(each, air_qual_data.columns[each] + ":  ", (air_qual_data.iloc[:, each] == None).sum())

In [None]:
air_qual_data

In [None]:
# Output to .csv in cleaned_data directory

air_qual_data.to_csv('./cleaned_data/AirQualityUCI_fixed_cleaned.csv')

### AutoMPG Dataset

In [None]:
auto_mpg_data = pd.read_csv('./raw_data/auto-mpg_fixed.csv')

In [None]:
auto_mpg_data.isnull().values.any()

In [None]:
# Since I got the data set from the original website -> https://archive.ics.uci.edu/ml/datasets/auto+mpg data was in wrong format and had missing values
# Following code fixes that

In [None]:
auto_mpg_data.drop(auto_mpg_data.loc[auto_mpg_data['horsepower']=='?'].index, inplace=True)

In [None]:
auto_mpg_data.drop(columns='car_name', inplace=True)

In [None]:
df1 = auto_mpg_data.pop('mpg')
auto_mpg_data['mpg'] = df1

In [None]:
auto_mpg_data

In [None]:
auto_mpg_data.to_csv('./cleaned_data/auto_mpg_fixed_cleaned.csv')

### forestfires Dataset

In [None]:
#forestfires dataset looks good to go as is

In [None]:
forest_fires_data = pd.read_csv('./raw_data/forestfires.csv')

In [None]:
forest_fires_data

In [None]:
forest_fires_data.isnull().values.any()