In [11]:
import numpy as np
import pandas as pd

### AirQualityUCI Dataset

In [None]:
# Info on this set can be found @ https://archive.ics.uci.edu/ml/datasets/air+quality

In [None]:
air_qual_data = pd.read_csv('./raw_data/AirQualityUCI_fixed.csv')

In [None]:
air_qual_data

In [None]:
# Converting Date and Time to a numeric representation 

air_qual_data = air_qual_data.drop(["Date", "Time"], axis=1)
air_qual_data.insert(0, "Time", air_qual_data.index) 

In [None]:
air_qual_data.isnull().values.any()

In [None]:
# Calculating % of missing data for each feature
# As per the data set info missing values are tagged with -200. 

total_rows = air_qual_data.shape[0]

for each in range(0, air_qual_data.shape[1]):
    print(each, air_qual_data.columns[each] + ":  ", (air_qual_data.iloc[:, each] == -200).sum() / total_rows)

In [None]:
# From above we can see that several feature have above 10% missing data so we will go ahead and discard those feature because they are not reliable enough to use for modeling. 

air_qual_data = air_qual_data.drop(air_qual_data.iloc[:, [1, 3, 6, 8]], axis=1)

In [None]:
# Now since the number of row for the remaining feature is fairly large we will just drop the rows that have -200

air_qual_data = air_qual_data.replace([-200], np.nan)
air_qual_data = air_qual_data.dropna()

In [None]:
# Checking to make sure all -200 values rows are removed

total_rows = air_qual_data.shape[0]

for each in range(0, air_qual_data.shape[1]):
    print(each, air_qual_data.columns[each] + ":  ", (air_qual_data.iloc[:, each] == -200).sum() / total_rows)

In [None]:
# Checking to make sure all NaN values rows are removed

total_rows = air_qual_data.shape[0]

for each in range(0, air_qual_data.shape[1]):
    print(each, air_qual_data.columns[each] + ":  ", (air_qual_data.iloc[:, each] == None).sum())

In [None]:
air_qual_data = air_qual_data.reset_index(drop=True)

In [None]:
air_qual_data

In [None]:
target = air_qual_data.pop("PT08.S4(NO2)")
air_qual_data.insert(9, "PT08.S4(NO2)", target)

In [None]:
# Output to .csv in cleaned_data directory

air_qual_data.to_csv('./cleaned_data/AirQualityUCI_fixed_cleaned.csv')

### AutoMPG Dataset

In [None]:
# Info on this set can be found @ https://archive.ics.uci.edu/ml/datasets/auto+mpg

In [None]:
auto_mpg_data = pd.read_csv('./raw_data/auto-mpg_fixed.csv')

In [None]:
auto_mpg_data.isnull().values.any()

In [None]:
# Since I got the data set from the original website -> https://archive.ics.uci.edu/ml/datasets/auto+mpg data was in wrong format and had missing values
# Following code fixes that

In [None]:
auto_mpg_data.drop(auto_mpg_data.loc[auto_mpg_data['horsepower']=='?'].index, inplace=True)

In [None]:
auto_mpg_data.drop(columns='car_name', inplace=True)

In [None]:
df1 = auto_mpg_data.pop('mpg')
auto_mpg_data['mpg'] = df1

In [None]:
auto_mpg_data

In [None]:
auto_mpg_data.to_csv('./cleaned_data/auto_mpg_fixed_cleaned.csv')

### forestfires Dataset

In [None]:
# Info on this set can be found @ https://archive.ics.uci.edu/ml/datasets/forest+fires

In [None]:
# Need to convert categorical data to numeric data
# Month and Day columns must be converted
# For month
# jan = 1, feb = 2, mar = 3, ..., etc.
# For day
# mon = 1, tue = 2, wed = 3, ..., etc. 

In [None]:
# Area is the target value as per the original dataset notes

""" 13. area - the burned area of the forest (in ha): 0.00 to 1090.84
(this output variable is very skewed towards 0.0, thus it may make
sense to model with the logarithm transform). """

In [2]:
forest_fires_data = pd.read_csv('./raw_data/forestfires.csv')

In [3]:
forest_fires_data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [4]:
forest_fires_data.isnull().values.any()

False

In [5]:
forest_fires_data['month'].unique()

array(['mar', 'oct', 'aug', 'sep', 'apr', 'jun', 'jul', 'feb', 'jan',
       'dec', 'may', 'nov'], dtype=object)

In [6]:
# For months

forest_fires_data = forest_fires_data.replace("jan", 1)
forest_fires_data = forest_fires_data.replace("feb", 2)
forest_fires_data = forest_fires_data.replace("mar", 3)
forest_fires_data = forest_fires_data.replace("apr", 4)
forest_fires_data = forest_fires_data.replace("may", 5)
forest_fires_data = forest_fires_data.replace("jun", 6)
forest_fires_data = forest_fires_data.replace("jul", 7)
forest_fires_data = forest_fires_data.replace("aug", 8)
forest_fires_data = forest_fires_data.replace("sep", 9)
forest_fires_data = forest_fires_data.replace("oct", 10)
forest_fires_data = forest_fires_data.replace("nov", 11)
forest_fires_data = forest_fires_data.replace("dec", 12)


In [7]:
forest_fires_data['day'].unique()

array(['fri', 'tue', 'sat', 'sun', 'mon', 'wed', 'thu'], dtype=object)

In [8]:
# For months

forest_fires_data = forest_fires_data.replace("mon", 1)
forest_fires_data = forest_fires_data.replace("tue", 2)
forest_fires_data = forest_fires_data.replace("wed", 3)
forest_fires_data = forest_fires_data.replace("thu", 4)
forest_fires_data = forest_fires_data.replace("fri", 5)
forest_fires_data = forest_fires_data.replace("sat", 6)
forest_fires_data = forest_fires_data.replace("sun", 7)


In [19]:
forest_fires_data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,log_area
0,7,5,3,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00,-inf
1,7,4,10,2,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00,-inf
2,7,4,10,6,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00,-inf
3,8,6,3,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00,-inf
4,8,6,3,7,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00,-inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,8,7,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44,1.862529
513,2,4,8,7,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29,3.994340
514,7,4,8,7,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16,2.412336
515,1,4,8,6,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00,-inf


In [10]:
# Preforming log transform of the area column to make results better

In [32]:
forest_fires_data ['log_area'] = np.log(forest_fires_data['area'] * 10000)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
# Replace -Inf values with 0

In [33]:
forest_fires_data.replace(float("-inf"), 0, inplace=True)

In [34]:
forest_fires_data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,log_area
0,7,5,3,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00,0.000000
1,7,4,10,2,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00,0.000000
2,7,4,10,6,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00,0.000000
3,8,6,3,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00,0.000000
4,8,6,3,7,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,8,7,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44,11.072869
513,2,4,8,7,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29,13.204680
514,7,4,8,7,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16,11.622676
515,1,4,8,6,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00,0.000000


In [35]:
forest_fires_data.to_csv('./cleaned_data/forestfires_cleaned.csv')

### Bike Sharing Hourly Data Set

In [None]:
# Info on this set can be found @ https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
# Target variable: "cnt" column -> number of bikes in use

In [None]:
bike_sharing_df = pd.read_csv('./raw_data/og_data/bike_sharing_hour.csv')

In [None]:
bike_sharing_df.isnull().values.any()

In [None]:
bike_sharing_df.drop(["instant", "dteday"], axis = 1, inplace=True)

In [None]:
bike_sharing_df

In [None]:
# I visually inspected the data in excel and it looks fine so I am going to export to cleaned_data folder

In [None]:
bike_sharing_df.to_csv('./cleaned_data/bike_sharing_hour_tmp.csv')

### Combined Cycle Power Plant Data Set (CCPP)

In [None]:
# Info on this set can be found @ https://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant
# Target variable: "PE" column -> basically exectric output of the power plant

In [None]:
CCPP_df = pd.read_csv('./raw_data/og_data/CCPP.csv')

In [None]:
CCPP_df.isnull().values.any()

In [None]:
# I visually inspected the data in excel and it looks fine so I am going to export to cleaned_data folder

In [None]:
CCPP_df.to_csv('./cleaned_data/CCPP.csv')

### White Wine Quality Data Set

In [None]:
# Info on this set can be found @ https://archive.ics.uci.edu/ml/datasets/wine+quality
# Target variable: "quality" column -> quality score of the wine

In [None]:
wine_quality_df = pd.read_csv('./raw_data/winequality-white_fixed.csv')

In [None]:
wine_quality_df.isnull().values.any()

In [None]:
# I visually inspected the data in excel and it looks fine so I am going to export to cleaned_data folder

In [None]:
wine_quality_df.to_csv('./cleaned_data/winequality-white_fixed.csv')