In [13]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn import preprocessing,model_selection
from scipy.stats import pearsonr,zscore
import xgboost as xgb
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import GridSearchCV

In [14]:
trn_data= pd.read_excel(r"C:\Users\RONIT\Downloads\AirQualityUCI\AirQualityUCI.xlsx")

In [4]:
trn_data.describe()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,-34.207524,1048.869652,-159.090093,1.865576,894.475963,168.6042,794.872333,58.135898,1391.363266,974.951534,9.7766,39.483611,-6.837604
std,77.65717,329.817015,139.789093,41.380154,342.315902,257.424561,321.977031,126.931428,467.192382,456.922728,43.203438,51.215645,38.97667
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,0.6,921.0,-200.0,4.004958,711.0,50.0,637.0,53.0,1184.75,699.75,10.95,34.05,0.692275
50%,1.5,1052.5,-200.0,7.886653,894.5,141.0,794.25,96.0,1445.5,942.0,17.2,48.55,0.976823
75%,2.6,1221.25,-200.0,13.636091,1104.75,284.2,960.25,133.0,1662.0,1255.25,24.075,61.875,1.296223
max,11.9,2039.75,1189.0,63.741476,2214.0,1479.0,2682.75,339.7,2775.0,2522.75,44.6,88.725,2.231036


In [15]:
# missing values handle, removing feature columns which have more than 50% of missing values 
missing_columns = pd.DataFrame(trn_data.isnull().sum())
missing_columns.columns = ['Missing']
missing_columns.sort_values(by=['Missing'],ascending=False, inplace=True)
missing_columns = missing_columns[missing_columns.Missing>0]
missingList=missing_columns.index.tolist()
for col in missingList:
    print('{} : {:.2f} %.'.format(col, trn_data[col].isnull().sum()/len(trn_data)*100))    

In [16]:
# calculate Pearson's correlation, remove the features which have 
# less than +/-0.5 correlation with the dependent varriable
numeric_col=trn_data.select_dtypes(include=[np.number]).columns.tolist()
numeric_data=trn_data[numeric_col]
col_drop=[]
for cols in numeric_col:
    corr, _ = pearsonr(numeric_data[[cols]], numeric_data[['T']])
    if np.round(np.abs(corr[0]),2)<0.5:
        col_drop.append(cols)
trn_data.drop(columns=col_drop,inplace=True)        

In [17]:
# calculate the Z value and remove data where Z<+/-3
z_thresh = 3
constrains = trn_data.select_dtypes(include=[np.number]) \
        .apply(lambda x: np.abs(zscore(x)) < z_thresh, reduce=False) \
        .all(axis=1)
# Drop (inplace) values set to be rejected
trn_data.drop(trn_data.index[~constrains], inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# extracting and restructuring Date and Time features 
trn_data['year'] =   pd.DatetimeIndex(trn_data['Date']).year
trn_data['month'] = pd.DatetimeIndex(trn_data['Date']).month
trn_data['Time'] = trn_data['Time'].astype(str)
trn_data['Hour'] =  trn_data['Time'].str.split(':').str[0]
trn_data['Hour'] = trn_data['Hour'].astype(int)

In [19]:
# spliting the data into train and test set
trn_data.drop(columns=['Date','Time'],inplace=True)
x_train,x_test,y_train,y_test=model_selection.train_test_split(trn_data.drop(columns=['T']),trn_data[['T']],test_size=0.2)

In [10]:
x_train.head()

Unnamed: 0,PT08.S1(CO),C6H6(GT),PT08.S2(NMHC),PT08.S3(NOx),PT08.S4(NO2),PT08.S5(O3),RH,AH,year,month,Hour
935,1157.25,7.664791,885.25,916.25,1590.25,783.75,54.65,1.07995,2004,4,17
9047,1147.0,8.350804,913.5,632.75,1232.5,800.75,29.225,0.805458,2005,3,17
857,1454.25,17.508072,1225.0,660.5,1808.75,1654.25,42.125,0.811962,2004,4,11
7299,1371.25,13.636091,1104.75,552.5,1428.5,1580.5,63.550001,0.968104,2005,1,21
625,1499.25,17.994017,1239.25,650.75,1956.5,1358.75,47.475,1.019926,2004,4,19


In [20]:
# creating an xtreame gradient boost regression model
regressor=xgb.XGBRegressor(gamma=0,max_depth=10, 
                           n_estimators=100,reg_alpha=0, 
                           reg_lambda=1,objective='reg:squarederror'
                          ,subsample=0.5)
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

In [21]:
print('MSE:',mean_squared_error(y_true=np.array(y_test), y_pred=y_pred))
print("R2 score:",r2_score(y_true=y_test, y_pred=y_pred))

MSE: 0.06604251374582559
R2 score: 0.9991350756112548
