In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the air-quality CSV: the first row supplies the column names and the
# first column becomes the row index.
# NOTE(review): the recorded previews of this frame show rows whose values are
# shifted by one column and end in an empty x5, which looks like decimal commas
# being split as field separators -- TODO confirm against the raw file.
df = pd.read_csv(
    'AirQualityUCI2.csv',
    index_col=0,
    header=0,
    sep=',',
)

### Renaming the headers

In [3]:
# Mapping from the original CSV headers to short lowercase names.
# Note that 'nhmcgt' / 'pt08s2nhmc' and 'pt08s503' do not mirror the source
# headers exactly (NMHC, O3); later cells refer to these exact spellings, so
# any correction has to be applied everywhere at once.
col_names = {
    'Date': 'date',
    'Time': 'time',
    'CO(GT)': 'cogt',
    'PT08.S1(CO)': 'pt08s1co',
    'NMHC(GT)': 'nhmcgt',
    'C6H6(GT)': 'c6h6gt',
    'PT08.S2(NMHC)': 'pt08s2nhmc',
    'NOx(GT)': 'noxgt',
    'PT08.S3(NOx)': 'pt08s3nox',
    'NO2(GT)': 'no2gt',
    'PT08.S4(NO2)': 'pt08s4no2',
    'PT08.S5(O3)': 'pt08s503',
    'T': 't',
    'RH': 'rh',
    'AH': 'ah',
    'X1': 'x1',
    'X2': 'x2',
    'X3': 'x3',
    'X4': 'x4',
    'X5': 'x5',
}

In [4]:
# Apply the header mapping to the column labels. The dates are held in the
# index rather than in a column (see the preview below), so the 'Date' entry
# of the mapping has no effect here.
df = df.rename(col_names, axis='columns')

In [5]:
# Preview the first five rows to check the renamed columns.
df.head(n=5)

Unnamed: 0_level_0,time,cogt,pt08s1co,nhmcgt,c6h6gt,pt08s2nhmc,noxgt,pt08s3nox,no2gt,pt08s4no2,pt08s503,t,rh,ah,x1,x2,x3,x4,x5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10/03/2004,18.00.00,2,6,1360,150,11,9,1046,166,1056,113,1692,1268,13,6,48.0,9.0,0.0,7578.0
10/03/2004,19.00.00,2,1292,112,9,4,955,103,1174,92,1559,972,13,3,47,7.0,0.0,7255.0,
10/03/2004,20.00.00,2,2,1402,88,9,0,939,131,1140,114,1555,1074,11,9,54.0,0.0,0.0,7502.0
10/03/2004,21.00.00,2,2,1376,80,9,2,948,172,1092,122,1584,1203,11,0,60.0,0.0,0.0,7867.0
10/03/2004,22.00.00,1,6,1272,51,6,5,836,131,1205,116,1490,1110,11,2,59.0,6.0,0.0,7888.0


### Data Cleaning

In [6]:
# Look at a longer slice (first 20 rows) to spot suspicious values before cleaning.
df.head(n=20)

Unnamed: 0_level_0,time,cogt,pt08s1co,nhmcgt,c6h6gt,pt08s2nhmc,noxgt,pt08s3nox,no2gt,pt08s4no2,pt08s503,t,rh,ah,x1,x2,x3,x4,x5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10/03/2004,18.00.00,2,6,1360,150,11,9,1046,166,1056,113,1692,1268,13,6,48.0,9.0,0.0,7578.0
10/03/2004,19.00.00,2,1292,112,9,4,955,103,1174,92,1559,972,13,3,47,7.0,0.0,7255.0,
10/03/2004,20.00.00,2,2,1402,88,9,0,939,131,1140,114,1555,1074,11,9,54.0,0.0,0.0,7502.0
10/03/2004,21.00.00,2,2,1376,80,9,2,948,172,1092,122,1584,1203,11,0,60.0,0.0,0.0,7867.0
10/03/2004,22.00.00,1,6,1272,51,6,5,836,131,1205,116,1490,1110,11,2,59.0,6.0,0.0,7888.0
10/03/2004,23.00.00,1,2,1197,38,4,7,750,89,1337,96,1393,949,11,2,59.0,2.0,0.0,7848.0
11/03/2004,00.00.00,1,2,1185,31,3,6,690,62,1462,77,1333,733,11,3,56.0,8.0,0.0,7603.0
11/03/2004,01.00.00,1,1136,31,3,3,672,62,1453,76,1333,730,10,7,60,0.0,0.0,7702.0,
11/03/2004,02.00.00,0,9,1094,24,2,3,609,45,1579,60,1276,620,10,7,59.0,7.0,0.0,7648.0
11/03/2004,03.00.00,0,6,1010,19,1,7,561,-200,1705,-200,1235,501,10,3,60.0,2.0,0.0,7517.0


In [7]:
# Treat non-positive CO readings as missing: values > 0 are kept, everything
# else (zeros and negatives alike) becomes NaN. Vectorized equivalent of
# testing each element in a Python loop.
df['cogt'] = df['cogt'].where(df['cogt'] > 0)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; a quick way to spot placeholder values such as a minimum of -200.
df.describe()

Unnamed: 0,cogt,pt08s1co,nhmcgt,c6h6gt,pt08s2nhmc,noxgt,pt08s3nox,no2gt,pt08s4no2,pt08s503,t,rh,ah,x1,x2,x3,x4,x5
count,6118.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9296.0,8991.0,8991.0,6915.0
mean,2.133377,240.766912,771.191621,-117.529016,1.601047,210.169285,678.366784,365.927007,589.606284,416.914396,1264.640697,763.919419,5.715293,6.279256,30.773128,3.678456,1206.577244,5245.150253
std,1.357061,455.94499,598.477832,147.670771,37.649516,408.532426,507.793366,351.573404,469.129574,623.40803,494.13376,581.462055,42.535162,46.226887,48.671023,3.037061,2639.347164,2871.284852
min,1.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,0.0,0.0,3.0
25%,1.0,3.0,39.0,-200.0,3.0,2.0,383.0,110.0,145.0,84.0,967.0,35.0,6.0,2.0,9.0,1.0,0.0,2990.0
50%,2.0,6.0,981.0,-200.0,6.0,6.0,791.0,247.0,695.0,124.0,1312.0,816.0,12.0,6.0,40.0,3.0,1.0,5341.0
75%,3.0,9.0,1171.0,4.0,11.0,9.0,1040.0,600.0,891.0,212.0,1595.0,1177.0,20.0,9.0,57.0,6.0,1.0,7684.5
max,11.0,2040.0,2008.0,1189.0,63.0,1889.0,2214.0,2331.0,2683.0,2684.0,2775.0,2523.0,44.0,87.0,88.0,9.0,9996.0,9998.0


In [9]:
# Columns in which non-positive readings are treated as missing.
# x3, x4 and x5 are not masked, and neither is the non-numeric `time` column.
columns_to_mask = [
    'cogt', 'pt08s1co', 'nhmcgt', 'c6h6gt', 'pt08s2nhmc', 'noxgt',
    'pt08s3nox', 'no2gt', 'pt08s4no2', 'pt08s503', 't', 'rh', 'ah',
    'x1', 'x2',
]

# Keep values > 0 and replace everything else with NaN. This removes the -200
# placeholder seen in the summary statistics, but it also discards zeros and
# any other negative values.
# NOTE(review): confirm that zero / negative readings (e.g. for `t`) are never
# legitimate before relying on this rule.
for column in columns_to_mask:
    df[column] = df[column].where(df[column] > 0)

In [10]:
# Re-check the summary statistics after masking: counts now differ per column
# because non-positive values were replaced with NaN.
df.describe()

Unnamed: 0,cogt,pt08s1co,nhmcgt,c6h6gt,pt08s2nhmc,noxgt,pt08s3nox,no2gt,pt08s4no2,pt08s503,t,rh,ah,x1,x2,x3,x4,x5
count,6118.0,8981.0,7035.0,2846.0,8626.0,8246.0,7808.0,8901.0,7806.0,8900.0,8991.0,8991.0,8752.0,8305.0,8789.0,8991.0,8991.0,6915.0
mean,2.133377,252.205322,1091.746979,69.459241,8.80837,239.965317,852.622695,394.91956,746.495772,448.59191,1324.262373,803.158047,14.474863,15.888621,39.488793,3.678456,1206.577244,5245.150253
std,1.357061,461.634294,249.555878,145.296737,6.950894,426.181027,354.35314,335.688316,339.562703,622.934676,404.004376,559.013251,9.313988,20.894686,23.652949,3.037061,2639.347164,2871.284852
min,1.0,1.0,10.0,1.0,1.0,1.0,4.0,2.0,8.0,2.0,297.0,1.0,1.0,1.0,1.0,0.0,0.0,3.0
25%,1.0,4.0,936.0,5.0,4.0,4.0,661.0,127.0,577.25,90.0,1011.0,385.0,7.0,3.0,20.0,1.0,0.0,2990.0
50%,2.0,6.0,1067.0,11.0,7.0,7.0,868.0,267.0,760.0,128.0,1336.0,837.0,13.0,6.0,42.0,3.0,1.0,5341.0
75%,3.0,9.0,1238.0,41.75,12.0,477.25,1090.0,625.0,934.0,229.0,1607.0,1192.5,21.0,9.0,58.0,6.0,1.0,7684.5
max,11.0,2040.0,2008.0,1189.0,63.0,1889.0,2214.0,2331.0,2683.0,2684.0,2775.0,2523.0,44.0,87.0,88.0,9.0,9996.0,9998.0


In [11]:
# Keep only the rows in which every column holds a value.
# NOTE(review): according to the recorded outputs this leaves 501 of the
# original 9357 rows -- confirm that discarding this much data is intended.
df = df.dropna(axis='index', how='any')


In [12]:
# Summary of a single column after dropping incomplete rows.
df.loc[:, 'noxgt'].describe()

count    501.000000
mean       5.059880
std        2.587742
min        1.000000
25%        3.000000
50%        5.000000
75%        7.000000
max        9.000000
Name: noxgt, dtype: float64

#### We can now assume our data is clean. 

### Data integration

Data integration means combining two or more datasets into a single one.

In pandas this can be done with the `concat` function (`pd.concat`). Only one dataset is loaded in this notebook, so no concatenation is performed here.

In [13]:
# Preview the cleaned frame (first five rows).
df.head(n=5)


Unnamed: 0_level_0,time,cogt,pt08s1co,nhmcgt,c6h6gt,pt08s2nhmc,noxgt,pt08s3nox,no2gt,pt08s4no2,pt08s503,t,rh,ah,x1,x2,x3,x4,x5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10/03/2004,18.00.00,2.0,6.0,1360.0,150.0,11.0,9.0,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.0,6.0,48.0,9.0,0.0,7578.0
10/03/2004,22.00.00,1.0,6.0,1272.0,51.0,6.0,5.0,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.0,2.0,59.0,6.0,0.0,7888.0
10/03/2004,23.00.00,1.0,2.0,1197.0,38.0,4.0,7.0,750.0,89.0,1337.0,96.0,1393.0,949.0,11.0,2.0,59.0,2.0,0.0,7848.0
11/03/2004,00.00.00,1.0,2.0,1185.0,31.0,3.0,6.0,690.0,62.0,1462.0,77.0,1333.0,733.0,11.0,3.0,56.0,8.0,0.0,7603.0
11/03/2004,07.00.00,1.0,1.0,1144.0,29.0,3.0,2.0,667.0,98.0,1490.0,82.0,1339.0,730.0,10.0,2.0,59.0,6.0,0.0,7417.0


In [14]:
# Inspect the dtype of each column to see which ones are already numeric.
df.dtypes

time           object
cogt          float64
pt08s1co      float64
nhmcgt        float64
c6h6gt        float64
pt08s2nhmc    float64
noxgt         float64
pt08s3nox     float64
no2gt         float64
pt08s4no2     float64
pt08s503      float64
t             float64
rh            float64
ah            float64
x1            float64
x2            float64
x3            float64
x4            float64
x5            float64
dtype: object

### Data Transformation

Data transformation converts the data types of columns as needed. Here every column except `time` (dtype `object`) is already `float64`, which is convenient for building the numeric feature matrix used in machine learning.

### Data Model Building

In [15]:
# Load the iris dataset bundled with scikit-learn.
# NOTE(review): the model below is trained on iris, not on the cleaned
# air-quality frame `df` prepared above -- confirm this is intended.
iris = datasets.load_iris()


In [16]:
# Gaussian naive Bayes classifier created with its default settings.
gnb = GaussianNB()

In [17]:
# Fit the classifier on all iris samples (features -> species labels).
# scikit-learn's fit() returns the estimator, so `train` refers to the fitted model.
train = gnb.fit(X=iris.data, y=iris.target)

In [18]:
# Predict labels for the same samples the model was fitted on.
pred = train.predict(X=iris.data)

In [19]:
# Percentage of samples whose predicted label matches the true label.
# `pred` was produced from the same samples used for fitting, so this is
# training accuracy rather than an estimate of performance on unseen data.
total = iris.data.shape[0]
success = (iris.target == pred).sum()
accuracy_pct = (success / total) * 100
print("Accuracy = " + str(accuracy_pct))

Accuracy = 96.0
