In [1]:
#Importing the Necessary Libraries 
import pandas as pd 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 

In [3]:
#Creating a Pandas DataFrame from a CSV file
#NOTE(review): the path below is absolute and machine-specific; the notebook
#only runs where this exact file exists.
csv_path = r"K:\DATA SCIENCE\DataSets\TopMentor Datasets\data\data_weather.csv"
data = pd.read_csv(csv_path)
#List the column labels so the available measurements are visible up front.
print("Columns are: ",data.columns)

Columns are:  Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')


In [4]:
#Show the loaded DataFrame as plain text (pandas abbreviates long frames with "..." rows).
print("Data: \n",data) 

Data: 
       number  air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
0          0        918.060000     74.822000              271.100000   
1          1        917.347688     71.403843              101.935179   
2          2        923.040000     60.638000               51.000000   
3          3        920.502751     70.138895              198.832133   
4          4        921.160000     44.294000              277.800000   
...      ...               ...           ...                     ...   
1090    1090        918.900000     63.104000              192.900000   
1091    1091        918.710000     49.568000              241.600000   
1092    1092        916.600000     71.096000              189.300000   
1093    1093        912.600000     58.406000              172.700000   
1094    1094        921.530000     77.702000               97.100000   

      avg_wind_speed_9am  max_wind_direction_9am  max_wind_speed_9am  \
0               2.080354              295.400000       

In [5]:
#Flag every row that has a missing value in at least one column, then show only those rows.
has_missing = data.isnull().any(axis=1)
print("Null Data: \n",data[has_missing])

Null Data: 
       number  air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
16        16        917.890000           NaN              169.200000   
111      111        915.290000     58.820000              182.600000   
177      177        915.900000           NaN              183.300000   
262      262        923.596607     58.380598               47.737753   
277      277        920.480000     62.600000              194.400000   
334      334        916.230000     75.740000              149.100000   
358      358        917.440000     58.514000               55.100000   
361      361        920.444946     65.801845               49.823346   
381      381        918.480000     66.542000               90.900000   
409      409               NaN     67.853833               65.880616   
517      517        920.570000     53.600000              100.100000   
519      519        916.250000     55.670000              176.400000   
546      546               NaN     42.746000       

## Daily Weather Data Description

The file daily_weather.csv is a comma-separated file that contains weather data. This data comes from a 
weather station. The weather station is equipped with sensors that capture weather-related measurements 
such as air temperature, air pressure, and relative humidity. Data was collected for a period of three years, 
from September 2011 to September 2014, to ensure that sufficient data for different seasons and weather 
conditions is captured. 

Let us now check all the columns in the data. 
* Each row in daily_weather.csv captures weather data for a separate day.
* Sensor measurements from the weather station were captured at one-minute intervals. These 
measurements were then processed to generate values to describe daily weather. Since this dataset
was created to classify low-humidity days vs. non-low-humidity days (that is, days with normal or 
high humidity), the variables included are weather measurements in the morning, with one 
measurement, namely relative humidity, in the afternoon. The idea is to use the morning weather 
values to predict whether the day will be low-humidity or not based on the afternoon measurement 
of relative humidity. 

### Each row, or sample, consists of the following variables:

* number: unique number for each row
* air_pressure_9am: air pressure averaged over a period from 8:55am to 9:04am (Unit: 
hectopascals)
* air_temp_9am: air temperature averaged over a period from 8:55am to 9:04am (Unit: 
degrees Fahrenheit)
* avg_wind_direction_9am: wind direction averaged over a period from 8:55am to 9:04am 
(Unit: degrees, with 0 meaning wind coming from the North, and increasing clockwise)
* avg_wind_speed_9am: wind speed averaged over a period from 8:55am to 9:04am (Unit: 
miles per hour)
* max_wind_direction_9am: wind gust direction averaged over a period from 8:55am to 
9:10am (Unit: degrees, with 0 being North and increasing clockwise)
* max_wind_speed_9am: wind gust speed averaged over a period from 8:55am to 9:04am 
(Unit: miles per hour)
* rain_accumulation_9am: amount of rain accumulated in the 24 hours prior to 9am (Unit: 
millimeters)
* rain_duration_9am: amount of time rain was recorded in the 24 hours prior to 9am (Unit: 
seconds)
* relative_humidity_9am: relative humidity averaged over a period from 8:55am to 9:04am 
(Unit: percent)
* relative_humidity_3pm: relative humidity averaged over a period from 2:55pm to 3:04pm 
(Unit: percent)

In [6]:
#Data Cleaning Steps
#The "number" column is only a row identifier, so it is removed.
#drop(..., errors='ignore') is used instead of `del` so that re-running this
#cell does not raise KeyError once the column is already gone.
data = data.drop(columns=['number'], errors='ignore')
#Let us drop null values using the pandas dropna function,
#recording the row count before and after to see how much data is lost.
before_rows = data.shape[0]
print(before_rows)
data = data.dropna()
after_rows = data.shape[0]
print(after_rows)
#How many rows dropped due to cleaning?
print("Total rows dropped: ",before_rows - after_rows)

1095
1064
Total rows dropped:  31


#Convert to a Classification Task
#Build a working copy of the cleaned data with an extra 0/1 label column:
#1 when relative_humidity_3pm is above 24.99, otherwise 0.
clean_data = data.assign(
    high_humidity_label=lambda frame: frame['relative_humidity_3pm'].gt(24.99) * 1
)
print(clean_data['high_humidity_label'])

In [9]:
#Target is stored in 'y'.
#Double brackets keep y as a one-column DataFrame (not a Series), so y.columns works later.
y=clean_data[['high_humidity_label']].copy()
#Preview the first few labels.
print("Y Data: \n",y.head())

Y Data: 
    high_humidity_label
0                    1
1                    0
2                    0
3                    0
4                    1


## Use 9am Sensor Signals as Features to Predict Humidity at 3pm

In [12]:
#Use 9am Sensor Signals as Features to Predict Humidity at 3pm
#Only the columns named here become model inputs; the relative humidity
#columns and the derived label are not in this list.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

#Copy so later changes to X cannot touch clean_data.
X = clean_data.loc[:, morning_features].copy()
print("Columns in X: ",X.columns)
print("Columns in Y: ",y.columns)

Columns in X:  Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')
Columns in Y:  Index(['high_humidity_label'], dtype='object')


## Perform Test and Train split 

* In the training phase, the learning algorithm uses the training data to adjust the model’s parameters 
to minimize errors. At the end of the training phase, we get the trained model.
* In the testing phase, the trained model is applied to test data. Test data is separate from the training 
data, and is previously unseen by the model. The model is then evaluated on how it performs on the
test data. The goal in building a classifier model is to have the model perform well on training as well 
as test data.

In [13]:
#Perform Test and Train split
#33% of the rows are held out for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=324)
#Preview the first rows of each of the four partitions, in the same order as returned.
for heading, partition in (
    ("X_train is as under:", X_train),
    ("X_test is as under:", X_test),
    ("y_train is as under:", y_train),
    ("y_test is as under:", y_test),
):
    print(heading)
    print(partition.head())

X_train is as under:
     air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
841        918.370000     72.932000              184.500000   
75         920.100000     53.492000              186.100000   
95         927.610000     54.896000               55.000000   
895        919.235153     65.951112              194.343333   
699        919.888128     68.687822              228.517730   

     avg_wind_speed_9am  max_wind_direction_9am  max_wind_speed_9am  \
841            2.013246              186.700000            2.773806   
75            13.444009              193.800000           15.367778   
95             4.988376               53.400000            7.202947   
895            2.942019              216.569792            3.658810   
699            3.960858              247.954028            5.185547   

     rain_accumulation_9am  rain_duration_9am  
841                    0.0                0.0  
75                     0.0                0.0  
95                     0.0   

In [14]:
#Summary statistics of the training labels; since the label is 0/1,
#the mean is the fraction of rows labelled 1.
print ("Let us describe y_train") 
y_train.describe()

Let us describe y_train


Unnamed: 0,high_humidity_label
count,712.0
mean,0.494382
std,0.50032
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [15]:
#Fit on Train Set
#Decision tree capped at 10 leaf nodes, with random_state fixed for repeatable fits.
#fit() returns the estimator itself, so construction and training are chained.
humidity_classifier = DecisionTreeClassifier(max_leaf_nodes=10, random_state=0).fit(X_train, y_train)
#Last expression of the cell: displays the class of the fitted model.
type(humidity_classifier)

sklearn.tree._classes.DecisionTreeClassifier

## Predict on Test Set 

In [16]:
#Predict on Test Set
predictions = humidity_classifier.predict(X_test)
actual_labels = y_test['high_humidity_label']
#Show the first ten predicted and true labels for a quick comparison.
print("Sample Predictions: \n",predictions[:10])
print("Sample Y Test(Actual Data): \n",actual_labels[:10])
#Measure Accuracy of the Classifier
test_accuracy = accuracy_score(y_true = y_test, y_pred = predictions)
print("Accuracy: \n",test_accuracy)

Sample Predictions: 
 [0 0 1 1 1 1 0 0 0 1]
Sample Y Test(Actual Data): 
 456     0
845     0
693     1
259     1
723     1
224     1
300     1
442     0
585     1
1057    1
Name: high_humidity_label, dtype: int32
Accuracy: 
 0.8153409090909091
