In [16]:
import pandas as pd
import numpy as np
 
# Read dataset
df = pd.read_csv('ds-home-30sec-avg-MH.csv')


In [17]:
# Display example observations
df.head()

Unnamed: 0,date,pre,hum,tem,occ
0,04/06/2020 18:45,94397.03,49.35,31.44,M
1,04/06/2020 18:28,94393.59,51.75,31.49,M
2,25/05/2020 16:00,93940.99,52.46,32.32,M
3,25/05/2020 16:29,93903.85,50.79,32.18,M
4,29/05/2020 00:25,95140.24,45.94,31.85,M


In [18]:
# Is the dataset umbalanced?
df['occ'].value_counts()

M    1188
H     230
Name: occ, dtype: int64

In [19]:
# Define a binary label
df["label"] = np.where(df["occ"].str.contains("H"), 1, 0)
df.head()

Unnamed: 0,date,pre,hum,tem,occ,label
0,04/06/2020 18:45,94397.03,49.35,31.44,M,0
1,04/06/2020 18:28,94393.59,51.75,31.49,M,0
2,25/05/2020 16:00,93940.99,52.46,32.32,M,0
3,25/05/2020 16:29,93903.85,50.79,32.18,M,0
4,29/05/2020 00:25,95140.24,45.94,31.85,M,0


In [20]:
#split dataset in features and target variable
feature_cols = ['pre', 'hum', 'tem']
X = df[feature_cols] # Features (independent variables)
y = df.label # Target variable

## Imbalanced Data

In [21]:
from sklearn.model_selection import train_test_split # Import train_test_split function

# 75% training and 25% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1) 

In [22]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,y_train)

# Predict labels
y_pred=logreg.predict(X_test)



In [23]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       303
           1       0.00      0.00      0.00        52

    accuracy                           0.85       355
   macro avg       0.43      0.50      0.46       355
weighted avg       0.73      0.85      0.79       355



  'precision', 'predicted', average, warn_for)


## Up-sampling

In [25]:
from sklearn.utils import resample

In [27]:
# Separate majority and minority classes
df_majority = df[df.label==0]
df_minority = df[df.label==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=1188,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.label.value_counts()

1    1188
0    1188
Name: label, dtype: int64

In [28]:
feature_cols = ['pre', 'hum', 'tem']
X = df_upsampled[feature_cols] # Features (independent variables)
y = df_upsampled.label # Target variable

# 75% training and 25% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1) 

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,y_train)

# Predict labels
y_pred=logreg.predict(X_test)



In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.72      0.63       301
           1       0.60      0.44      0.51       293

    accuracy                           0.58       594
   macro avg       0.59      0.58      0.57       594
weighted avg       0.59      0.58      0.57       594



## Down-sampling

In [36]:
# Separate majority and minority classes
df_majority = df[df.label==0]
df_minority = df[df.label==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=230,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.label.value_counts()

1    230
0    230
Name: label, dtype: int64

In [37]:
feature_cols = ['pre', 'hum', 'tem']
X = df_downsampled[feature_cols] # Features (independent variables)
y = df_downsampled.label # Target variable

# 75% training and 25% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1) 

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,y_train)

# Predict labels
y_pred=logreg.predict(X_test)



In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.85      0.65        48
           1       0.81      0.43      0.56        67

    accuracy                           0.61       115
   macro avg       0.66      0.64      0.60       115
weighted avg       0.69      0.61      0.60       115

