In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from collections import Counter

In [3]:
# Loading data
file_path = "delhi_weather_cleaned.csv"
df = pd.read_csv(file_path, index_col=0)
df

Unnamed: 0,datetime_utc,conds,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdird,wdire,wspdm
0,19961101-11:00,Haze,9.0,0,0,27.0,1010.0,0,0,30.0,0,0,5.0,280.0,West,7.4
4,19961101-16:00,Haze,11.0,0,0,47.0,1011.0,0,0,23.0,0,0,1.2,0.0,North,0.0
6,19961101-18:00,Haze,13.0,0,0,60.0,1010.0,0,0,21.0,0,0,0.8,0.0,North,0.0
14,19961102-02:00,Haze,10.0,0,0,52.0,1011.0,0,0,20.0,0,0,2.0,200.0,SSW,9.3
15,19961102-03:00,Haze,10.0,0,0,46.0,1012.0,0,0,22.0,0,0,3.5,240.0,WSW,9.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100985,20170424-06:00,Haze,17.0,0,0,25.0,1005.0,0,0,34.0,0,0,4.0,320.0,NW,11.1
100986,20170424-09:00,Haze,14.0,0,0,16.0,1003.0,0,0,38.0,0,0,4.0,320.0,NW,22.2
100987,20170424-12:00,Haze,12.0,0,0,14.0,1002.0,0,0,36.0,0,0,4.0,270.0,West,18.5
100988,20170424-15:00,Haze,15.0,0,0,27.0,1004.0,0,0,32.0,0,0,2.0,320.0,NW,3.7


In [4]:
df.columns

Index(['datetime_utc', 'conds', 'dewptm', 'fog', 'hail', 'hum', 'pressurem',
       'rain', 'snow', 'tempm', 'thunder', 'tornado', 'vism', 'wdird', 'wdire',
       'wspdm'],
      dtype='object')

In [5]:
df.dtypes

datetime_utc     object
conds            object
dewptm          float64
fog               int64
hail              int64
hum             float64
pressurem       float64
rain              int64
snow              int64
tempm           float64
thunder           int64
tornado           int64
vism            float64
wdird           float64
wdire            object
wspdm           float64
dtype: object

In [6]:
print(df['wdire'].value_counts())
print("No of unique directions:",len(df["wdire"].unique()))

North       15701
West        11450
WNW          7232
East         7038
NW           6946
WSW          5202
ESE          5195
ENE          3581
SE           3427
SW           3072
NNW          2625
NE           2410
SSE          1981
SSW          1656
NNE          1635
South        1501
Variable        5
Name: wdire, dtype: int64
No of unique directions: 17


In [7]:
# Dimension reduction
df["wdire"]=df["wdire"].replace(["WNW", "WSW", "ESE", "ENE", "NNW", "SSE", "NNE" ,"SSW", "Variable"], ["West", "West", "East", "East", "North", "South", "North", "South", "North"])

In [8]:
print(df['wdire'].value_counts())

West     23884
North    19966
East     15814
NW        6946
South     5138
SE        3427
SW        3072
NE        2410
Name: wdire, dtype: int64


In [9]:
deg=45
df["wdire"]=df["wdire"].replace(["North","NE", "East","SE", "South","SW", "West", "NW"], [0, deg, 2*deg, 3*deg, 4*deg, 5*deg, 6*deg, 7*deg])

In [10]:
df["wdire"].value_counts()

270    23884
0      19966
90     15814
315     6946
180     5138
135     3427
225     3072
45      2410
Name: wdire, dtype: int64

Reduced the amount of variables in the 'wdire' column vs creating  new features using pd.get_dummies.

In [11]:
# Dropping wdird because it is now a duplicate.
df = df.drop(['datetime_utc', 'wdird'], axis =1)
df.head()

Unnamed: 0,conds,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdire,wspdm
0,Haze,9.0,0,0,27.0,1010.0,0,0,30.0,0,0,5.0,270,7.4
4,Haze,11.0,0,0,47.0,1011.0,0,0,23.0,0,0,1.2,0,0.0
6,Haze,13.0,0,0,60.0,1010.0,0,0,21.0,0,0,0.8,0,0.0
14,Haze,10.0,0,0,52.0,1011.0,0,0,20.0,0,0,2.0,180,9.3
15,Haze,10.0,0,0,46.0,1012.0,0,0,22.0,0,0,3.5,270,9.3


In [12]:
df.describe()

Unnamed: 0,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdire,wspdm
count,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0
mean,15.676445,0.050907,0.000136,55.171926,2199.73,0.027102,1.2e-05,26.340045,0.010216,2.5e-05,2.563318,151.841316,8.976131
std,7.150146,0.219809,0.011677,23.447379,355845.8,0.162383,0.003521,8.24278,0.100558,0.00498,22.690289,116.152996,12.08467
min,-24.0,0.0,0.0,4.0,-9999.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,0.0,0.0,36.0,1001.0,0.0,0.0,20.0,0.0,0.0,1.8,45.0,3.7
50%,15.0,0.0,0.0,55.0,1008.0,0.0,0.0,28.0,0.0,0.0,2.2,135.0,7.4
75%,22.0,0.0,0.0,74.0,1014.0,0.0,0.0,32.0,0.0,0.0,3.0,270.0,13.0
max,35.0,1.0,1.0,100.0,101061400.0,1.0,1.0,72.0,1.0,1.0,6436.0,315.0,1514.9


# Splitting the Data into Training and Testing

In [13]:
# Creating features
X = df.drop(columns='conds')
# Creating target
y = df['conds']

In [14]:
X.describe()

Unnamed: 0,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdire,wspdm
count,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0
mean,15.676445,0.050907,0.000136,55.171926,2199.73,0.027102,1.2e-05,26.340045,0.010216,2.5e-05,2.563318,151.841316,8.976131
std,7.150146,0.219809,0.011677,23.447379,355845.8,0.162383,0.003521,8.24278,0.100558,0.00498,22.690289,116.152996,12.08467
min,-24.0,0.0,0.0,4.0,-9999.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,0.0,0.0,36.0,1001.0,0.0,0.0,20.0,0.0,0.0,1.8,45.0,3.7
50%,15.0,0.0,0.0,55.0,1008.0,0.0,0.0,28.0,0.0,0.0,2.2,135.0,7.4
75%,22.0,0.0,0.0,74.0,1014.0,0.0,0.0,32.0,0.0,0.0,3.0,270.0,13.0
max,35.0,1.0,1.0,100.0,101061400.0,1.0,1.0,72.0,1.0,1.0,6436.0,315.0,1514.9


In [15]:
# Checking the balance of the target values
df['conds'].value_counts()

Haze      65925
Rain       6022
Cloudy     5832
Clear      2878
Name: conds, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'Clear': 2143, 'Haze': 49517, 'Cloudy': 4330, 'Rain': 4502})

# Naive Random Oversampling

In [35]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Clear': 49517, 'Haze': 49517, 'Cloudy': 49517, 'Rain': 49517})

In [36]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [37]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

confusion_matrix(y_test, y_pred)

array([[ 258,  214,   13,  250],
       [ 235,  564,   58,  645],
       [3624, 6846, 1914, 4024],
       [  15,  310,   84, 1111]], dtype=int64)

In [38]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
bal_acc_score=balanced_accuracy_score(y_test, y_pred)
bal_acc_score

0.3935228023619668

In [39]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
ovr_class_rpt=(classification_report_imbalanced(y_test, y_pred))
print(ovr_class_rpt)

                   pre       rec       spe        f1       geo       iba       sup

      Clear       0.06      0.35      0.80      0.11      0.53      0.27       735
     Cloudy       0.07      0.38      0.61      0.12      0.48      0.22      1502
       Haze       0.93      0.12      0.96      0.21      0.33      0.10     16408
       Rain       0.18      0.73      0.74      0.29      0.73      0.54      1520

avg / total       0.77      0.19      0.91      0.20      0.38      0.15     20165

