In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Process the data

In [3]:
# Loading data
file_path = "delhi_weather_cleaned.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,datetime_utc,conds,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdird,wdire,wspdm
0,19961101-11:00,Haze,9.0,0,0,27.0,1010.0,0,0,30.0,0,0,5.0,280.0,West,7.4
1,19961101-16:00,Haze,11.0,0,0,47.0,1011.0,0,0,23.0,0,0,1.2,0.0,North,0.0
2,19961101-18:00,Haze,13.0,0,0,60.0,1010.0,0,0,21.0,0,0,0.8,0.0,North,0.0
3,19961102-02:00,Haze,10.0,0,0,52.0,1011.0,0,0,20.0,0,0,2.0,200.0,SSW,9.3
4,19961102-03:00,Haze,10.0,0,0,46.0,1012.0,0,0,22.0,0,0,3.5,240.0,WSW,9.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80652,20170424-06:00,Haze,17.0,0,0,25.0,1005.0,0,0,34.0,0,0,4.0,320.0,NW,11.1
80653,20170424-09:00,Haze,14.0,0,0,16.0,1003.0,0,0,38.0,0,0,4.0,320.0,NW,22.2
80654,20170424-12:00,Haze,12.0,0,0,14.0,1002.0,0,0,36.0,0,0,4.0,270.0,West,18.5
80655,20170424-15:00,Haze,15.0,0,0,27.0,1004.0,0,0,32.0,0,0,2.0,320.0,NW,3.7


In [4]:
# Extracting the year 
def extract_year(value):
    return (value[0:4])
df["year"] = df["datetime_utc"].apply(lambda x:extract_year(x))

In [5]:
# Extracting month
def extract_month(value):
    return (value[4:6])
df["month"] = df["datetime_utc"].apply(lambda x:extract_month(x))

In [6]:
df.head()

Unnamed: 0,datetime_utc,conds,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdird,wdire,wspdm,year,month
0,19961101-11:00,Haze,9.0,0,0,27.0,1010.0,0,0,30.0,0,0,5.0,280.0,West,7.4,1996,11
1,19961101-16:00,Haze,11.0,0,0,47.0,1011.0,0,0,23.0,0,0,1.2,0.0,North,0.0,1996,11
2,19961101-18:00,Haze,13.0,0,0,60.0,1010.0,0,0,21.0,0,0,0.8,0.0,North,0.0,1996,11
3,19961102-02:00,Haze,10.0,0,0,52.0,1011.0,0,0,20.0,0,0,2.0,200.0,SSW,9.3,1996,11
4,19961102-03:00,Haze,10.0,0,0,46.0,1012.0,0,0,22.0,0,0,3.5,240.0,WSW,9.3,1996,11


In [7]:
df.dtypes

datetime_utc     object
conds            object
dewptm          float64
fog               int64
hail              int64
hum             float64
pressurem       float64
rain              int64
snow              int64
tempm           float64
thunder           int64
tornado           int64
vism            float64
wdird           float64
wdire            object
wspdm           float64
year             object
month            object
dtype: object

In [8]:
# Dropping three columns 'datetime_utc', 'wdire', 'Unnamed: 0'
df = df.drop(['datetime_utc', 'wdire'], axis =1)
df.head()

Unnamed: 0,conds,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdird,wspdm,year,month
0,Haze,9.0,0,0,27.0,1010.0,0,0,30.0,0,0,5.0,280.0,7.4,1996,11
1,Haze,11.0,0,0,47.0,1011.0,0,0,23.0,0,0,1.2,0.0,0.0,1996,11
2,Haze,13.0,0,0,60.0,1010.0,0,0,21.0,0,0,0.8,0.0,0.0,1996,11
3,Haze,10.0,0,0,52.0,1011.0,0,0,20.0,0,0,2.0,200.0,9.3,1996,11
4,Haze,10.0,0,0,46.0,1012.0,0,0,22.0,0,0,3.5,240.0,9.3,1996,11


In [9]:
df.columns

Index(['conds', 'dewptm', 'fog', 'hail', 'hum', 'pressurem', 'rain', 'snow',
       'tempm', 'thunder', 'tornado', 'vism', 'wdird', 'wspdm', 'year',
       'month'],
      dtype='object')

In [10]:
ordered_columns = ['year','month', 'dewptm', 'fog', 'hail', 'hum', 'pressurem', 'rain', 'snow',
       'tempm', 'thunder', 'tornado', 'vism', 'wdird', 'wspdm', 'conds']
df = df[ordered_columns]

In [11]:
df.head()

Unnamed: 0,year,month,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdird,wspdm,conds
0,1996,11,9.0,0,0,27.0,1010.0,0,0,30.0,0,0,5.0,280.0,7.4,Haze
1,1996,11,11.0,0,0,47.0,1011.0,0,0,23.0,0,0,1.2,0.0,0.0,Haze
2,1996,11,13.0,0,0,60.0,1010.0,0,0,21.0,0,0,0.8,0.0,0.0,Haze
3,1996,11,10.0,0,0,52.0,1011.0,0,0,20.0,0,0,2.0,200.0,9.3,Haze
4,1996,11,10.0,0,0,46.0,1012.0,0,0,22.0,0,0,3.5,240.0,9.3,Haze


In [12]:
df.conds.value_counts()

Haze      65925
Rain       6022
Cloudy     5832
Clear      2878
Name: conds, dtype: int64

# Split the Data into Training and Testing

In [13]:
# Create our features
X = df.drop("conds", axis=1)

# Create our target
y = df["conds"]


In [14]:
X.describe()

Unnamed: 0,dewptm,fog,hail,hum,pressurem,rain,snow,tempm,thunder,tornado,vism,wdird,wspdm
count,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0,80657.0
mean,15.676445,0.050907,0.000136,55.171926,2199.73,0.027102,1.2e-05,26.340045,0.010216,2.5e-05,2.563318,169.093891,8.976131
std,7.150146,0.219809,0.011677,23.447379,355845.8,0.162383,0.003521,8.24278,0.100558,0.00498,22.690289,118.166457,12.08467
min,-24.0,0.0,0.0,4.0,-9999.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,0.0,0.0,36.0,1001.0,0.0,0.0,20.0,0.0,0.0,1.8,60.0,3.7
50%,15.0,0.0,0.0,55.0,1008.0,0.0,0.0,28.0,0.0,0.0,2.2,170.0,7.4
75%,22.0,0.0,0.0,74.0,1014.0,0.0,0.0,32.0,0.0,0.0,3.0,270.0,13.0
max,35.0,1.0,1.0,100.0,101061400.0,1.0,1.0,72.0,1.0,1.0,6436.0,960.0,1514.9


In [15]:
y.value_counts()

Haze      65925
Rain       6022
Cloudy     5832
Clear      2878
Name: conds, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# RandomOverSampler

In [18]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from collections import Counter
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Clear': 49517, 'Haze': 49517, 'Cloudy': 49517, 'Rain': 49517})

In [19]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [20]:
y_pred = model.predict(X_test)
y_pred

array(['Clear', 'Cloudy', 'Rain', ..., 'Haze', 'Haze', 'Rain'],
      dtype=object)

In [21]:
# Displaying the confusion_matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 359,   66,  113,  197],
       [ 351,  283,  195,  673],
       [5717, 3554, 4474, 2663],
       [  85,  186,  107, 1142]], dtype=int64)

In [22]:
# Calulating the accuracy_score
from sklearn.metrics import accuracy_score
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.3103396974956608

In [23]:
# printing the classification report
from sklearn.metrics import classification_report
clf_report = (classification_report(y_test, y_pred))
print(clf_report)

              precision    recall  f1-score   support

       Clear       0.06      0.49      0.10       735
      Cloudy       0.07      0.19      0.10      1502
        Haze       0.92      0.27      0.42     16408
        Rain       0.24      0.75      0.37      1520

    accuracy                           0.31     20165
   macro avg       0.32      0.43      0.25     20165
weighted avg       0.77      0.31      0.38     20165



# DecisionTreeClassifier

In [24]:
from sklearn.tree import DecisionTreeClassifier
clf_model = DecisionTreeClassifier()

In [25]:
clf_model = clf_model.fit(X_train, y_train)
clf_model.score(X_train,y_train)

0.9991073199761952

In [26]:
clf_model.score(X_test,y_test)

0.9010166129432184

In [27]:
y_pred = clf_model.predict(X_test)
y_pred

array(['Haze', 'Haze', 'Haze', ..., 'Haze', 'Haze', 'Haze'], dtype=object)

In [28]:
# Displaying the confusion_matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  381,   161,   184,     9],
       [  160,   873,   408,    61],
       [  223,   437, 15603,   145],
       [    5,    64,   139,  1312]], dtype=int64)

In [29]:
# Calulating the accuracy_score
from sklearn.metrics import accuracy_score
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.9010166129432184

In [30]:
# printing the classification report
from sklearn.metrics import classification_report
clf_report = (classification_report(y_test, y_pred))
print(clf_report)

              precision    recall  f1-score   support

       Clear       0.50      0.52      0.51       735
      Cloudy       0.57      0.58      0.57      1502
        Haze       0.96      0.95      0.95     16408
        Rain       0.86      0.86      0.86      1520

    accuracy                           0.90     20165
   macro avg       0.72      0.73      0.72     20165
weighted avg       0.90      0.90      0.90     20165



# RandomForestClassifier

In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 0.9991073199761952
Testing Score: 0.9342920902553931


In [33]:
y_pred = clf.predict(X_test_scaled)
y_pred

array(['Haze', 'Haze', 'Haze', ..., 'Haze', 'Haze', 'Haze'], dtype=object)

In [34]:
# Calulating the accuracy_score
from sklearn.metrics import accuracy_score
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.9342920902553931

In [35]:
# Displaying the confusion_matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  384,   119,   232,     0],
       [   73,   892,   488,    49],
       [   56,   103, 16193,    56],
       [    3,    22,   124,  1371]], dtype=int64)

In [36]:
from sklearn.metrics import classification_report
clf_report = (classification_report(y_test, y_pred))
print(clf_report)

              precision    recall  f1-score   support

       Clear       0.74      0.52      0.61       735
      Cloudy       0.79      0.59      0.68      1502
        Haze       0.95      0.99      0.97     16408
        Rain       0.93      0.90      0.92      1520

    accuracy                           0.93     20165
   macro avg       0.85      0.75      0.79     20165
weighted avg       0.93      0.93      0.93     20165



# RandomUnderSampler

In [37]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'Clear': 2143, 'Cloudy': 2143, 'Haze': 2143, 'Rain': 2143})

In [38]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='newton-cg', random_state=1, max_iter=150)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=150, random_state=1, solver='newton-cg')

In [39]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[  453,   128,   154,     0],
       [  462,   565,   319,   156],
       [ 2222,  2792, 10738,   656],
       [   36,    68,    23,  1393]], dtype=int64)

In [40]:
# Calculate the Balanced Accuracy Score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6408439680709951

In [41]:
# printing the classification report
from sklearn.metrics import classification_report
clf_report = (classification_report(y_test, y_pred))
print(clf_report)

              precision    recall  f1-score   support

       Clear       0.14      0.62      0.23       735
      Cloudy       0.16      0.38      0.22      1502
        Haze       0.96      0.65      0.78     16408
        Rain       0.63      0.92      0.75      1520

    accuracy                           0.65     20165
   macro avg       0.47      0.64      0.50     20165
weighted avg       0.84      0.65      0.71     20165

