<a href="https://colab.research.google.com/github/rsa-umn/data-mining/blob/main/Data-Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Mining Group Project



Dataset source: https://github.com/rsa-umn/data-mining/blob/main/RICE.csv
(This needs to be loaded into Colab occasionally when instance is restarted)

In [None]:
#INIT
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_curve


# Load the rice pest survey data.
# A freshly started Colab instance has no local copy of RICE.csv, so fall back to the
# raw file in the project's GitHub repository instead of failing with FileNotFoundError.
DATA_FILE = 'RICE.csv'
DATA_URL = 'https://raw.githubusercontent.com/rsa-umn/data-mining/main/RICE.csv'

try:
  df_pests = pd.read_csv(DATA_FILE, encoding="utf-8")
except FileNotFoundError:
  df_pests = pd.read_csv(DATA_URL, encoding="utf-8")

df_pests

Unnamed: 0,Observation Year,Standard Week,Pest Value,Collection Type,MaxT,MinT,RH1(%),RH2(%),RF(mm),WS(kmph),SSH(hrs),EVP(mm),PEST NAME,Location
0,2003,1,0.0,Number/hill,27.9,14.8,94.7,51.3,0.0,3.1,8.7,2.7,Brownplanthopper,Cuttack
1,2003,2,0.0,Number/hill,27.2,15.0,93.9,53.1,0.0,2.1,8.7,2.4,Brownplanthopper,Cuttack
2,2003,3,0.0,Number/hill,28.7,18.3,94.1,56.7,0.6,4.8,6.7,2.8,Brownplanthopper,Cuttack
3,2003,4,0.0,Number/hill,25.3,16.4,90.9,57.4,0.3,6.5,5.3,3.0,Brownplanthopper,Cuttack
4,2003,5,0.0,Number/hill,28.8,18.7,95.7,55.0,0.0,5.2,6.3,2.8,Brownplanthopper,Cuttack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19399,2011,48,31.0,Number/Light trap,30.2,17.2,78.1,41.6,0.0,0.0,0.0,0.0,Yellowstemborer,Rajendranagar
19400,2011,49,38.0,Number/Light trap,30.5,12.9,84.3,33.4,0.0,0.0,0.0,0.0,Yellowstemborer,Rajendranagar
19401,2011,50,6.0,Number/Light trap,30.2,14.2,83.4,49.6,0.0,0.0,0.0,0.0,Yellowstemborer,Rajendranagar
19402,2011,51,13.0,Number/Light trap,29.0,11.8,81.9,49.3,0.0,0.0,0.0,0.0,Yellowstemborer,Rajendranagar


# Correlation

Simple correlation of weather features with classifying mere presence of some pests, e.g. `has_pests == True`. The findings below suggest that there is weak correlation to weather features in a given moment, with the highest being humidity2 (night time humidity I think) at 0.28, followed by min temperature and day time humidity. Despite weak correlation, we could use those features as part of our decision tree and see what happens. There is also the concept of weather lag that we should consider, where we may see better results a bit after the temperatures begin to change. -Ray

Transforming weeks to numerical seasons (1-4) improved correlation to .19 for time of year. -Ray

In [None]:
# convert pest type and location to integer labels
# Each column gets its own encoder: calling fit_transform twice on one LabelEncoder
# discards the first fit, which would make pest_name_encoded impossible to decode
# back to pest names with inverse_transform.
pest_name_encoder = LabelEncoder()
df_pests["pest_name_encoded"] = pest_name_encoder.fit_transform(df_pests["PEST NAME"])

location_encoder = LabelEncoder()
df_pests["location_encoded"] = location_encoder.fit_transform(df_pests["Location"])

# backward-compatible alias: `le` previously ended up fitted on Location
le = location_encoder

# used to turn the numeric pest_value into the boolean 'has_pests' target
def is_pest_present(pest_value):
  """Return True when the recorded pest value is strictly greater than zero."""
  pest_found = pest_value > 0.0
  return pest_found

# map std_week onto a numeric season code (1=winter, 2=spring, 3=summer, 4=fall)
def week_to_season(std_week):
  """Return the season code for a standard week number.

  Weeks 9-21 are spring (2), 22-34 summer (3) and 35-48 fall (4), bounds
  inclusive. Every other value is treated as winter (1).
  """
  season_ranges = (
    (9, 21, 2),   # spring
    (22, 34, 3),  # summer
    (35, 48, 4),  # fall
  )
  for first_week, last_week, season in season_ranges:
    if first_week <= std_week <= last_week:
      return season
  return 1  # winter

# average out temperatures
def get_avg_temp(min_temp, max_temp):
  """Return the midpoint of the minimum and maximum temperature.

  Works on plain numbers as well as on pandas Series / numpy arrays, because
  only `+` and `/` are used.
  """
  # previously a bare `return`, which always produced None
  return (min_temp + max_temp) / 2

# print performance measures
def print_performance_measures(y_true, y_pred, average="binary"):
  """Print accuracy, precision, recall and F-measure for a set of predictions.

  Parameters:
    y_true:  ground-truth labels.
    y_pred:  predicted labels, aligned with y_true.
    average: averaging mode forwarded to precision_score, recall_score and
             f1_score. Defaults to "binary" (the mode recall and F-measure
             already used); pass e.g. "weighted" for multiclass targets.

  The same averaging mode is applied to all three scores. Precision used to be
  "weighted" while recall and F-measure were "binary", so the printed F-measure
  was not the harmonic mean of the printed precision and recall.
  """
  print(f"Accuracy is {accuracy_score(y_true, y_pred)}")
  print(f"Precision is {precision_score(y_true, y_pred, average=average)}")
  print(f"Recall is {recall_score(y_true, y_pred, average=average)}")
  print(f"F-measure is {f1_score(y_true, y_pred, average=average)}")
  # the TPR/FPR ratio reads a 2x2 confusion matrix, so only report it for binary targets
  if average == "binary":
    print_tprfpr(y_true, y_pred)

# print TPR-FPR ratio
def print_tprfpr(y_true, y_pred):
  """Print the ratio of the true-positive rate to the false-positive rate.

  Reads the top-left 2x2 cells of the confusion matrix as
  [[tn, fp], [fn, tp]], i.e. this is meant for binary targets.

  TPR = tp / (tp + fn)  -- share of actual positives predicted positive
  FPR = fp / (fp + tn)  -- share of actual negatives predicted positive
  """
  cm = confusion_matrix(y_true, y_pred)

  tn = cm[0, 0]
  fp = cm[0, 1]
  fn = cm[1, 0]
  tp = cm[1, 1]

  positives = tp + fn
  negatives = fp + tn  # FPR denominator; the old code divided by (tp + fn) here

  if positives == 0 or negatives == 0:
    # a rate is undefined when its class never occurs in y_true
    ratio = float("nan")
  elif fp == 0:
    # no false positives: the ratio is unbounded (or undefined if tp is 0 as well)
    ratio = float("inf") if tp else float("nan")
  else:
    tpr = tp / positives
    fpr = fp / negatives
    ratio = tpr / fpr

  print(f"TPR-FPR is {ratio}")

# derived columns: boolean target, season code and mid-point temperature
df_pests["has_pests"] = df_pests["Pest Value"].apply(is_pest_present)
df_pests["season"] = df_pests["Standard Week"].apply(week_to_season)
df_pests["avg_temp"] = (df_pests["MinT"] + df_pests["MaxT"]) / 2

# candidate time/weather features, plus the target they are compared against
cols = [
    "season", "avg_temp", "MaxT", "MinT", "RH1(%)", "RH2(%)",
    "RF(mm)", "WS(kmph)", "SSH(hrs)", "EVP(mm)", "has_pests",
]

# restrict the frame to those columns and compute their pairwise correlations
df_pest_correlates = df_pests[cols]
df_pest_correlation_matrix = df_pest_correlates.corr()

# rank every feature by its correlation with pest presence, strongest first
print(df_pest_correlation_matrix["has_pests"].sort_values(ascending=False))

has_pests    1.000000
RH2(%)       0.287117
MinT         0.266343
RH1(%)       0.263141
season       0.194540
avg_temp     0.190736
MaxT         0.071920
RF(mm)       0.032183
EVP(mm)      0.021651
SSH(hrs)    -0.071498
WS(kmph)    -0.123536
Name: has_pests, dtype: float64


In [None]:
# keep only the records where a pest was actually observed (independent copy)
pest_observed = df_pests["Pest Value"] > 0
df_pests_with_pests = df_pests[pest_observed].copy()

# rank features by their correlation with the encoded pest name
# NOTE(review): pest_name_encoded is an arbitrary integer label, so these
# coefficients depend on the label order -- treat them as a rough hint only.
cols = [
    "season", "Pest Value", "location_encoded", "avg_temp", "MaxT", "MinT",
    "RH1(%)", "RH2(%)", "RF(mm)", "WS(kmph)", "SSH(hrs)", "EVP(mm)",
    "pest_name_encoded",
]
df_pest_correlates = df_pests_with_pests[cols]
df_pest_correlation_matrix = df_pest_correlates.corr()
print(df_pest_correlation_matrix["pest_name_encoded"].sort_values(ascending=False))

pest_name_encoded    1.000000
location_encoded     0.053611
WS(kmph)             0.047904
MaxT                 0.033622
SSH(hrs)             0.027678
EVP(mm)              0.004680
avg_temp             0.000982
RF(mm)              -0.010505
MinT                -0.032490
season              -0.056605
RH1(%)              -0.057984
RH2(%)              -0.088579
Pest Value          -0.101660
Name: pest_name_encoded, dtype: float64


# Decision Tree v0.0.1
From the correlations above, we can try to use `humidity1, humidity2, min_temp` as our features and see how the DT classifies. This needs to be improved if we want to predict type of pest instead of mere pest presence.

In [None]:
# features: the three columns most correlated with pest presence
cols = ["RH2(%)", "MinT", "RH1(%)"]
df_relevant_features = df_pests[cols]
y = df_pests["has_pests"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# depth-5 decision tree, fitted on the training split only
clf_tree = tree.DecisionTreeClassifier(max_depth=5, random_state=42)
clf_tree.fit(X_train, y_train)

In [None]:
# Score the fitted tree on the hold-out split
predictions = clf_tree.predict(X_holdout)
print_performance_measures(y_true=y_holdout, y_pred=predictions)

Accuracy is 0.7159051872208863
Precision is 0.7145204410295117
Recall is 0.797804208600183
F-measure is 0.7598024978216672
TPR-FPR is 2.639757820383451


# Decision Tree v0.0.2
Playing around with adding season as a feature. Improved accuracy by 1%. Tried putting avg_temp in but that actually brought accuracy back down. This makes sense because while min_temp had higher correlation to pest presence, max_temp had very low correlation. -Ray


In [None]:
# features: season code added to the three v0.0.1 columns
cols = ["season", "RH2(%)", "MinT", "RH1(%)"]
df_relevant_features = df_pests[cols]
y = df_pests["has_pests"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# depth-5 decision tree, fitted on the training split only
clf_tree = tree.DecisionTreeClassifier(max_depth=5, random_state=42)
clf_tree.fit(X_train, y_train)

In [None]:
# Score the fitted tree on the hold-out split
predictions = clf_tree.predict(X_holdout)
print_performance_measures(y_true=y_holdout, y_pred=predictions)

Accuracy is 0.7214015802129853
Precision is 0.7214672938865965
Recall is 0.8240317169868863
F-measure is 0.7691431824651295
TPR-FPR is 2.585645933014354


# Decision Tree v0.0.3
Attempting almost all features mostly to see what happens.
1. `collect_type, pest_name, location` don't work out of the box. They need to be transformed to float.
1. Trying the rest.

Accuracy improved to 75% on all quantitative features. I am not sure exactly why. May have to do with each piece of the puzzle coming together? -Ray


In [None]:
# features: observation year, season and every numeric weather column
cols = [
    "Observation Year", "season", "RH1(%)", "RH2(%)", "MaxT", "MinT",
    "RF(mm)", "WS(kmph)", "SSH(hrs)", "EVP(mm)",
]
df_relevant_features = df_pests[cols]
y = df_pests["has_pests"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# depth-5 decision tree, fitted on the training split only
clf_tree = tree.DecisionTreeClassifier(max_depth=5, random_state=42)
clf_tree.fit(X_train, y_train)

In [None]:
# Score the fitted tree on the hold-out split
predictions = clf_tree.predict(X_holdout)
print_performance_measures(y_true=y_holdout, y_pred=predictions)

Accuracy is 0.7502576434215046
Precision is 0.7531198023177428
Recall is 0.8594083562061604
F-measure is 0.7949224259520451
TPR-FPR is 2.837865055387714


# Decision Tree v0.0.4
Changing `max_depth` to `11` improved accuracy to ~77%. Slowly gained accuracy points with each additional layer. Overfitting should be limited, if I understand correctly, since accuracy is measured on held-out test data (note: the code uses a single 70/30 train/test split, not k-fold cross-validation). Also added `criterion="entropy"` as opposed to Gini, which improved F-measure from 2.7 to 2.8 (TODO: an F-measure cannot exceed 1, so these values are presumably the TPR-FPR ratio — confirm) -Ray


In [None]:
# features: observation year, season and every numeric weather column
cols = [
    "Observation Year", "season", "RH1(%)", "RH2(%)", "MaxT", "MinT",
    "RF(mm)", "WS(kmph)", "SSH(hrs)", "EVP(mm)",
]
df_relevant_features = df_pests[cols]
y = df_pests["has_pests"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# deeper tree (depth 11) using the entropy split criterion
clf_tree = tree.DecisionTreeClassifier(
    max_depth=11,
    criterion="entropy",
    random_state=42,
)
clf_tree.fit(X_train, y_train)

In [None]:
# Score the fitted tree on the hold-out split
predictions = clf_tree.predict(X_holdout)
print_performance_measures(y_true=y_holdout, y_pred=predictions)

Accuracy is 0.7705255925798694
Precision is 0.7697236812190575
Recall is 0.8191521805428484
F-measure is 0.800834824090638
TPR-FPR is 3.6150740242261103


# Decision Tree v0.1.0
Hoping to get the DT to predict the actual pest type here instead of mere pest presence.

## Brain Storm
1. remove records with `pest_value=0`
2. Convert pest_type to index values.
3. Build DT to predict `y = df_pests["pest_name"]`


In [None]:
# features for the pest-type model: pest count, season, encoded location and weather
cols = [
    "Pest Value", "season", "location_encoded", "RH1(%)", "RH2(%)", "MaxT",
    "MinT", "RF(mm)", "WS(kmph)", "SSH(hrs)", "EVP(mm)",
]
# only records with an observed pest are used; the target is the encoded pest name
df_relevant_features = df_pests_with_pests[cols]
y = df_pests_with_pests["pest_name_encoded"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# depth-11 decision tree, fitted on the training split only
clf_tree = tree.DecisionTreeClassifier(max_depth=11, random_state=42)
clf_tree.fit(X_train, y_train)

In [None]:
# Predict on unseen data
predictions = clf_tree.predict(X_holdout)

# Multiclass target, so precision/recall/F-measure are averaged with "weighted".
# The f-strings use single outer quotes: reusing the enclosing quote character
# inside a replacement field is a SyntaxError on Python versions before 3.12.
print(f"Accuracy is {accuracy_score(y_holdout, predictions)}")
print(f'Precision is {precision_score(y_holdout, predictions, average="weighted")}')
print(f'Recall is {recall_score(y_holdout, predictions, average="weighted")}')
print(f'F-measure is {f1_score(y_holdout, predictions, average="weighted")}')
df_pests_with_pests

Accuracy is 0.36914001862775536
Precision is 0.3442826356037169
Recall is 0.36914001862775536
F-measure is 0.35114450929314683


Unnamed: 0,Observation Year,Standard Week,Pest Value,Collection Type,MaxT,MinT,RH1(%),RH2(%),RF(mm),WS(kmph),SSH(hrs),EVP(mm),PEST NAME,Location,pest_name_encoded,location_encoded,has_pests,season,avg_temp
36,2003,37,2.9,Number/hill,28.9,25.2,95.7,86.6,101.7,4.6,1.0,1.9,Brownplanthopper,Cuttack,0,0,True,4,27.05
37,2003,38,9.7,Number/hill,31.7,25.8,91.4,77.3,50.2,3.9,6.8,3.8,Brownplanthopper,Cuttack,0,0,True,4,28.75
38,2003,39,19.4,Number/hill,32.9,26.5,91.1,74.0,19.2,3.0,7.5,4.0,Brownplanthopper,Cuttack,0,0,True,4,29.70
39,2003,40,9.0,Number/hill,32.3,25.7,88.3,72.9,9.8,2.4,6.7,3.7,Brownplanthopper,Cuttack,0,0,True,4,29.00
40,2003,41,6.0,Number/hill,32.7,25.9,90.0,71.6,2.7,2.1,7.7,3.8,Brownplanthopper,Cuttack,0,0,True,4,29.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19399,2011,48,31.0,Number/Light trap,30.2,17.2,78.1,41.6,0.0,0.0,0.0,0.0,Yellowstemborer,Rajendranagar,9,5,True,4,23.70
19400,2011,49,38.0,Number/Light trap,30.5,12.9,84.3,33.4,0.0,0.0,0.0,0.0,Yellowstemborer,Rajendranagar,9,5,True,1,21.70
19401,2011,50,6.0,Number/Light trap,30.2,14.2,83.4,49.6,0.0,0.0,0.0,0.0,Yellowstemborer,Rajendranagar,9,5,True,1,22.20
19402,2011,51,13.0,Number/Light trap,29.0,11.8,81.9,49.3,0.0,0.0,0.0,0.0,Yellowstemborer,Rajendranagar,9,5,True,1,20.40


# Dataset Exploration

**Unique Pests**
- 11 unique pests in the dataset.
- Yellow stem borer moth the most _measured_ (not necessarily the highest count)
- Neck blast fungus least measured.

In [None]:
# number of survey records per pest
pest_record_counts = df_pests["PEST NAME"].value_counts()
pest_record_counts

Unnamed: 0_level_0,count
PEST NAME,Unnamed: 1_level_1
Yellowstemborer,4333
Gallmidge,3016
Greenleafhopper,2287
LeafBlast,2090
Brownplanthopper,1958
LeafFolder,1716
Whitebackedplanthopper,1248
Miridbug,1144
Caseworm,936
ZigZagleafhopper,468


**Locations**
- 6 regions surveyed in different parts of India
- Maruteru most surveyed (7053 records)
- Palampur least (1248 records)

In [None]:
# number of survey records per location
location_record_counts = df_pests['Location'].value_counts()
location_record_counts

Unnamed: 0_level_0,count
Location,Unnamed: 1_level_1
Maruteru,7053
Rajendranagar,5539
Raipur,2132
Ludhiana,1976
Cuttack,1456
Palampur,1248


**Time Periods**
- between 1959 - 2011
- early years have significantly fewer observations


In [None]:
# number of survey records per observation year, in chronological order
records_per_year = df_pests['Observation Year'].value_counts()
records_per_year.sort_index()

Unnamed: 0_level_0,count
Observation Year,Unnamed: 1_level_1
1959,52
1960,52
1961,52
1962,52
1963,52
1964,52
1965,52
1966,52
1967,52
1968,52


In [None]:
from sklearn.ensemble import RandomForestClassifier
# features: the three columns most correlated with pest presence
cols = ["RH2(%)", "MinT", "RH1(%)"]
df_relevant_features = df_pests[cols]
y = df_pests["has_pests"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# random forest with depth-5 trees (note: still stored under the name clf_tree)
clf_tree = RandomForestClassifier(max_depth=5, random_state=42)
clf_tree.fit(X_train, y_train)

In [None]:
# Score the fitted random forest on the hold-out split
predictions = clf_tree.predict(X_holdout)
print_performance_measures(y_true=y_holdout, y_pred=predictions)

Accuracy is 0.7179663345929234
Precision is 0.7170723582003761
Recall is 0.8096980786825252
F-measure is 0.7638089758342923
TPR-FPR is 2.608055009823183


In [None]:
# features: observation year, season and every numeric weather column
cols = [
    "Observation Year", "season", "RH1(%)", "RH2(%)", "MaxT", "MinT",
    "RF(mm)", "WS(kmph)", "SSH(hrs)", "EVP(mm)",
]
df_relevant_features = df_pests[cols]
y = df_pests["has_pests"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# random forest with depth-11 trees and the entropy split criterion
clf_tree = RandomForestClassifier(
    max_depth=11,
    criterion="entropy",
    random_state=42,
)
clf_tree.fit(X_train, y_train)

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# features: observation year, season and every numeric weather column
cols = [
    "Observation Year", "season", "RH1(%)", "RH2(%)", "MaxT", "MinT",
    "RF(mm)", "WS(kmph)", "SSH(hrs)", "EVP(mm)",
]
df_relevant_features = df_pests[cols]
y = df_pests["has_pests"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# gradient boosting with depth-11 trees and a 0.05 learning rate
clf_gb = GradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=11,
    random_state=42,
)
clf_gb.fit(X_train, y_train)

In [None]:
# Score the fitted gradient-boosting model on the hold-out split
predictions = clf_gb.predict(X_holdout)
print_performance_measures(y_true=y_holdout, y_pred=predictions)

Accuracy is 0.7634833390587427
Precision is 0.7626060488187453
Recall is 0.8148825861543153
F-measure is 0.7951197738431781
TPR-FPR is 3.47012987012987


In [None]:
# features: observation year, season and every numeric weather column
cols = [
    "Observation Year", "season", "RH1(%)", "RH2(%)", "MaxT", "MinT",
    "RF(mm)", "WS(kmph)", "SSH(hrs)", "EVP(mm)",
]
df_relevant_features = df_pests[cols]

# multiclass target: the pest name itself (string labels)
# NOTE(review): this uses every row of df_pests, including records with
# Pest Value == 0, whereas the v0.1.0 tree trains on df_pests_with_pests --
# confirm that including pest-free records is intended.
y = df_pests["PEST NAME"]

# hold out 30% of the records for evaluation (fixed seed for repeatability)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_relevant_features.values,
    y,
    test_size=0.3,
    random_state=42,
)

# gradient boosting with depth-11 trees and a 0.05 learning rate
clf_gb = GradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=11,
    random_state=42,
)
clf_gb.fit(X_train, y_train)

In [None]:
predictions = clf_gb.predict(X_holdout)

# Redefine print_performance_measures with weighted averaging for multiclass classification.
# NOTE: this replaces the earlier definition for every cell executed after this one.
def print_performance_measures(y_true, y_pred):
  """Print accuracy plus weighted-average precision, recall and F-measure.

  Parameters:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned with y_true.
  """
  # Single outer quotes on the f-strings: reusing the enclosing quote character
  # inside a replacement field is a SyntaxError on Python versions before 3.12.
  print(f"Accuracy is {accuracy_score(y_true, y_pred)}")
  print(f'Precision is {precision_score(y_true, y_pred, average="weighted")}')
  print(f'Recall is {recall_score(y_true, y_pred, average="weighted")}')
  print(f'F-measure is {f1_score(y_true, y_pred, average="weighted")}')
  # print_tprfpr is deliberately not called: it reads a 2x2 confusion matrix,
  # which does not apply to a multiclass target.

print_performance_measures(y_holdout, predictions)

Accuracy is 0.2030230161456544
Precision is 0.2201198574806025
Recall is 0.2030230161456544
F-measure is 0.20891634002704584
