In [None]:
# !pip uninstall tensorflow tensorflow_hub tensorflow_text
# !pip install tensorflow tensorflow_hub tensorflow_text
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow_hub import KerasLayer
import tensorflow_hub as hub
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import tensorflow_text as text

In [None]:
# Load the query dataset and attach an integer id to every class name.
df = pd.read_csv("nlp_ds.csv")
labelencoder = LabelEncoder().fit(df['Label'])
df['Encoded_Label'] = labelencoder.transform(df['Label'])

In [None]:
# Class balance, first by encoded id and then by the original label text.
for column in ('Encoded_Label', 'Label'):
    print(df[column].value_counts())


5    571
1    533
0    435
3    371
4    299
2    267
Name: Encoded_Label, dtype: int64
waterbodies-spread     571
Littoral               533
Deciduous-woodlands    435
current fallow         371
plantation/orchard     299
Snowfall/Glacial       267
Name: Label, dtype: int64


In [None]:
# Hold out 30% of the queries for testing. The split is seeded so that every
# "Restart & Run All" reproduces the same partition (and therefore the same
# metrics), and stratified so that both partitions keep the class proportions
# of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    df['Query'],
    df['Encoded_Label'],
    test_size=0.3,
    shuffle=True,
    random_state=42,
    stratify=df['Encoded_Label'],
)

In [None]:
# Integer-encode the queries with a vocabulary capped at 10,000 words and
# learned from the training split only, then pad each sequence to length 1000.
# NOTE(review): no later cell visible here reads X_train_sequences or
# X_test_sequences -- confirm they are still needed.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_sequences, X_test_sequences = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=1000)
    for split in (X_train, X_test)
)

In [None]:
# TF Hub handles for the BERT text preprocessor and the encoder it feeds.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [None]:
def encode_sentences(sentences, batch_size=32):
    """Embed sentences with BERT and return their pooled outputs.

    The sentences are pushed through ``bert_preprocess`` and ``bert_encoder``
    in batches of ``batch_size`` rather than one at a time, so the encoder is
    invoked once per batch instead of once per sentence.

    Args:
        sentences: Iterable of strings (for example a list or a pandas Series).
        batch_size: Number of sentences encoded per encoder call; must be >= 1.

    Returns:
        The ``'pooled_output'`` of every batch concatenated along axis 0, one
        row per sentence, in input order.

    Raises:
        ValueError: If ``sentences`` is empty or ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise first so the input can be sliced by position regardless of
    # whether it is a list, a generator or a pandas Series with a shuffled index.
    sentences = list(sentences)
    if not sentences:
        # Fail with a clear message instead of handing tf.concat an empty list.
        raise ValueError("sentences must contain at least one item")

    pooled_outputs = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        preprocessed_text = bert_preprocess(tf.constant(batch))
        pooled_outputs.append(bert_encoder(preprocessed_text)['pooled_output'])
    return tf.concat(pooled_outputs, axis=0)

# Embed both splits with the BERT encoder (training split first, then test).
train_sentence_embeddings, test_sentence_embeddings = map(
    encode_sentences, (X_train, X_test)
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Convert the embedding tensors to 2-D NumPy arrays with one row per query.
train_features = train_sentence_embeddings.numpy().reshape(len(X_train), -1)
test_features = test_sentence_embeddings.numpy().reshape(len(X_test), -1)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold cross-validation on the training split as a first performance estimate.
scores = cross_val_score(rf_classifier, train_features, y_train, cv=5)
print("Cross-Validation Scores:", scores)
print("Mean CV Score:", scores.mean())

# Fit on the full training split, then report accuracy on both splits.
rf_classifier.fit(train_features, y_train)
train_accuracy, test_accuracy = (
    rf_classifier.score(features, labels)
    for features, labels in ((train_features, y_train), (test_features, y_test))
)
print("Random Forest Train Accuracy:", train_accuracy)
print("Random Forest Test Accuracy:", test_accuracy)

Cross-Validation Scores: [0.77809798 0.77809798 0.76368876 0.76589595 0.76878613]
Mean CV Score: 0.7709133614299278
Random Forest Train Accuracy: 1.0
Random Forest Test Accuracy: 0.784656796769852


In [None]:
new_sentence = "Show unproductive land areas for soil improvement projects in Telangana."

# Same pipeline as for the training data: preprocess -> BERT pooled output ->
# a single-row feature array.
preprocessed_new_sentence = bert_preprocess(tf.constant([new_sentence]))
new_sentence_embedding = bert_encoder(preprocessed_new_sentence)['pooled_output']
new_sentence_features = new_sentence_embedding.numpy().reshape(1, -1)

# predict returns one encoded class id per row; there is exactly one row here.
(predicted_label,) = rf_classifier.predict(new_sentence_features)
print("Predicted Label:", predicted_label)


Predicted Label: 3


In [None]:
from sklearn.metrics import accuracy_score

# Test-set predictions; y_pred is reused by the metric cells that follow.
y_pred = rf_classifier.predict(test_features)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.784656796769852


In [None]:
from sklearn.metrics import precision_score

# Macro average: every class contributes equally, regardless of its size.
precision = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
print("Precision (Macro Average):", precision)


Precision (Macro Average): 0.7865561797810088


In [None]:
from sklearn.metrics import recall_score

# Compute every recall variant first (average=None yields one value per class),
# then report them in a fixed order.
recall_macro, recall_micro, recall_weighted, recall_per_class = (
    recall_score(y_test, y_pred, average=mode)
    for mode in ('macro', 'micro', 'weighted', None)
)

for label, value in (
    ("Recall (Macro Average):", recall_macro),
    ("Recall (Micro Average):", recall_micro),
    ("Recall (Weighted Average):", recall_weighted),
    ("Recall (Per Class):", recall_per_class),
):
    print(label, value)


Recall (Macro Average): 0.760818261119553
Recall (Micro Average): 0.784656796769852
Recall (Weighted Average): 0.784656796769852
Recall (Per Class): [0.78518519 0.80981595 0.71621622 0.73076923 0.58762887 0.93529412]


In [None]:
from sklearn.metrics import f1_score

# F1 compares the ground-truth labels (y_test) with the predictions (y_pred).
# Scoring y_pred against itself would always report a perfect 1.0.

# For macro-average F1 score
f1_macro = f1_score(y_test, y_pred, average='macro')
print("F1 Score (Macro Average):", f1_macro)

# For micro-average F1 score
f1_micro = f1_score(y_test, y_pred, average='micro')
print("F1 Score (Micro Average):", f1_micro)

# For weighted-average F1 score
f1_weighted = f1_score(y_test, y_pred, average='weighted')
print("F1 Score (Weighted Average):", f1_weighted)

# For calculating F1 score for each class separately, specify average=None
f1_per_class = f1_score(y_test, y_pred, average=None)
print("F1 Score (Per Class):", f1_per_class)


F1 Score (Macro Average): 1.0
F1 Score (Micro Average): 1.0
F1 Score (Weighted Average): 1.0
F1 Score (Per Class): [1. 1. 1. 1. 1. 1.]


In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes, both indexed by
# encoded label id.
cm = confusion_matrix(y_test, y_pred)
# The bare name `print` does not call the function; the matrix has to be passed in.
print(cm)

[[106  11   2   1   5  10]
 [ 12 132   0   7   3   9]
 [  1   2  53   1   4  13]
 [  4   2   2  76  11   9]
 [  2  10   2  12  57  14]
 [  0   3   4   0   4 159]]


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test split.
report = classification_report(y_test, y_pred)
# Print the report: an assignment on its own displays nothing in the notebook.
print(report)


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Class-membership probabilities for the test split, one column per entry of
# rf_classifier.classes_. (The names y_true / y_score used previously are not
# defined in any cell visible in this notebook.)
y_score = rf_classifier.predict_proba(test_features)

# roc_curve is restricted to binary targets, so build one one-vs-rest curve per
# class; each dict is keyed by the encoded class id.
fpr, tpr, thresholds = {}, {}, {}
for column, class_id in enumerate(rf_classifier.classes_):
    fpr[class_id], tpr[class_id], thresholds[class_id] = roc_curve(
        y_test == class_id, y_score[:, column]
    )

# Multiclass AUC: macro average of the one-vs-rest AUCs. `labels` pins the
# column order of y_score to the classifier's own class order.
auc = roc_auc_score(
    y_test, y_score, multi_class='ovr', labels=rf_classifier.classes_
)
print("ROC AUC (One-vs-Rest, Macro Average):", auc)


In [None]:
from joblib import dump

# Persist the fitted classifier under a relative path.
model_path = 'random_forest_model.joblib'
dump(rf_classifier, model_path)
