In [1]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

# This sample uses Keras Core, the multi-backend version of Keras.
# The selected backend is TensorFlow (other supported backends are 'jax' and 'torch')
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

Collecting keras-core
  Downloading keras_core-0.1.7-py3-none-any.whl.metadata (4.3 kB)
Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-core
Successfully installed keras-core-0.1.7
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.[0m[31m
[0m

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

print("TensorFlow version:", tf.__version__)
print("KerasNLP version:", keras_nlp.__version__)

2024-03-27 20:02:09.843610: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-27 20:02:09.843808: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-27 20:02:10.011614: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using TensorFlow backend
TensorFlow version: 2.15.0
KerasNLP version: 0.8.2


In [3]:
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

print('Training Set Shape = {}'.format(df_train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(df_train.memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(df_test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(df_test.memory_usage().sum() / 1024**2))

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test Set Shape = (3263, 4)
Test Set Memory Usage = 0.10 MB


In [4]:
submission_id = df_test['id']

In [5]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
df_train['location'].describe()

count     5080
unique    3341
top        USA
freq       104
Name: location, dtype: object

In [8]:
df_train['keyword'].describe()

count           7552
unique           221
top       fatalities
freq              45
Name: keyword, dtype: object

In [9]:
target = df_train['target']
df_train = df_train.drop(columns=['id'])
df_test = df_test.drop(columns=['id'])

In [10]:
df_train.fillna('', inplace=True)
df_test.fillna('',inplace=True)

In [11]:
"""

1. Preprocessing the dataset for ShallowML ex. case normalization, removing special characters
2. Feature extraction for ShallowML ex. Common text features:
-Vectorization: BOW, Bag of N-Grams, TF-IDF, Sentence Embeddings
-Calculations on the text: Length, number of nouns, readability scores, complexity scores, tone (e.g., LIWC)
-Meta data about the text: author, date, venue
3. Using any Machine Learning algorithm and getting the results as a base line model for later comparison

"""

import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator,TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def transform(self, X, **transform_params):
        X_lower = [text.lower() for text in X]   # Case Normalization
        X_processed = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in X_lower] # Removing the special characters
        return X_processed
    
    def fit(self, X, y=None, **fit_params):
        return self

class ColumnExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def transform(self, X, **transform_params):
        return X[self.column]
    
    def fit(self, X, y=None, **fit_params):
        return self

X_train, X_test, y_train, y_test = train_test_split(df_train[['keyword','location','text']], target, test_size=0.2, random_state=42)

In [12]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('selector', ColumnExtractor(column='text')),
            ('preprocessor', TextPreprocessor()),
            ('vectorizer', TfidfVectorizer())
        ])),
        ('keyword', Pipeline([
            ('selector', ColumnExtractor(column='keyword')),
            ('vectorizer', TfidfVectorizer())
        ])),
        ('location', Pipeline([
            ('selector', ColumnExtractor(column='location')),
            ('vectorizer', TfidfVectorizer())
        ]))
    ])),
    ('classifier', RandomForestClassifier())
])


pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)

In [13]:
accuracy = accuracy_score(y_test, predictions)
print("F1 Scores:", f1_score(y_test, predictions))
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, predictions))

F1 Scores: 0.7021276595744681
Accuracy: 0.7793827971109653
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.91      0.82       874
           1       0.83      0.61      0.70       649

    accuracy                           0.78      1523
   macro avg       0.79      0.76      0.76      1523
weighted avg       0.79      0.78      0.77      1523



In [14]:
preds = pipeline.predict_proba(df_test)

In [15]:
preds1 = []
for pred in preds:
    preds1.append(pred[-1])

In [16]:
preds_dict  = {
    "id": submission_id,
    "target": preds1
}
submission = pd.DataFrame(preds_dict)

In [17]:
submission.shape

(3263, 2)

In [18]:
submission.to_csv("/kaggle/working/submission2.csv",index=False)

In [19]:
submission.head()

Unnamed: 0,id,target
0,0,0.58
1,2,0.64
2,3,0.6
3,9,0.19
4,11,0.75
