# Import libraries

In [4]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')

# Load the Dataset

In [7]:
# Read the tweet dataset from a CSV expected in the current working directory.
df = pd.read_csv('cleaned_twitter_disaster.csv')

In [9]:
df.head()

Unnamed: 0,id,keyword,location,text,target,char_length,word_length,cleaned_text,tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69,13,our deeds are the reason of this earthquake ma...,"['our', 'deeds', 'are', 'the', 'reason', 'of',..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,7,forest fire near la ronge sask canada,"['forest', 'fire', 'near', 'la', 'ronge', 'sas..."
2,5,,,All residents asked to 'shelter in place' are ...,1,133,22,all residents asked to shelter in place are be...,"['all', 'residents', 'asked', 'to', 'shelter',..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,8,13000 people receive wildfires evacuation orde...,"['13000', 'people', 'receive', 'wildfires', 'e..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,16,just got sent this photo from ruby alaska as s...,"['just', 'got', 'sent', 'this', 'photo', 'from..."


In [11]:
df.shape

(7613, 9)

In [13]:
df.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'char_length',
       'word_length', 'cleaned_text', 'tokens'],
      dtype='object')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7613 non-null   int64 
 1   keyword       7552 non-null   object
 2   location      5080 non-null   object
 3   text          7613 non-null   object
 4   target        7613 non-null   int64 
 5   char_length   7613 non-null   int64 
 6   word_length   7613 non-null   int64 
 7   cleaned_text  7613 non-null   object
 8   tokens        7613 non-null   object
dtypes: int64(4), object(5)
memory usage: 535.4+ KB


In [17]:
df.describe()

Unnamed: 0,id,target,char_length,word_length
count,7613.0,7613.0,7613.0,7613.0
mean,5441.934848,0.42966,101.037436,14.903586
std,3137.11609,0.49506,33.781325,5.732604
min,1.0,0.0,7.0,1.0
25%,2734.0,0.0,78.0,11.0
50%,5408.0,0.0,107.0,15.0
75%,8146.0,1.0,133.0,19.0
max,10873.0,1.0,157.0,31.0


# Part 2: Feature Engineering

## 2.1. Basic Textual Features

### Word Frequencies (Bag of Words):

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: fit a CountVectorizer with default settings on the cleaned text
# and transform it into a sparse document-term count matrix.
count_vectorizer = CountVectorizer()
X_bow = count_vectorizer.fit_transform(df['cleaned_text'])

In [23]:
X_bow

<7613x15738 sparse matrix of type '<class 'numpy.int64'>'
	with 94346 stored elements in Compressed Sparse Row format>

### TF-IDF (Term Frequency-Inverse Document Frequency):

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features, with the vocabulary capped at 5000 terms via max_features.
# NOTE(review): the vectorizer is fitted on the whole DataFrame here, i.e. before
# the train/test split done later in this notebook, so its vocabulary and IDF
# weights are also computed from rows that end up in the test set.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])

In [28]:
X_tfidf

<7613x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 82213 stored elements in Compressed Sparse Row format>

## 2.2. BERT or Other Transformer Embeddings:

In [30]:
from transformers import BertTokenizer, BertModel
import torch
# Load the pretrained 'bert-base-uncased' tokenizer and encoder; both are used
# as module-level globals by get_bert_embedding.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for one piece of text.

    The text is tokenized with the notebook-level ``tokenizer`` (with
    truncation enabled), run through the notebook-level ``model``, and the
    final hidden states are averaged over dim=1.

    Args:
        text: The string to embed.

    Returns:
        numpy.ndarray: ``last_hidden_state.mean(dim=1)`` converted to NumPy.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every call.
    with torch.no_grad():
        outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.numpy()

# Embed every cleaned tweet, one get_bert_embedding call per row; each cell of
# the new column holds the NumPy array returned by that call.
df['bert_embedding'] = df['cleaned_text'].apply(get_bert_embedding)

OSError: [WinError 1114] A dynamic link library (DLL) initialization routine failed. Error loading "C:\Users\skhai\anaconda3\Lib\site-packages\torch\lib\c10.dll" or one of its dependencies.

In [None]:
df.head(5)

## 2.3 Sentiment Analysis: 

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')

# Score each cleaned tweet and keep only the 'compound' value of the VADER
# polarity scores as the new 'sentiment' column.
# NOTE(review): cleaned_text looks lowercased with punctuation removed (see the
# df.head() preview); VADER is documented to use capitalization and punctuation
# as intensity cues, so scoring the raw 'text' column may be preferable — confirm.
sia = SentimentIntensityAnalyzer()
df['sentiment'] = df['cleaned_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [None]:
df.head(5)

## 2.4- Additional Features: Hashtags, Mentions

In [None]:
# Function to calculate additional features
def extract_additional_features(text):
    """Count hashtags and @-mentions in a tweet.

    Args:
        text: Raw tweet text. Any non-string value (e.g. NaN or None coming
            from a missing cell) is treated as an empty tweet.

    Returns:
        pd.Series: the two counts, indexed by 'num_hashtags' and
        'num_mentions'.
    """
    if not isinstance(text, str):
        # re.findall raises TypeError on NaN/None; count nothing instead.
        text = ""
    num_hashtags = len(re.findall(r"#\w+", text))
    num_mentions = len(re.findall(r"@\w+", text))
    return pd.Series([num_hashtags, num_mentions], index=['num_hashtags', 'num_mentions'])

In [None]:
# Count hashtags and mentions on the raw 'text' column of df (the whole
# dataset; no train/test split exists at this point). Because the function
# returns a Series, apply() yields a DataFrame with one column per count.
additional_future = df['text'].apply(extract_additional_features)
additional_future

## 2.3. Combining Features

In [None]:
from scipy.sparse import hstack
import numpy as np

In [None]:
# Stack the TF-IDF matrix column-wise with the hashtag/mention counts and the
# char_length and sentiment columns into one sparse feature matrix.
# Only X_tfidf is used here; the bag-of-words matrix is not part of the stack.
X_combined = hstack([X_tfidf,additional_future, np.array(df[['char_length','sentiment']])])

In [None]:
X_combined

## Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and binary label column used by every model below.
X = X_combined
y = df['target'] 

# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 2.5- Train Models and Evaluate Performance

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

### 1. Logistic Regression

In [None]:
from sklearn.metrics import classification_report

# Baseline logistic regression on the combined features.
# max_iter is raised above the default of 100: the feature matrix mixes TF-IDF
# weights with the unscaled char_length column, and this notebook silences all
# warnings, so a solver stopping at the iteration cap would go unnoticed.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out test split.
y_pred_log = logistic_model.predict(X_test)
print(classification_report(y_test, y_pred_log))

### 2. Random Forest

In [None]:
# Baseline random forest with default hyperparameters. The seed makes the
# reported metrics reproducible across runs, matching the seed used for the
# train/test split.
random_model = RandomForestClassifier(random_state=42)
random_model.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out test split.
y_pred_ran = random_model.predict(X_test)
print(classification_report(y_test, y_pred_ran))

### 3. Neural Network Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
# Feed-forward binary classifier: three ReLU hidden layers (64 -> 32 -> 16)
# followed by a single sigmoid output unit.
model_neural = Sequential([
    Dense(units=64, activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

model_neural.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Train for 50 epochs in mini-batches of 32 and keep the returned history.
history = model_neural.fit(X_train, y_train, batch_size=32, epochs=50)

In [None]:
# Evaluate the trained network on the held-out test split. evaluate() is
# unpacked into the loss and the metric the model was compiled with (accuracy).
test_loss, test_accuracy = model_neural.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.2f}")

## 2.6- Hyperparameter Tuning Using Grid Search 

### 1. Logistic Regression Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
logistic_model = LogisticRegression()
# Parameter grid for Grid Search, written as a list of sub-grids so that only
# penalty/solver pairs LogisticRegression accepts are fitted:
#   * 'l1' and 'l2' work with both liblinear and saga;
#   * 'elasticnet' works with saga only and needs an l1_ratio.
# A single crossed dict would also schedule unsupported pairs, whose fits fail
# and only waste search time. The unpenalised option is left out because
# liblinear rejects it and its spelling ('none' vs None) depends on the
# scikit-learn release; the largest C value is the weakest regularisation tried.
c_values = [0.01, 0.1, 1, 10, 100]
max_iter_values = [100, 200, 500]
param_grid = [
    {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': c_values,
        'max_iter': max_iter_values,
    },
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.5],
        'C': c_values,
        'max_iter': max_iter_values,
    },
]
# Set up GridSearchCV with 5-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

In [None]:
# Run the grid search on the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

In [None]:
# Keep the estimator that the search selected as best.
best_logistic_model = grid_search.best_estimator_

In [None]:
# Predict test labels with the tuned model. This overwrites the y_pred_log
# produced earlier by the untuned logistic regression.
y_pred_log = best_logistic_model.predict(X_test)

In [None]:
# Show the winning hyperparameters, then the classification report of the
# tuned model on the test split.
print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred_log))

### 2. Random Forest Hyperparameter Tuning

In [None]:
# Random forest to tune; the fixed seed keeps repeated runs comparable.
random_model = RandomForestClassifier(random_state=42)

# Search space: every combination of the values below is evaluated.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)

# 5-fold cross-validated grid search scored by accuracy, using all cores.
grid_search_ran = GridSearchCV(
    estimator=random_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the random-forest grid search on the training split only.
grid_search_ran.fit(X_train, y_train)

In [None]:
# Keep the random forest that the search selected as best.
best_random_model = grid_search_ran.best_estimator_

In [None]:
# Predict test labels with the tuned forest. This overwrites the y_pred_ran
# produced earlier by the untuned random forest.
y_pred_ran = best_random_model.predict(X_test)

In [None]:
# Show the winning hyperparameters, then the classification report of the
# tuned forest on the test split.
print("Best Parameters:", grid_search_ran.best_params_)
print(classification_report(y_test, y_pred_ran))

### 3. Neural Network Model Hyperparameter Tuning

In [None]:
def create_model():
    """Build and compile the network used in the hyperparameter search.

    The model has one 64-unit ReLU hidden layer followed by a single sigmoid
    output unit. The input width is read from the notebook-level ``X_train``.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model_neural_tue = Sequential()
    model_neural_tue.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    # The target is a 0/1 label, so output a probability and train with binary
    # cross-entropy rather than a linear output with mean squared error.
    model_neural_tue.add(Dense(1, activation='sigmoid'))
    model_neural_tue.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model_neural_tue

In [None]:
from scikeras.wrappers import KerasClassifier, KerasRegressor

# Wrap the Keras builder as a scikit-learn *classifier*: the target is a binary
# label, and the search ranks candidates with the estimator's own score(),
# which is classification accuracy for a classifier wrapper.
model_neural_tue = KerasClassifier(model=create_model, epochs=10, batch_size=10, verbose=0)
# Values the search may pick; they override the epochs/batch_size set above.
param_grid = {
    'batch_size': [10, 20, 40],
    'epochs': [10, 20, 50]
}

In [None]:
# Initialize RandomizedSearchCV
# error_score='raise' makes a failing fit raise immediately instead of being
# recorded as a failed candidate.
# NOTE(review): param_grid holds 3 x 3 = 9 combinations, fewer than n_iter=10,
# and no random_state is set — confirm both are intended.
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model_neural_tue, param_distributions=param_grid, n_iter=10, error_score='raise')

In [None]:
# Run the randomized search on the training split; fit() returns the search
# object itself, which is kept as random_result.
random_result = random_search.fit(X_train, y_train)

In [None]:
# Compute accuracy explicitly from thresholded predictions. random_result.score()
# returns the wrapped estimator's default score, which is an accuracy only for
# classifiers; doing it by hand keeps the printed label correct whichever
# wrapper type is used.
test_predictions = np.ravel(random_result.predict(X_test))
accuracy = accuracy_score(y_test, (test_predictions > 0.5).astype(int))
print(f'Accuracy: {accuracy:.4f}')

## 2.7- Save the Best Model 

In [None]:
import pickle 
# Persist the tuned logistic regression. The context manager guarantees the
# file handle is flushed and closed even if pickling fails.
# NOTE(review): the logistic model is saved unconditionally; no comparison with
# the other models is made here — confirm it is the intended "best" model.
with open('regression_model.pkl', 'wb') as model_file:
    pickle.dump(best_logistic_model, model_file)
print(type(best_logistic_model))

# Part 3: Model Evaluation and Validation

## 3.1- Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the logistic-regression test predictions, drawn as an
# annotated heatmap (rows: actual class, columns: predicted class).
class_labels = ['Non-Disaster', 'Disaster']
cm = confusion_matrix(y_test, y_pred_log)

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

## 3.2- ROC Curve and AUC (Area Under the Curve)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
# Second column of predict_proba from the tuned logistic model.
y_pred_prob = best_logistic_model.predict_proba(X_test)[:, 1]

# ROC points and the area under the curve on the test split.
auc_score = roc_auc_score(y_test, y_pred_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color="blue", label=f"AUC = {auc_score:.4f}")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], color="red", linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

## 3.3- Precision-Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall pairs over the thresholds of the predicted probabilities
# (this reassigns the notebook-level `thresholds` variable).
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

fig, ax = plt.subplots()
ax.plot(recall, precision, marker='.', label="Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend()
plt.show()