In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import re
import string
import nltk    

# Visualization
import matplotlib.pyplot as plt    
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
import seaborn as sns
import plotly.figure_factory as ff
from wordcloud import WordCloud
# Feature Engineering
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
# Models
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score,confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the competition's training and test sets from the Kaggle input directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [None]:
# Preview the first rows of the training set
train_df.head()

In [None]:
# Preview the first rows of the test set
test_df.head()

In [None]:
# Data preprocessing - cleaning & stemming
stemmer = nltk.SnowballStemmer("english")

# Built once at definition time. stopwords.words() returns a list, so calling
# it inside the token loop rebuilt that list and did a linear scan for every
# single word of every tweet.
_STOP_WORDS = frozenset(stopwords.words('english'))

# (contraction, expansion) pairs, applied in order on lowercased text.
# Specific forms must come before the generic suffix rules ("n't", "'s", ...).
# Compound forms such as "won't've" or "'ll've" need no rule of their own:
# the first half is expanded by its rule and the trailing "'ve" by the
# generic "'ve" rule, giving e.g. " will not have".
_CONTRACTIONS = (
    ("won't", " will not"),
    ("can't", " can not"),
    ("don't", " do not"),
    ("ma'am", " madam"),
    ("let's", " let us"),
    ("ain't", " am not"),
    ("shan't", " shall not"),
    ("sha'n't", " shall not"),
    ("o'clock", " of the clock"),
    ("y'all", " you all"),
    ("n't", " not"),
    ("'re", " are"),
    ("'s", " is"),
    ("'d", " would"),
    ("'ll", " will"),
    ("'t", " not"),
    ("'ve", " have"),
    ("'m", " am"),
)

# Patterns are compiled once; raw strings avoid invalid-escape warnings.
_URL_RE = re.compile(r'http\S+|www\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_DIGIT_RE = re.compile(r'[0-9]')
_MENTION_RE = re.compile(r'@\S+')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))


def preprocess_text(text):
    """Clean a raw tweet and return its stemmed, space-joined tokens.

    Steps, in order:
      1. lowercase (done first so the case-sensitive URL and contraction
         rules also catch "Can't", "Y'all", "HTTP://...");
      2. drop URLs;
      3. expand English contractions;
      4. replace HTML-like tags with a space, drop digits, @mentions and
         ASCII punctuation;
      5. split on whitespace, drop NLTK English stop words, stem the rest
         with the Snowball stemmer;
      6. drop stems of two characters or fewer.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN from a missing cell)
        is treated as empty.

    Returns
    -------
    str
        The remaining stems joined by single spaces; may be ''.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_RE.sub('', text)

    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)

    text = _HTML_TAG_RE.sub(' ', text)
    text = _DIGIT_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _PUNCT_RE.sub('', text)

    # Stop words are filtered before stemming, on the unstemmed token.
    stems = (stemmer.stem(token) for token in text.split() if token not in _STOP_WORDS)
    return ' '.join(stem for stem in stems if len(stem) > 2)

In [None]:
# Class balance: number of tweets per target label
plt.figure(figsize=(14, 6))
sns.set_style('whitegrid')
target_ax = sns.countplot(x='target', data=train_df)
target_ax.set_title('CountPlot for Target')
plt.tight_layout()
plt.show()

In [None]:
# Character count of each tweet's text, stored as a new 'length' column
train_df['length'] = train_df['text'].map(len)

In [None]:
# Distribution of tweet length (in characters), split by target class
plt.figure(figsize=(20, 8))
sns.histplot(data=train_df, x='length', hue='target', kde=True, bins=70)
plt.title('Distribution of text length')
# tight_layout() has to run before show(); calling it afterwards does not
# affect the figure that was already displayed.
plt.tight_layout()
plt.show()

In [None]:
# Count tweets per keyword (non-null targets), most frequent keyword first
keyword = train_df.groupby('keyword')['target'].count()
df_key = (
    keyword
    .rename_axis('keywords')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the 20 most frequent keywords
plt.figure(figsize=(14, 5))
sns.barplot(x='keywords', y='count', data=df_key.head(20))
plt.title('Top 20 keywords on Tweets')
plt.xticks(rotation=50)
plt.tight_layout()
plt.show()

In [None]:
# Count tweets per location (non-null targets), most frequent location first
location = train_df.groupby('location')['target'].count()
df_loc = (
    location
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the 20 most frequent locations
plt.figure(figsize=(14, 5))
sns.barplot(x='location', y='count', data=df_loc.head(20))
plt.title('Top 20 locations of Tweets')
plt.xticks(rotation=50)
plt.tight_layout()
plt.show()

In [None]:
# Handle NaN keyword & location data by replacing them with empty strings.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on frame[column]: that form is chained in-place
# assignment, which warns in recent pandas and leaves the frame unchanged
# once Copy-on-Write is active.
for frame in (train_df, test_df):
    for column in ('keyword', 'location'):
        frame[column] = frame[column].fillna('')

In [None]:
# Clean and stem the tweet text of both datasets.
# NOTE: the 'text' column is overwritten, so re-running this cell feeds
# already-processed text back through preprocess_text.
train_df['text'] = train_df['text'].map(preprocess_text)
test_df['text'] = test_df['text'].map(preprocess_text)

In [None]:
# Preview the training set after text preprocessing
train_df.head()

In [None]:
# Build one long string per class for the word clouds.
# Both corpora are joined with a space; joining with '' would glue the last
# word of each tweet to the first word of the next one.
disaster = ' '.join(train_df[train_df['target'] == 1]['text'])
non_disaster = ' '.join(train_df[train_df['target'] == 0]['text'])

In [None]:
# Word cloud of the 100 most frequent terms in disaster tweets
plt.figure(figsize=(14, 6))
wordcloud = WordCloud(width=1000, height=500, max_words=100).generate(disaster)
plt.imshow(wordcloud, interpolation='bilinear', cmap='magma')
plt.axis('off')
# The title is set before tight_layout() so the layout pass accounts for it.
plt.title('Disaster Wordcloud', fontsize=25)
plt.tight_layout()
plt.show()

In [None]:
# Word cloud of the 100 most frequent terms in non-disaster tweets
plt.figure(figsize=(14, 6))
wordcloud = WordCloud(width=1000, height=500, max_words=100).generate(non_disaster)
plt.imshow(wordcloud, interpolation='bilinear', cmap='magma')
plt.axis('off')
# The title is set before tight_layout() so the layout pass accounts for it.
plt.title('Non-Disaster Wordcloud', fontsize=25)
plt.tight_layout()
plt.show()

In [None]:
# Turn the preprocessed tweets into dense TF-IDF feature vectors
# (vocabulary capped at 5000 terms, NLTK English stop words removed).
# NOTE(review): the vectorizer is fitted on the full training frame before
# the train/validation split below — confirm that is intended.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=5000,
)

X = tfidf_vectorizer.fit_transform(train_df['text'])
X = X.toarray()
y = train_df['target']

In [None]:
# Hold out 33% of the labelled data for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=5,
    test_size=0.33,
)

In [None]:
# Fit a multinomial Naive Bayes model and predict the validation labels
mnb = MultinomialNB().fit(X_train, y_train)
y_pred = mnb.predict(X_val)

In [None]:
# Multinomial NB: accuracy on the validation split
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Validation Accuracy:", accuracy)

In [None]:
# Multinomial NB: per-class metrics, confusion matrix, and stored accuracy
print(classification_report(y_true=y_val, y_pred=y_pred))

mnb_confusion = confusion_matrix(y_val, y_pred)
ConfusionMatrixDisplay(confusion_matrix=mnb_confusion).plot()
score_mnb = accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Fit a random forest (wrapped in a single-step pipeline) and predict the
# validation labels. `rf` itself is used again further down for predict_proba.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_split=2,
    criterion='gini',
    random_state=42,
)
pipe_rf = make_pipeline(rf)
pred = pipe_rf.fit(X_train, y_train).predict(X_val)

In [None]:
# Random Forest: accuracy on the validation split
accuracy = accuracy_score(y_true=y_val, y_pred=pred)
print("Validation Accuracy:", accuracy)

In [None]:
# Random Forest: per-class metrics, confusion matrix, and stored accuracy.
# All three use `pred` (the random-forest predictions); `y_pred` holds the
# Naive Bayes predictions and would report the wrong model here.
print(classification_report(y_val, pred))
ConfusionMatrixDisplay(confusion_matrix(y_val, pred)).plot()
score_rf = accuracy_score(y_val, pred)

In [None]:
# ROC-AUC on the validation set, using each model's probability for class 1
for model_label, fitted_model in (
    ("Multinomial NB", mnb),
    ("Random Forest Clasifier", rf),
):
    roc_auc = roc_auc_score(y_val, fitted_model.predict_proba(X_val)[:, 1])
    print(f"{model_label} - ROC-AUC Score:", roc_auc)

In [None]:
# Side-by-side validation accuracy (in percent) of both models, best first
data = {
    'modles': ['Random Forest Classifier', 'Multinomial NB'],
    'Score': [score_rf, score_mnb],
}
df = pd.DataFrame(data).assign(Score=lambda scores: scores['Score'] * 100)
df.sort_values(by='Score', ascending=False)

In [None]:
# Project the test tweets onto the TF-IDF vocabulary learned from the
# training data, then densify
X_test = tfidf_vectorizer.transform(test_df['text'])
X_test = X_test.toarray()

In [None]:
# Predict test-set labels with the multinomial Naive Bayes model
test_predictions = mnb.predict(X_test)
test_df['target'] = test_predictions

In [None]:
# Write the submission file: one row per test tweet with its id and predicted target
submission_df = test_df.loc[:, ['id', 'target']]
submission_df.to_csv('submission.csv', index=False)