**Importing Libraries for preprocessing**

In [None]:
import numpy as np   
import pandas as pd 
import matplotlib.pyplot as plt  
import seaborn as sns
# NLTK setup: download the 'punkt' and 'stopwords' resources, then create the
# stemmer that the preprocessing function defined later reads as a global.
# NOTE(review): recent NLTK releases may load tokenizer data from 'punkt_tab'
# instead of 'punkt' - confirm against the installed version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): this expression is not the last line of the cell, so its value
# is neither displayed nor stored.
stopwords.words('english')
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; transform_text() calls ps.stem().
ps = PorterStemmer()
import string
# Last expression of the cell, so the punctuation characters are displayed.
string.punctuation

In [None]:
# Load the labelled training tweets.
# NOTE(review): this absolute /kaggle/input path presumably only resolves inside
# a Kaggle kernel with the dataset attached - confirm before running elsewhere.
train_df=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

In [None]:
# Preview the first rows.
train_df.head()

In [None]:
# (rows, columns) of the raw frame.
train_df.shape

In [None]:
# Column dtypes and non-null counts.
train_df.info()

In [None]:
# Number of missing values per column.
train_df.isnull().sum()

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
train_df.describe()

In [None]:
# `keyword` and `location` are not used as features for the predictions, so
# only the remaining columns are kept.
# The columns are dropped by name and the frame is rebound (no inplace
# mutation): a positional drop of columns 1..2 would remove `text` and `target`
# if this cell were ever executed a second time. With errors='ignore' a re-run
# is a harmless no-op.
train_df = train_df.drop(columns=['keyword', 'location'], errors='ignore')

**Analysing the data**

In [None]:
# Preview the frame again after the column drop above.
train_df.head()

In [None]:
# Number of rows per target class (most frequent class first).
train_df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Class balance of the target column.
# value_counts() orders its rows by frequency, not by class value, so the slice
# labels are looked up from each row's class instead of being hard-coded by
# position (a positional list mislabels the slices whenever class 0 is the
# more frequent one).
# NOTE(review): target == 1 is taken to mean "disaster tweet", as in the
# competition's data description - confirm.
target_counts = train_df['target'].value_counts()
target_names = {0: 'Non-Disaster Tweets', 1: 'Disaster Tweets'}
plt.pie(
    target_counts,
    labels=[target_names[target_class] for target_class in target_counts.index],
    autopct='%1.1f%%',
)
plt.title('Share of disaster vs. non-disaster tweets')
plt.show()

In [None]:
# Length of each tweet in characters.
train_df['num_characters'] = train_df['text'].apply(len)

In [None]:
# Number of tokens per tweet according to NLTK's word tokenizer.
train_df['num_words'] = train_df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
# Number of sentences per tweet according to NLTK's sentence tokenizer.
train_df['num_sentences'] = train_df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
# Check the three new length columns.
train_df.head()

In [None]:
plt.figure(figsize=(12,5))
sns.histplot(train_df[train_df['target'] == 0]['num_characters'])
sns.histplot(train_df[train_df['target'] == 1]['num_characters'],color='red')
plt.show()

In [None]:
plt.figure(figsize=(12,5))
sns.histplot(train_df[train_df['target'] == 0]['num_words'])
sns.histplot(train_df[train_df['target'] == 1]['num_words'],color='red')
plt.show()

In [None]:
plt.figure(figsize=(12,5))
sns.histplot(train_df[train_df['target'] == 0]['num_sentences'])
sns.histplot(train_df[train_df['target'] == 1]['num_sentences'],color='red')
plt.show()


**Preprocessing Texts**

In [None]:
# Cache for the stop-word set, filled on first use by _english_stopwords().
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def transform_text(text):
    """Normalise a raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, tokenize it with NLTK, keep only
    alphanumeric tokens, drop English stop words and punctuation, then apply
    the Porter stemmer (``ps``) to every remaining token.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string when no
        token survives the filters).
    """
    # Look the stop words up once per call in a cached set, instead of asking
    # NLTK for the full list again for every single token.
    stop_words = _english_stopwords()
    kept_tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum()
        and token not in stop_words
        and token not in string.punctuation
    ]
    return " ".join(ps.stem(token) for token in kept_tokens)

In [None]:
# Smoke-test the preprocessing function on a single sentence.
transform_text("Several people have perished from the earthquake till now.")

In [None]:
# Clean every training tweet. Note the column name contains a space
# ('transformed text'), whereas the test frame later uses 'transformed_text'.
train_df['transformed text'] = train_df['text'].apply(transform_text)

In [None]:
# Check the new cleaned-text column.
train_df.head()

**Defining the model**

In [None]:
# Text vectorizers: bag-of-words counts (CountVectorizer) and TF-IDF weights.
# NOTE(review): `cv` is created but not referenced again in the visible cells.
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer()

In [None]:
# Fit TF-IDF on all cleaned training tweets and materialise the result as a
# dense array.
# NOTE(review): the vectorizer is fitted before the train/test split below, so
# the held-out rows also contribute to the fitted vocabulary/IDF statistics -
# the evaluation scores may be optimistic.
X = tfidf.fit_transform(train_df['transformed text']).toarray()

In [None]:
# (number of tweets, number of TF-IDF features)
X.shape

In [None]:
# Display the feature matrix.
X

In [None]:
# Labels as a NumPy array, aligned row-for-row with X.
y = train_df['target'].values

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB , MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
# One instance of each Naive Bayes variant, all with default parameters.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Models compared in the evaluation loop of the next cell.
Nb_models = [gnb,mnb,bnb]

In [None]:
# Report sections printed for every model: (heading, metric function). Each
# metric is evaluated right before it is printed, in this order.
metric_sections = (
    ("\nAccuracy Score:\n", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nPrecision Score:\n", precision_score),
)

for model in Nb_models:
    # Train on the training split, then predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(model)
    for heading, metric in metric_sections:
        print(heading)
        print(metric(y_test, y_pred))
    print("\n********************************************\n")

In [None]:
# As seen above, Gaussian Naive Bayes yielded underwhelming results.
# The next step is to try the most popular scikit-learn classifiers and compare their results, so that we end up with a single model.
# Criteria to consider: accuracy, precision, performance and processing time

In [None]:
#Importing classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers for the comparison. The ensemble models use 50
# estimators, and random_state=2 is set wherever it is passed below.
# `mnb` and `bnb` are rebound here, replacing the instances created for the
# Naive Bayes comparison earlier in the notebook.
svc = SVC(kernel='sigmoid')
knc = KNeighborsClassifier()
mnb = MultinomialNB()
bnb = BernoulliNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression()
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# Short display name -> classifier instance; the keys become the 'Algorithm'
# column of the results table built later.
clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'NB': mnb,
    'NB1':bnb,
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed from the predictions on ``X_test``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return accuracy_score(y_test, predictions), precision_score(y_test, predictions)

In [None]:
# Scores are collected in the iteration order of `clfs`.
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    # Train and score one candidate on the shared train/test split.
    current_accuracy, current_precision = train_classifier(
        clf, X_train, y_train, X_test, y_test
    )

    # Three report lines per classifier: its name, accuracy and precision.
    for label, value in (
        ("For ", name),
        ("Accuracy - ", current_accuracy),
        ("Precision - ", current_precision),
    ):
        print(label, value)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

In [None]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)

In [None]:
performance_df

**Final Model Selection**

In [None]:
# Shortlist of the best-scoring model types so far, as fresh (unfitted)
# instances with default parameters.
final_models = dict(
    lr=LogisticRegression(),
    mnb=MultinomialNB(),
    bnb=BernoulliNB(),
)

In [None]:
import time

def classify(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` and report its precision plus train and test accuracy.

    Returns
    -------
    tuple
        ``(precision, train_accuracy, test_accuracy)``. Precision and test
        accuracy come from the predictions on ``X_test``; train accuracy is
        ``clf.score`` on the training split.
    """
    fitted = clf.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return (
        precision_score(y_test, predictions),
        clf.score(X_train, y_train),
        accuracy_score(y_test, predictions),
    )

# Fit, time and score each shortlisted model.
# Every model is fitted exactly once: the reported metrics are computed from
# the very same fit/predict run whose duration is printed, rather than training
# the model a second time only to measure it.
for name, clf in final_models.items():
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; only fit + predict are inside the timed region.
    start = time.perf_counter()
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    end = time.perf_counter()

    i_precision = precision_score(y_test, y_pred)
    i_test_accuracy = accuracy_score(y_test, y_pred)

    # Print the timing and the scores for this model.
    print('[{}]\nProcessing Time: {} secs | Test Accuracy: {} | Precision: {}\n'.format(name,
                                                                                            round(end-start, 2),
                                                                                            round(i_test_accuracy, 2),
                                                                                            round(i_precision, 2)))

In [None]:
# Load the unlabelled test tweets and preview them.
# NOTE(review): this absolute /kaggle/input path presumably only resolves inside
# a Kaggle kernel with the dataset attached - confirm before running elsewhere.
test_df=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_df.head()

In [None]:
# Apply the same text cleaning that was applied to the training tweets.
test_df['transformed_text'] = test_df['text'].apply(transform_text)

In [None]:
eval=tfidf.transform(test_df['transformed_text']).toarray()

In [None]:
start = time.time()
final_pred =bnb.predict(eval)
end = time.time()
pred_time = end - start

print('Prediction time: {} secs'.format(round(pred_time, 2)))

In [None]:
submission = test_df[['id']].reset_index(drop=True)
submission['target'] = final_pred.astype('int64')
submission

In [None]:
submission.to_csv('submission.csv', index=False)