In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries & Functions**

In [None]:
import nltk
import pandas as pd
import re
import string 
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import warnings

# **Loading the datasets**

In [None]:
# Silence all warnings from here on so the cell outputs stay uncluttered.
warnings.filterwarnings("ignore")

# Read the competition's train and test CSV files into DataFrames.
train_csv = r'/kaggle/input/nlp-getting-started/train.csv'
test_csv = r'/kaggle/input/nlp-getting-started/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Show the data type of every column in the training data.
train.dtypes

# **Data Visualization**

In [None]:
# Bar chart of how many training tweets fall into each target class.
sns.set_theme(style="darkgrid")
plot = sns.countplot(data=train, x='target')
plot.set_title("Disaster tweets - 1, Non-disaster tweets - 0")

In [None]:
# Word cloud built from the text of disaster tweets (target == 1).

# Words left out of the cloud: NLTK's English stop words plus URL fragments
# and a few other tokens that carry no meaning here.
eliminated_words_d = nltk.corpus.stopwords.words('english') + ['http', 'co','https','new','like','via', 'U','amp']

plt.figure(figsize=(15, 8))

# All disaster tweets joined into one string, which is what generate() consumes.
disaster_text = " ".join(train[train.target == 1].text)
wc = WordCloud(
    max_words=600,
    width=500,
    height=300,
    stopwords=eliminated_words_d,
).generate(disaster_text)

plt.imshow(wc, interpolation='bilinear')

In [None]:
# Word cloud built from the text of non-disaster tweets (target == 0).

# Words left out of the cloud: NLTK's English stop words plus URL fragments.
elminated_words_nd = nltk.corpus.stopwords.words('english') + ['http', 'co','https']

plt.figure(figsize=(15, 8))

# All non-disaster tweets joined into one string, which is what generate() consumes.
non_disaster_text = " ".join(train[train.target == 0].text)
wc = WordCloud(
    max_words=600,
    width=400,
    height=250,
    stopwords=elminated_words_nd,
).generate(non_disaster_text)

plt.imshow(wc, interpolation='bilinear')

# **Exploratory Data Analysis**

> **Missing Value Check**

In [None]:
# Number of missing values in each column of the training data.
train.isna().sum()

In [None]:
# Number of missing values in each column of the test data.
test.isna().sum()

**Ignoring keyword and location features as they contain missing values**

# **Pre-processing**

# * **Cleaning the text**

In [None]:
# The lemmatizer and the stop-word list are created once and shared by every clean_text call.
wn=nltk.WordNetLemmatizer()

stopwords=nltk.corpus.stopwords.words('english')

# Set copy of the stop words so membership tests do not scan the whole list per token.
_STOPWORD_SET=frozenset(stopwords)

# URLs are matched while their punctuation is still intact. Once "://", "." and "/" are
# stripped, a link collapses into one meaningless token (e.g. "httpstcoabc") that can no
# longer be told apart from a real word.
_URL_PATTERN=re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE=str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Turn one raw text into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. remove whole URLs (http://..., https://..., www....),
      2. lowercase the text and delete the characters in ``string.punctuation``,
      3. split into tokens made of word characters (``\\w+``),
      4. drop NLTK's English stop words and lemmatize what is left.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Replace each URL with a space so the words on either side stay separate.
    text = _URL_PATTERN.sub(' ', text)

    # Lowercase and strip punctuation in one pass.
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenize on runs of word characters; anything else acts as a separator.
    tokens = re.findall(r'\w+', text)

    # Remove stop words and reduce every remaining word to its base form.
    return [wn.lemmatize(token) for token in tokens if token not in _STOPWORD_SET]

# * **Setting the values**

In [None]:
# Only id and text are kept as inputs; keyword and location are left out.
feature_columns = ['id', 'text']
X_train = train[feature_columns]
X_test = test[feature_columns]

# The target column of the training data is what the models learn to predict.
y_train = train['target']

In [None]:
# Print the shape of the target and of both input frames, one per line.
for split in (y_train, X_train, X_test):
    print(split.shape)

# * **Initializing Vectorizer**

In [None]:
# TfidfVectorizer converts text to numeric data based on how important each word is
# within that text. clean_text is plugged in as the analyzer, so it does the tokenizing.
vectorizer = TfidfVectorizer(analyzer=clean_text)

# Fit on the training text. fit() returns the vectorizer itself, so both names refer
# to the same fitted object.
vectorizer.fit(X_train['text'])
tfidf_vect_fit = vectorizer


In [None]:
# Transform the train text first, then the test text, with the vectorizer fitted above.
tfidf_train, tfidf_test = (
    tfidf_vect_fit.transform(frame['text']) for frame in (X_train, X_test)
)

In [None]:
# Concatenate id and the vectorized text for both train and test data.
#
# The TF-IDF columns get string names ("tfidf_0", "tfidf_1", ...). With the default
# integer labels the frame would mix a string column name ('id') with integer ones,
# and scikit-learn (1.2 and later) raises a TypeError for such frames at fit/predict.
#
# NOTE(review): 'id' is part of these frames, so the models fitted on them receive it
# as a feature. Confirm this is intended; an identifier normally carries no signal.
def _id_plus_tfidf(ids, tfidf_matrix):
    """Return a DataFrame holding ``ids`` followed by one dense column per TF-IDF feature."""
    tfidf_frame = pd.DataFrame(tfidf_matrix.toarray()).add_prefix('tfidf_')
    return pd.concat([ids.reset_index(drop=True), tfidf_frame], axis=1)


X_train_vect = _id_plus_tfidf(X_train['id'], tfidf_train)
X_test_vect = _id_plus_tfidf(X_test['id'], tfidf_test)

# **Modelling**

In [None]:
# Random forest classifier. random_state is fixed so that re-running the notebook
# produces the same forest and therefore the same predictions.
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, random_state=42)

# Fit the model on the training data.
rf_model = rf.fit(X_train_vect, y_train)

# Predict the target variable for the test data and display the predictions.
y_pred = rf_model.predict(X_test_vect)
y_pred

In [None]:
# Gaussian Naive Bayes model, fitted on the same training features and target.
gnb = GaussianNB()
gnb.fit(X=X_train_vect, y=y_train)

In [None]:
# Predict the target for the test data with the Naive Bayes model and display the result.
y_pred_nb = gnb.predict(X=X_test_vect)
y_pred_nb

In [None]:
# Gradient boosting classifier. random_state is fixed so that re-running the notebook
# gives the same model.
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11, random_state=42)

# Fit the model on the training data.
gb_model = gb.fit(X_train_vect, y_train)

# Predict whether each test text is related to a disaster, then display these
# predictions (the gradient boosting ones, not the Naive Bayes ones).
y_pred_gb = gb_model.predict(X_test_vect)
y_pred_gb

**Save the data (id and predicted target) to a CSV file for submission.
The Random Forest classifier gave a higher score than the Naive Bayes and Gradient Boosting classifiers.**

************************************END******************************************************