## 1. Import and read data

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import sys
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Report the interpreter and library versions this notebook runs against.
version_report = [
    ('Python {}', sys.version),
    ('Numpy {}', np.__version__),
    ('Panda {}', pd.__version__),
    ('NLTK {}', nltk.__version__),
    ('Seaborn {}', sns.__version__),
]
for template, version in version_report:
    print(template.format(version))

### 1.1 Load Dataset

In [None]:
# Load the training split (train.csv) of the "nlp-getting-started" dataset.
df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_train.info()
print('===================================')
print(df_train.head())
print('===================================')

**The id, keyword and location columns are not important for this analysis, so we drop them.**

In [None]:
# Discard the columns that are not used further; df_train is modified in place.
df_train.drop(columns=['id', 'keyword', 'location'], inplace=True)

### 1.2 Average Word Length per Tweet

In [None]:
def avg_word(sentence):
    """Return the mean length, in characters, of the words in ``sentence``.

    Words are the whitespace-separated tokens produced by ``str.split()``.

    Args:
        sentence: Text to measure.

    Returns:
        The average word length as a float, or ``0.0`` when ``sentence``
        contains no words (empty or whitespace-only) instead of raising
        ``ZeroDivisionError``.
    """
    words = sentence.split()
    if not words:
        # Nothing to average: avoid dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Store each tweet's mean word length in a new column.
df_train['avg_word'] = df_train['text'].apply(avg_word)

In [None]:
print('Disaster Tweets')
print('=================================')
# Keep only the rows whose target label is 1.
Disaster = df_train.loc[df_train['target'] == 1]
Disaster.head()

In [None]:
print('Non-Disaster Tweets')
print('=================================')
# Keep only the rows whose target label is 0.
Non_Disaster = df_train.loc[df_train['target'] == 0]
Non_Disaster.head()

### 1.3 Class Distribution

In [None]:
# Number of tweets per target class.
classes = df_train['target']
print(classes.value_counts())

## 2. Preprocessing

### 2.1 Lowercase

In [None]:
# Lowercase every token; split()/join also collapses whitespace runs into
# single spaces.
df_train['text'] = df_train['text'].apply(
    lambda tweet: " ".join([token.lower() for token in tweet.split()])
)
df_train['text'].head()

### 2.2 Replace URLs

In [None]:
# Replace every URL with a single space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave every URL untouched.
# NOTE(review): the "&amp;" inside the character classes looks like an
# HTML-escaped "&"; as written it only adds ';' to the accepted characters —
# confirm before simplifying.
df_train['text'] = df_train['text'].str.replace(
    r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?',
    ' ',
    regex=True,
)
df_train['text'].head()

### 2.3 Special Characters Removal

In [None]:
# Strip the retweet marker, mention/hashtag symbols, remaining punctuation
# and digits.
df_train['text'] = (
    df_train['text']
    # \b anchors "rt" to the start of a word so that text such as
    # "start the" is no longer mangled into "stathe".
    .str.replace(r'\brt ', '', regex=True)
    .str.replace('@', '', regex=False)
    .str.replace('#', '', regex=False)
    # regex=True is explicit because pandas >= 2.0 defaults to literal matching.
    .str.replace(r'[^\w\s]', '', regex=True)
    # [0-9] rather than [1-9], so that zeros are removed as well.
    .str.replace(r'[0-9]', '', regex=True)
)
df_train['text'].head()

### 2.4 Removal of Numbers

In [None]:
# Remove integer and decimal numbers.
# regex=True is explicit because pandas >= 2.0 defaults to literal matching,
# which would turn this step into a silent no-op.
df_train['text'] = df_train['text'].str.replace(r'\d+(\.\d+)?', '', regex=True)
df_train['text'].head()

### 2.5 Removal of Punctuation and of Leading, Trailing and Repeated Whitespace

In [None]:
# regex=True is explicit on every call because pandas >= 2.0 defaults to
# literal matching, which would turn these steps into silent no-ops.
# 1) Replace any remaining punctuation with a space.
df_train['text'] = df_train['text'].str.replace(r'[^\w\d\s]', ' ', regex=True)
# 2) Trim leading and trailing whitespace.
df_train['text'] = df_train['text'].str.replace(r'^\s+|\s+?$', '', regex=True)
# 3) Collapse runs of whitespace into a single space.
df_train['text'] = df_train['text'].str.replace(r'\s+', ' ', regex=True)
df_train['text'].head()

### 2.6 Removing Stopwords

In [None]:
from nltk.corpus import stopwords

# NLTK's English stopword list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Rebuild each tweet from the tokens that are not stopwords.
df_train['text'] = df_train['text'].apply(
    lambda tweet: ' '.join([token for token in tweet.split() if token not in stop_words])
)
df_train['text'].head()

### 2.7 Stemming

In [None]:
from nltk.stem import PorterStemmer

# Reduce every token to its Porter stem.
st = PorterStemmer()
df_train['text'] = df_train['text'].apply(
    lambda tweet: " ".join(map(st.stem, tweet.split()))
)
df_train['text'].head()

### 2.8 Convert a Collection of Text Documents to a Matrix of Token Counts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count features, capped at a vocabulary of 1500 terms.
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(df_train.text).toarray()
# Select the labels by column name: a positional lookup (iloc[:, 1]) only
# works while 'target' happens to be the second remaining column.
y = df_train['target'].values

In [None]:
# Show the feature matrix and the label vector, separated by a rule.
print(X, '=============================', y, sep='\n')

## 3. Modeling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; random_state = 0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

### 3.1 Define Models to Train

**All selected models use their default parameters, except `SGDClassifier` (`max_iter = 100`) and `SVC` (`kernel = 'linear'`).**

In [None]:
# Display names, in the same order as the estimators in `classifiers`.
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

# Candidate estimators to compare.
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear'),
]

# Materialise the pairs as a list: a bare zip() is a one-shot iterator, so
# re-running the comparison cell would silently loop over nothing.
models = list(zip(names, classifiers))

### 3.2 Comparing Accuracy of Different Models

In [None]:
# Fit each candidate on the training split and print its accuracy on the
# held-out split, as a percentage.
for name, model in models:
    nltk_model = model.fit(X_train, y_train)  # fit() returns the estimator itself
    accuracy = 100 * nltk_model.score(X_test, y_test)
    print("{} Accuracy: {}".format(name, accuracy))

### 3.3 Selected Model

In [None]:
# Fit a fresh logistic-regression model (default parameters) on the training split.
selected_classifier =  LogisticRegression()
selected_classifier.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out split.
prediction = selected_classifier.predict(X_test)

### 3.4 Print Classification Report and Confusion Matrix

In [None]:
print(classification_report(y_test, prediction))

# Two-level labels: the outer level names the axis, the inner level the class.
class_labels = ['Non_Disaster', 'Disaster']
pd.DataFrame(
    confusion_matrix(y_test, prediction),
    index=[['actual'] * 2, class_labels],
    columns=[['predicted'] * 2, class_labels],
)

## 4. Test Dataset

### 4.1 Preprocessing of Dataset

In [None]:
# Load the test split (test.csv) of the "nlp-getting-started" dataset.
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Preview the first rows of the test data.
print(df_test.head())

In [None]:
# Discard the same columns as for the training data; df_test is modified in place.
df_test.drop(columns=['id', 'keyword', 'location'], inplace=True)

In [None]:
# Apply the training-text cleaning steps to the test tweets.
# Every step reads from and writes to df_test: assigning the URL and number
# steps to df_train would overwrite the training text and leave the test text
# uncleaned. regex=True is explicit because pandas >= 2.0 defaults to literal
# matching in str.replace.

#---Lowercase-------
df_test['text'] = df_test['text'].apply(lambda x: " ".join(word.lower() for word in x.split()))

#---Replace URLs----
df_test['text'] = df_test['text'].str.replace(
    r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?',
    ' ',
    regex=True,
)

#---Special Characters Removal---
df_test['text'] = (
    df_test['text']
    # \b keeps words such as "start" intact; only a standalone "rt" is removed.
    .str.replace(r'\brt ', '', regex=True)
    .str.replace('@', '', regex=False)
    .str.replace('#', '', regex=False)
    .str.replace(r'[^\w\s]', '', regex=True)
    # [0-9] rather than [1-9], so that zeros are removed as well.
    .str.replace(r'[0-9]', '', regex=True)
)

#---Removal of Numbers------
df_test['text'] = df_test['text'].str.replace(r'\d+(\.\d+)?', '', regex=True)

#----Removal of punctuation, leading, trailing and in-between whitespace----
df_test['text'] = df_test['text'].str.replace(r'[^\w\d\s]', ' ', regex=True)
df_test['text'] = df_test['text'].str.replace(r'^\s+|\s+?$', '', regex=True)
df_test['text'] = df_test['text'].str.replace(r'\s+', ' ', regex=True)

#-----Removing Stopwords-------
df_test['text'] = df_test['text'].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

#-----Stemming--------------------
df_test['text'] = df_test['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
df_test['text'].head()

In [None]:
# Encode the test tweets with the vocabulary already learned from the training
# tweets. transform() keeps the feature columns aligned with those the
# classifier was fitted on; fit_transform() would rebuild the vocabulary from
# the test set and make the columns mean something different.
df_test = cv.transform(df_test.text).toarray()

In [None]:
# Predict a label for every row of the vectorised test data.
final_predictions = selected_classifier.predict(df_test)

## 5. Verdict

In [None]:
# Start from the sample submission file and remove its 'target' column.
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
submission.drop(columns=['target'], inplace=True)

In [None]:
# Attach the model's predictions as the 'target' column and write the result
# to submission.csv without the index column.
submission['target']=final_predictions
submission.to_csv('submission.csv',index = False)