## Natural Language Processing with Disaster Tweets

In this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which ones aren’t.

**Please upvote and share if this helps you!! Also, feel free to fork this kernel to play around with the code and test it for yourself.**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence every warning category for the rest of the session so library
# deprecation notices do not clutter the cell outputs.
warnings.simplefilter("ignore")

In [None]:
# Load the labelled training tweets and the unlabelled test tweets.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)

In [None]:
# Count the missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Character count of each raw tweet, stored as a new "length" column.
train["length"] = train["text"].map(len)

In [None]:
# Bar chart of how many rows carry each value of "target".
ax = sns.countplot(data=train, x="target", palette="icefire")
ax.set_title('Label Counts')
plt.show()

In [None]:
# Compare the "length" column across the two values of "target".
ax = sns.barplot(data=train, x="target", y="length", palette="icefire")
ax.set_title("Avg. length of each target")
plt.show()

In [None]:
# Side-by-side tweet-length histograms (with KDE overlay), one per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (1, ax1, "disaster tweets"),
    (0, ax2, "Not disaster tweets"),
)
for label, axis, heading in panels:
    lengths = train.loc[train["target"] == label, "length"]
    sns.histplot(lengths, bins=30, ax=axis, kde=True).set(title=heading)
plt.show()

**The length distributions of the two classes appear almost the same. Tweets of 120 to 140 characters are the most common in both.**

In [None]:
import string
import nltk
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()

# Built once at cell level: loading the stop-word list and turning it into a
# set for every single token made the cleaning step needlessly slow.
_STOP_WORDS = frozenset(stopwords.words("english"))

# Matches @mentions, or any single character that is not an ASCII letter,
# digit, space or tab.
_NOISE_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])")


def process_text(text):
    """Return a cleaned, lemmatized version of a tweet.

    The text is lower-cased, @mentions and every character other than
    ASCII letters, digits, spaces and tabs are replaced by a space, the
    result is tokenized, English stop words are dropped, and the remaining
    tokens are lemmatized and joined back with single spaces.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned tweet as a single space-separated string.
    """
    cleaned = _NOISE_PATTERN.sub(" ", text.lower())
    tokens = nltk.word_tokenize(cleaned)
    kept = [lemma.lemmatize(token) for token in tokens if token not in _STOP_WORDS]
    return " ".join(kept)


train["text"] = train["text"].apply(process_text)

In [None]:
import string
import nltk
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()

# Built once at cell level: loading the stop-word list and turning it into a
# set for every single token made the cleaning step needlessly slow.
_STOP_WORDS = frozenset(stopwords.words("english"))

# Matches @mentions, or any single character that is not an ASCII letter,
# digit, space or tab.
_NOISE_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])")


def process_text(text):
    """Return a cleaned, lemmatized version of a tweet.

    The text is lower-cased, @mentions and every character other than
    ASCII letters, digits, spaces and tabs are replaced by a space, the
    result is tokenized, English stop words are dropped, and the remaining
    tokens are lemmatized and joined back with single spaces.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned tweet as a single space-separated string.
    """
    cleaned = _NOISE_PATTERN.sub(" ", text.lower())
    tokens = nltk.word_tokenize(cleaned)
    kept = [lemma.lemmatize(token) for token in tokens if token not in _STOP_WORDS]
    return " ".join(kept)


# The test tweets get exactly the same cleaning as the training tweets so
# the fitted vectorizer sees text in the same form.
test["text"] = test["text"].apply(process_text)

In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Concatenate every training tweet into one string and render a word cloud
# of it (at most 100 words on a black background).
text = " ".join(train["text"])
cloud_builder = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="black",
)
wordcloud = cloud_builder.generate(text)

fig = plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("To Create Cloud of words for all words in train data")
plt.axis("off")
plt.show()

In [None]:
# Concatenate every test tweet into one string and render a word cloud
# of it (at most 100 words on a black background).
text = " ".join(test["text"])
cloud_builder = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="black",
)
wordcloud = cloud_builder.generate(text)

fig = plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("To Create Cloud of words for all words in test data")
plt.axis("off")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the tweet texts; the label is the "target" column.
X = train['text']
y = train['target']

# Hold out 20% of the labelled tweets for evaluation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams; the vocabulary is learned from the
# training split only and then reused for the held-out split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

train_sparse = vectorizer.fit_transform(X_train)
held_out_sparse = vectorizer.transform(X_test)

# NOTE(review): converting to dense arrays can use a lot of memory with a
# bigram vocabulary -- confirm whether the downstream models need dense input.
X_train_vec = train_sparse.toarray()
X_test_vec = held_out_sparse.toarray()

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Candidate models, all with default hyper-parameters.
clf_A, clf_B, clf_C = LogisticRegression(), AdaBoostClassifier(), DecisionTreeClassifier()
clf_D, clf_E, clf_F = SVC(), RandomForestClassifier(), MultinomialNB()

# Order matters to later cells: the models are fitted and scored in this order.
clfs = [
    clf_A,
    clf_B,
    clf_C,
    clf_D,
    clf_E,
    clf_F,
]

In [None]:
# Fit every candidate on the training split and record its F1 score on the
# held-out split. Rows are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# each call.
score_rows = []
# The loop variable stays named `clf` because the prediction cell further
# down reads it after this loop has finished.
for clf in clfs:
    clf.fit(X_train_vec, y_train)
    pred = clf.predict(X_test_vec)
    score3 = f1_score(y_test, pred)
    score_rows.append({"model": clf.__class__.__name__, "f1_score": score3})

df_score = pd.DataFrame(score_rows, columns=['model', 'f1_score'])

df_score

In [None]:
# Vectorize the cleaned test tweets with the vocabulary fitted on the
# training split, then predict their labels.
test_sparse = vectorizer.transform(test['text'])
test_vec = test_sparse.toarray()

# NOTE(review): `clf` is whatever that name was last bound to; after the
# scoring loop that is the final entry of `clfs`, not necessarily the
# best-scoring model -- confirm this is the intended model for submission.
predictions = clf.predict(test_vec)

In [None]:
# Build the submission table: one predicted "target" per test tweet,
# indexed by the tweet's "id", and write it to disk.
submission = (
    pd.DataFrame(predictions, columns=['target'])
    .assign(id=test['id'])
    .set_index('id')
)

submission.to_csv('submission.csv')