___
<a href='http://www.kgptalkie.com'><img src='../kgptalkie_strips.png'/></a>
___

<center><em>Copyright by KGPTalkie</em></center>
<center><em>For free ML tutorials, visit us at <a href='http://www.kgptalkie.com'>www.kgptalkie.com</a> and <a href='http://www.youtube.com/kgptalkie'>www.youtube.com/kgptalkie</a> </em></center>

# Real or Not? Disaster Tweets Classification

In [None]:
import pandas as pd
import numpy as np

import re
import string
import os

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

plt.style.use('ggplot')

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from tensorflow.keras.optimizers import Adam

# Download Data

In [None]:
# Read train.csv of the twitter-disaster-prediction dataset directly from GitHub
# (needs network access). Later cells use its 'text' and 'target' columns.
tweet = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-disaster-prediction-dataset/master/train.csv')

In [None]:
tweet.head()

In [None]:
tweet.shape

In [None]:
tweet.info()

# Exploratory Data Analysis

### Target Class Distribution

In [None]:
# Notebook-wide matplotlib defaults: 8x4 inch figures rendered at 80 dpi.
plt.rcParams.update({
    'figure.figsize': [8, 4],
    'figure.dpi': 80,
})

In [None]:
# Class balance of the labels. The column is passed as `x=` because recent
# seaborn releases no longer accept the plotted variable positionally.
sns.countplot(x='target', data=tweet)
plt.title('Real or Not Real Disaster Tweet')

In [None]:
tweet['target'].value_counts()

In [None]:
tweet['target'].value_counts().plot.pie(autopct='%1.2f%%')

### Number of Characters Distribution in Tweets

In [None]:
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git

In [None]:
import preprocess_kgptalkie as kgp

In [None]:
# Replace `tweet` with the frame returned by kgp.get_basic_features.
# Presumably this adds per-tweet descriptive columns; the cells below read
# 'char_counts', 'word_counts', 'avg_wordlength' and 'stopwords_counts' from it.
# NOTE(review): the frame is overwritten in place of the raw one — confirm the
# helper is safe to run twice before re-executing this cell on its own.
tweet = kgp.get_basic_features(tweet)

In [None]:
tweet.head()

In [None]:
# Histogram + KDE of tweet length in characters. `distplot` is deprecated;
# `histplot(kde=True, stat='density')` is its documented replacement.
sns.histplot(tweet['char_counts'], kde=True, stat='density')

In [None]:
# Smoothed density of tweet length; `fill=` supersedes the deprecated `shade=`.
sns.kdeplot(tweet['char_counts'], fill=True)

In [None]:
# Character-count density per class, overlaid on one axis.
# `fill=` supersedes the deprecated `shade=`; labels + legend identify the curves.
sns.kdeplot(tweet.loc[tweet['target'] == 1, 'char_counts'], fill=True, color='red', label='target = 1')
sns.kdeplot(tweet.loc[tweet['target'] == 0, 'char_counts'], fill=True, color='blue', label='target = 0')
plt.legend()

In [None]:
sns.catplot(y='char_counts', data=tweet, kind='violin', col='target')

### Number of Words, Average Words Length, and Stop words Distribution in Tweets

In [None]:
# Word-count density per class, overlaid on one axis.
# `fill=` supersedes the deprecated `shade=`; labels + legend identify the curves.
sns.kdeplot(tweet.loc[tweet['target'] == 1, 'word_counts'], fill=True, color='red', label='target = 1')
sns.kdeplot(tweet.loc[tweet['target'] == 0, 'word_counts'], fill=True, color='magenta', label='target = 0')
plt.legend()


In [None]:
# Average-word-length density per class, overlaid on one axis.
# `fill=` supersedes the deprecated `shade=`; labels + legend identify the curves.
sns.kdeplot(tweet.loc[tweet['target'] == 1, 'avg_wordlength'], fill=True, color='red', label='target = 1')
sns.kdeplot(tweet.loc[tweet['target'] == 0, 'avg_wordlength'], fill=True, color='magenta', label='target = 0')
plt.legend()


In [None]:
# Stop-word-count density per class, overlaid on one axis.
# `fill=` supersedes the deprecated `shade=`; labels + legend identify the curves.
sns.kdeplot(tweet.loc[tweet['target'] == 1, 'stopwords_counts'], fill=True, color='red', label='target = 1')
sns.kdeplot(tweet.loc[tweet['target'] == 0, 'stopwords_counts'], fill=True, color='magenta', label='target = 0')
plt.legend()


In [None]:
tweet.columns

### Most and Least Common Words

In [None]:
freqs = kgp.get_word_freqs(tweet, 'text')

In [None]:
# Take 20 entries of the frequency table, positions 100..119.
# NOTE(review): despite the name this is not the first 20 entries — presumably the
# first 100 are skipped on purpose (e.g. to get past stop words); confirm.
top20 = freqs[100:120]

In [None]:
# Bar chart of the selected words and their counts, with slanted tick labels
# so the words stay readable.
fig, ax = plt.subplots()
ax.bar(top20.index, top20.values)
ax.tick_params(axis='x', labelrotation=70)
plt.show()

In [None]:
least20 = freqs[-20:]
least20

In [None]:
bigram = kgp.get_ngram(tweet, 'text', ngram_range=2)

In [None]:
bigram[-20:]

### One-Shot Data Cleaning

In [None]:
def get_clean(x):
    """Normalise one raw tweet into lower-case text for vectorisation.

    The value is coerced to ``str`` and lower-cased, then passed through the
    ``preprocess_kgptalkie`` helpers in a fixed order.

    Emails, URLs and HTML tags are removed *before* dots and underscores are
    turned into spaces. Doing the replacement first (as this function used to)
    breaks ``user@host.com`` / ``http://t.co/abc`` into fragments that the
    pattern-based removers can no longer recognise, leaving URL debris in the
    cleaned text.

    Parameters
    ----------
    x : any
        Raw tweet text; non-string values are converted with ``str``.

    Returns
    -------
    The cleaned text as returned by the last kgp helper (presumably ``str``).
    """
    x = str(x).lower().replace('\\', ' ')

    # Pattern-based removals need the original punctuation to be intact.
    x = kgp.remove_emails(x)
    x = kgp.remove_urls(x)
    x = kgp.remove_html_tags(x)

    # Only now flatten the remaining separators into spaces.
    x = x.replace('_', ' ').replace('.', ' ')

    x = kgp.cont_exp(x)
    x = kgp.remove_rt(x)
    x = kgp.remove_accented_chars(x)
    x = kgp.remove_special_chars(x)
    x = kgp.remove_dups_char(x)
    return x

In [None]:
# Clean every tweet; the 'text' column is overwritten with the cleaned version.
tweet['text'] = tweet['text'].apply(get_clean)

In [None]:
tweet.head()['text']

In [None]:
# kgp.get_ngram(tweet, 'text', ngram_range=2)

### Disaster Words Visualization with Word Cloud



In [None]:
# Word-frequency table for tweets labelled target == 1, collapsed into a single
# space-separated string that is fed to WordCloud in the next cell.
# NOTE(review): only the table's index is joined, so the counts themselves are
# not passed on to the word cloud — confirm this is intended.
real = kgp.get_word_freqs(tweet[tweet['target']==1], 'text')
real = ' '.join(real.index)
real

In [None]:
# Word cloud for the target == 1 vocabulary, drawn without axes.
word_cloud = WordCloud(max_font_size=100).generate(real)
fig, ax = plt.subplots()
ax.imshow(word_cloud)
ax.axis('off')
plt.show()

In [None]:
# not real plot

# Same word-cloud recipe for tweets labelled target == 0.
nreal = kgp.get_word_freqs(tweet[tweet['target']==0], 'text')
nreal = ' '.join(nreal.index)
word_cloud = WordCloud(max_font_size=100).generate(nreal)
fig, ax = plt.subplots()
ax.imshow(word_cloud)
ax.axis('off')
plt.show()

## Classification with TFIDF and SVM


In [None]:
text = tweet['text']
y = tweet['target']

In [None]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(text)

In [None]:
X.shape

In [None]:
# Split the raw text first and fit TF-IDF on the training portion only, so the
# vocabulary and IDF weights are never estimated from the held-out tweets
# (fitting on all rows before splitting leaks test information into training).
text_train, text_test, y_train, y_test = train_test_split(
    text, y, test_size=0.2, random_state=0, stratify=y
)
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

In [None]:
def run_SVM(clf, X_train, X_test, y_train, y_test):
    """Train ``clf`` and print its classification report on the test split.

    ``clf`` is fitted in place on ``(X_train, y_train)``; predictions for
    ``X_test`` are then compared against ``y_test``. Nothing is returned.
    """
    clf.fit(X_train, y_train)
    report = classification_report(y_test, clf.predict(X_test))

    print('\nClassification Report')
    print(report)

In [None]:
from sklearn.svm import LinearSVC

In [None]:
clf = LinearSVC()
run_SVM(clf, X_train, X_test, y_train, y_test)

## Classification with `Word2Vec` and SVM


In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import spacy
import en_core_web_lg

In [None]:
nlp = en_core_web_lg.load()

In [None]:
x = 'cat dog'
doc = nlp(x)

In [None]:
doc.vector.shape

In [None]:
def get_vec(x):
    """Run ``x`` through the loaded spaCy pipeline ``nlp`` and return the document vector."""
    return nlp(x).vector

In [None]:
# One document vector per cleaned tweet, stored as an object column.
tweet['vec'] = tweet['text'].apply(get_vec)

In [None]:
tweet.head()

In [None]:
X = tweet['vec'].to_numpy()
X = X.reshape(-1, 1)

In [None]:
X.shape

In [None]:
# Stack the per-tweet vectors into one 2-D matrix with a row per tweet.
# Built straight from the 'vec' column so the cell can be re-run safely and the
# vector width is taken from the data instead of a hard-coded 300.
X = np.vstack(tweet['vec'].to_numpy())

In [None]:
X.shape

In [None]:
y = tweet['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
%%time
clf = LinearSVC()
run_SVM(clf, X_train, X_test, y_train, y_test)

## Word Embeddings and Classification with Deep Learning


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D 

In [None]:
text

In [None]:
token = Tokenizer()
token.fit_on_texts(text)

In [None]:
vocab_size = len(token.word_index) + 1
vocab_size

In [None]:
# Show only a sample of the word -> index mapping; printing the whole
# vocabulary floods the notebook output.
print(dict(list(token.word_index.items())[:20]))

In [None]:
encoded_text = token.texts_to_sequences(text)

In [None]:
# Show the first few encoded tweets only; the full list has one entry per tweet.
print(encoded_text[:5])

In [None]:
# Fixed sequence length fed to the network; also reused by get_encoded() and
# as the Embedding layer's input_length below.
# NOTE(review): 40 looks like a hand-picked cap on tokens per tweet — confirm
# against the word_counts distribution.
max_length = 40
# padding='post' appends the padding after the tokens rather than before them.
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')

In [None]:
print(X)

In [None]:
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
%%time
# clf = LinearSVC()
# run_SVM(clf, X_train, X_test, y_train, y_test)

In [None]:
# Width of the learned word embedding.
vec_size = 100

# Same layer stack as before, declared in one list so the architecture reads
# top to bottom: embedding -> conv/pool block -> two dense layers applied per
# time step -> global max pool over time -> single sigmoid output.
model = Sequential([
    Embedding(vocab_size, vec_size, input_length=max_length),

    Conv1D(32, 2, activation='relu'),
    MaxPooling1D(2),
    Dropout(0.5),

    Dense(32, activation='relu'),
    Dropout(0.5),

    Dense(16, activation='relu'),

    GlobalMaxPooling1D(),

    Dense(1, activation='sigmoid'),
])

In [None]:
model.summary()

In [None]:
%%time
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

In [None]:
def get_encoded(x):
    """Turn one raw text into the padded integer sequence the model expects.

    The text is cleaned with ``get_clean``, mapped to word indices with the
    fitted ``token`` tokenizer, and post-padded to ``max_length``.
    Returns the array produced by ``pad_sequences`` for this single text.
    """
    cleaned = get_clean(x)
    sequences = token.texts_to_sequences([cleaned])
    return pad_sequences(sequences, maxlen=max_length, padding='post')

In [None]:
x = 'i am thrilled to see this'
vec = get_encoded(x)

In [None]:
vec

In [None]:
# The network ends in a single sigmoid unit, so predict() yields one probability
# per row; argmax over that length-1 axis is always 0. Threshold the probability
# at 0.5 to obtain the 0/1 class label instead.
(model.predict(vec) > 0.5).astype(int)

# Other Resources
|  ML Course | Description |
|:---|:---|
| [**Data Visualization in Python Masterclass™: Beginners to Pro**](https://bit.ly/udemy95off_kgptalkie) | Learn to build data visualizations in Python, from beginner to professional level. |
| [**Python for Machine Learning: A Step-by-Step Guide**](https://bit.ly/ml-ds-project) | Learn to build Machine Learning and Deep Learning models using Python and its libraries like Scikit-Learn, Keras, and TensorFlow. |
| [**Python for Linear Regression in Machine Learning**](https://bit.ly/regression-python) | Learn to build Linear Regression models using Python and its libraries like Scikit-Learn. |
| [**Introduction to Spacy 3 for Natural Language Processing**](https://bit.ly/spacy-intro) | Learn to build Natural Language Processing models using Python and its libraries like Spacy. |
| [**Advanced Machine Learning and Deep Learning Projects**](https://bit.ly/kgptalkie_ml_projects) | Learn to build Advanced Machine Learning and Deep Learning models using Python and transformer models like BERT, GPT-2, and XLNet. |
| [**Natural Language Processing in Python for Beginners**](https://bit.ly/intro_nlp) | Learn to build Natural Language Processing Projects using Spacy, NLTK, and Gensim, and transformer models like BERT, GPT-2, and XLNet. |
| [**Deployment of Machine Learning Models in Production in Python**](https://bit.ly/bert_nlp) |  Learn to deploy Machine Learning and Deep Learning models using Python and its libraries like Flask, Streamlit, and NGINX. |
| [**R 4.0 Programming for Data Science - Beginners to Pro**](https://bit.ly/r4-ml) | Learn to build Machine Learning and Deep Learning models using R and its libraries like caret, tidyverse, and keras. |
