## Import libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
import seaborn as sns 
import re


In [None]:
# Show up to 150 characters per column when pandas prints a DataFrame.
pd.options.display.max_colwidth = 150

## Import dataset

In [None]:
# Load the labelled training set and the unlabelled test set in one step.
dataset, dataset_test = (
    pd.read_csv('../input/nlp-getting-started/train.csv'),
    pd.read_csv('../input/nlp-getting-started/test.csv'),
)

In [None]:
# Display the training DataFrame as the cell output.
dataset

## Visualization

In [None]:

# Bar chart of the number of tweets per target value.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=dataset, x="target", palette="dark", linewidth=5, ax=ax)
plt.show()

In [None]:
# Pie chart of the target distribution, labelled with two-decimal percentages.
piedata = dataset['target']
plt.figure(figsize=(6, 6))
piedata.value_counts().plot.pie(autopct='%.2f%%')

In [None]:
# The 20 most frequent keywords, most frequent first, define the bar order.
top_keywords = dataset['keyword'].value_counts().sort_values(ascending=False).iloc[:20].index
sns.countplot(y=dataset['keyword'], order=top_keywords)
plt.title("Count of Keywords")


In [None]:
# Mean target per keyword, i.e. the fraction of tweets with that keyword
# that are labelled as disasters.
keyword_rate = dataset.groupby('keyword')['target'].mean()
disastered_tweet = keyword_rate.sort_values(ascending=False).head(15)
non_disasterd_tweet = keyword_rate.sort_values().head(15)

# Draw each ranking on its own axes. Plotting both on one axes overlays the
# bars and lets the second title replace the first. x/y are passed by keyword
# because recent seaborn releases no longer accept them positionally.
fig, (ax_high, ax_low) = plt.subplots(1, 2, figsize=(14, 4))

sns.barplot(
    data=disastered_tweet.reset_index(),
    x='target',
    y='keyword',
    color='red',
    ax=ax_high,
)
ax_high.set_title('Keywords with highest % of disaster tweets')

sns.barplot(
    data=non_disasterd_tweet.reset_index(),
    x='target',
    y='keyword',
    color='blue',
    ax=ax_low,
)
ax_low.set_title('Keywords with lowest % of disaster tweets')

fig.tight_layout()
plt.show()

In [None]:
# Tweet counts for the 15 most frequent locations, most frequent first.
plt.figure(figsize=(14, 7))
top_locations = dataset['location'].value_counts().sort_values(ascending=False).iloc[:15].index
sns.countplot(y=dataset['location'], order=top_locations)

In [None]:
# Keep only locations that occur at least 10 times.
raw_loc = dataset['location'].value_counts()
top_loc_disaster = raw_loc.index[raw_loc >= 10].tolist()
in_top_locations = dataset['location'].isin(top_loc_disaster)
top_only_disaster = dataset.loc[in_top_locations]

# Mean target per retained location, highest first.
top_location = (
    top_only_disaster
    .groupby('location')['target']
    .mean()
    .sort_values(ascending=False)
)
sns.barplot(x=top_location.index, y=top_location)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Replace missing keyword/location values with the literal string 'None'.
# The result is assigned back to the column instead of using inplace=True on
# an attribute-accessed column: that form is chained assignment, which warns
# in pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
dataset['keyword'] = dataset['keyword'].fillna('None')
dataset['location'] = dataset['location'].fillna('None')

In [None]:
# Count the missing values in each column.
dataset.isnull().sum()

## Data Cleaning

In [None]:
def decontraction(phrase):
    """Lower-case a tweet, expand English contractions and strip noise.

    Parameters
    ----------
    phrase : str
        Raw tweet text.

    Returns
    -------
    str
        The lower-cased text with contractions expanded and with bracketed
        text, URLs, HTML-like tags, newlines and digit-bearing tokens each
        replaced by a single space. Whitespace is not collapsed.
    """
    # Lower-case first so the rules below also match "Won't", "CAN'T", etc.
    # Lower-casing afterwards let the generic "n't" rule turn "Won't" into
    # "wo not" and left fully upper-case contractions untouched.
    phrase = phrase.lower()

    # Order matters: the irregular forms must be expanded before the generic
    # "n't" rule, and "n't" before the bare "'t" rule.
    # NOTE(review): "'s" -> " is" also rewrites possessives ("bob's car").
    contractions = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in contractions:
        phrase = re.sub(pattern, expansion, phrase)

    # Raw strings keep the regex escapes from being read as (invalid) string
    # escapes.
    noise_patterns = (
        r'\[.*?\]',                # text inside square brackets
        r'https?://\S+|www\.\S+',  # URLs
        r'<.*?>+',                 # HTML-like tags
        r'\n',                     # newlines
        r'\w*\d\w*',               # any token containing a digit
    )
    for pattern in noise_patterns:
        phrase = re.sub(pattern, ' ', phrase)
    return phrase

# Clean every training tweet and write the results back to the text column.
dataset['text'] = list(map(decontraction, dataset['text']))

In [None]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
# Instantiate the NLTK lemmatizer and Porter stemmer. Only `ps` is used by
# the cells visible in this notebook; `lemmatizer` is never referenced.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [None]:
# Fetch the NLTK resources. `stopwords` is read by the corpus-building cells;
# `punkt` and `wordnet` are not used by any visible cell (the lemmatizer is
# created but never called).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Build the stop-word set once, outside the loop. It was previously rebuilt
# for every tweet and re-wrapped in set() for every word.
# 'not' is kept as a token -- presumably to preserve negation.
all_stopwords = set(stopwords.words('english'))
all_stopwords.discard('not')

corpus = []
# Iterate over the values directly: label lookup via dataset['text'][i] only
# works while the DataFrame still has its default 0..n-1 index.
for tweet in dataset['text']:
    # Keep letters only, lower-case, and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()
    # Drop stop words and reduce the rest to their Porter stems.
    stems = [ps.stem(word) for word in words if word not in all_stopwords]
    corpus.append(' '.join(stems))
print(corpus)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Create both vectorizers; only the TF-IDF one is fitted here. Its sparse
# output is converted to a dense array.
cv, av = TfidfVectorizer(), CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [None]:
# Display the dense TF-IDF feature matrix.
X

In [None]:
# Length of the first row, i.e. the number of TF-IDF features per tweet.
len(X[0])

In [None]:
# Select the label column by name rather than by position, so the labels
# stay correct if columns are added to or reordered in the DataFrame.
Y = dataset['target'].values


In [None]:
# Display the label array.
Y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the train/test feature and label arrays.
X_train.shape , X_test.shape , Y_train.shape , Y_test.shape

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training split.
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)

In [None]:
# Predict labels for the held-out split.
Y_pred =classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix , accuracy_score
# Confusion matrix (true labels first, predictions second) for the held-out
# split, drawn as a heatmap.
cm = confusion_matrix(Y_test , Y_pred)
sns.heatmap(cm)
# Accuracy on the held-out split; shown as the cell output.
accuracy_score(Y_test , Y_pred)

In [None]:
# Same evaluation on the training split, for comparison with the held-out
# accuracy. Note that `cm` is overwritten here.
Y_pred_train = classifier.predict(X_train)
cm = confusion_matrix(Y_train , Y_pred_train)
sns.heatmap(cm)
accuracy_score(Y_train , Y_pred_train)

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
# Encode each cleaned tweet as a list of integer word indexes using Keras'
# one_hot with a hashing space of voc_size.
# NOTE(review): Keras documents that the default hash used here is Python's
# built-in hash(), which is not stable across interpreter runs -- confirm
# whether run-to-run reproducibility matters. Any change must be mirrored in
# the test-set encoding cell so both use the same mapping.
voc_size = 10000
onehot_rep = [ one_hot(words, voc_size) for words in corpus]

In [None]:
# Display the encoded index lists.
onehot_rep

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# Bring every index list to exactly sent_length entries, padding at the
# front ('pre'). Longer lists are truncated per pad_sequences' defaults.
sent_length = 20
embedded_does= pad_sequences(onehot_rep , padding = 'pre', maxlen =sent_length)

In [None]:
# Display the padded index matrix.
embedded_does

In [None]:
# Build a model consisting of a single Embedding layer that maps each word
# index to a `dim`-dimensional vector. `dim` is now passed to the layer;
# previously it was defined but the size was repeated as a literal 10.
# The model is compiled but never fitted in the visible cells, so its
# outputs come from the initial weights.
dim = 10
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
model.compile('adam', 'mse')
model.summary()

In [None]:
# Run the (unfitted) embedding model over all padded sequences and print the
# resulting vectors.
print(model.predict(embedded_does))

In [None]:
# Display the padded index sequence of the first tweet.
embedded_does[0]

In [None]:
# predict() expects a leading batch axis. Slicing with [0:1] passes the first
# sequence as a batch of one, shape (1, sent_length); the bare row
# embedded_does[0] has shape (sent_length,) and would be treated as
# sent_length separate samples.
print(model.predict(embedded_does[0:1]))

In [None]:
# Use the padded index sequences as the feature matrix from here on; this
# replaces the TF-IDF matrix previously bound to X.
X = np.array(embedded_does)

In [None]:
# Shapes of the new feature matrix and the label array.
X.shape , Y.shape

In [None]:
# Redo the 80/20 split on the padded-index features, with the same seed as
# the TF-IDF split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shape of the training feature matrix after the new split.
X_train.shape

In [None]:
# Refit the existing MultinomialNB `classifier` on the padded index features.
# NOTE(review): these features are hashed word ids, not counts or
# frequencies -- confirm MultinomialNB is really intended for this input.
classifier.fit(X_train , Y_train)

In [None]:
# Predictions of the refitted classifier on the training and held-out splits.
Y_pred_train = classifier.predict(X_train)
Y_pred_1 = classifier.predict(X_test)

In [None]:
# Display the held-out predictions.
Y_pred_1

In [None]:
# Held-out accuracy of the refitted Naive Bayes model. The arguments follow
# scikit-learn's (y_true, y_pred) order, matching the earlier evaluation
# cells; the accuracy value is the same in either order.
accuracy_score(Y_test, Y_pred_1)

## XGBoost

In [None]:
from xgboost import XGBClassifier
# Fit an XGBoost classifier with default settings on the padded index
# features.
classifier_xgb = XGBClassifier()
classifier_xgb.fit(X_train , Y_train)

In [None]:
# XGBoost predictions for the held-out split.
Y_pred_xgb = classifier_xgb.predict(X_test)

In [None]:
# Display the XGBoost held-out predictions.
Y_pred_xgb

In [None]:
# Held-out accuracy of the XGBoost model. The arguments follow
# scikit-learn's (y_true, y_pred) order, matching the earlier evaluation
# cells; the accuracy value is the same in either order.
accuracy_score(Y_test, Y_pred_xgb)

## For Test dataset

In [None]:
# Reload the test CSV (same path as the read at the top of the notebook).
dataset_test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Display the test DataFrame.
dataset_test

In [None]:
# Clean every test tweet and write the results back to the text column.
dataset_test['text'] = list(map(decontraction, dataset_test['text']))

In [None]:
def decontraction(phrase):
    """Lower-case a tweet, expand English contractions and strip noise.

    Parameters
    ----------
    phrase : str
        Raw tweet text.

    Returns
    -------
    str
        The lower-cased text with contractions expanded and with bracketed
        text, URLs, HTML-like tags, newlines and digit-bearing tokens each
        replaced by a single space. Whitespace is not collapsed.
    """
    # Lower-case first so the rules below also match "Won't", "CAN'T", etc.
    # Lower-casing afterwards let the generic "n't" rule turn "Won't" into
    # "wo not" and left fully upper-case contractions untouched.
    phrase = phrase.lower()

    # Order matters: the irregular forms must be expanded before the generic
    # "n't" rule, and "n't" before the bare "'t" rule.
    # NOTE(review): "'s" -> " is" also rewrites possessives ("bob's car").
    contractions = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in contractions:
        phrase = re.sub(pattern, expansion, phrase)

    # Raw strings keep the regex escapes from being read as (invalid) string
    # escapes.
    noise_patterns = (
        r'\[.*?\]',                # text inside square brackets
        r'https?://\S+|www\.\S+',  # URLs
        r'<.*?>+',                 # HTML-like tags
        r'\n',                     # newlines
        r'\w*\d\w*',               # any token containing a digit
    )
    for pattern in noise_patterns:
        phrase = re.sub(pattern, ' ', phrase)
    return phrase

# Second cleaning pass over dataset_test.text: an earlier cell in this
# section already applied decontraction() to the same column.
dataset_test.text = [decontraction(tweet) for tweet in dataset_test.text]

In [None]:
# Build the stop-word set once, outside the loop. It was previously rebuilt
# for every tweet and re-wrapped in set() for every word.
# 'not' is kept as a token -- presumably to preserve negation.
all_stopwords = set(stopwords.words('english'))
all_stopwords.discard('not')

corpus_test = []
# Iterate over the values directly: label lookup via dataset_test['text'][i]
# only works while the DataFrame still has its default 0..n-1 index.
for tweet in dataset_test['text']:
    # Keep letters only, lower-case, and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()
    # Drop stop words and reduce the rest to their Porter stems.
    stems = [ps.stem(word) for word in words if word not in all_stopwords]
    corpus_test.append(' '.join(stems))
print(corpus_test)

In [None]:
# Encode the test corpus with the same one_hot call and voc_size as the
# training corpus. This rebinds `onehot_rep`, which previously held the
# training encodings.
onehot_rep = [ one_hot(words, voc_size) for words in corpus_test]

In [None]:
# Pad the test index lists to sent_length, padding at the front as for the
# training data.
embedded_does_test= pad_sequences(onehot_rep , padding = 'pre', maxlen =sent_length)

In [None]:
# Display the padded test index matrix.
embedded_does_test

In [None]:
# Feature matrix for the Kaggle test set.
X_test_dataset = np.array(embedded_does_test)

In [None]:
# Rebuild the single-layer embedding model. `dim` is now passed to the layer;
# previously it was defined but the size was repeated as a literal 10.
# NOTE(review): no visible cell after this one uses `model` -- the
# predictions below come from the XGBoost classifier -- so this cell looks
# redundant; confirm before removing.
dim = 10
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
model.compile('adam', 'mse')
model.summary()

In [None]:
# Predict labels for the Kaggle test set with the fitted XGBoost classifier.
Y_pred_test_data = classifier_xgb.predict(X_test_dataset)

In [None]:
# Display the predicted labels for the Kaggle test set.
Y_pred_test_data

In [None]:
# Assemble the submission: one row per test tweet with its id and predicted
# target. The id column is written as lower-case 'id' so it matches the
# column name in test.csv.
# NOTE(review): the competition's sample_submission.csv is expected to use
# the header "id,target" -- confirm against that file.
submission_file_test = pd.DataFrame({
    'id': dataset_test['id'],
    'target': Y_pred_test_data,
})
submission_file_test.to_csv('submission_file.csv', index=False)

# Read the file back and preview it as a sanity check of what was written.
submission_file_test = pd.read_csv('submission_file.csv')
submission_file_test.head(10)