In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

#sklearn 
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# nlp preprocessing lib
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import string 
# All ASCII punctuation characters as a single string. clean_text() reads this
# module-level name, so its spelling is kept as-is.
punctation = string.punctuation

In [None]:
# Read the competition's train and test splits from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

### EDA

Let's explore our data.

In [None]:
# Preview the first rows of the training data as loaded.
train_df.head()

In [None]:
# Keep only the tweet text and its label; 'id', 'keyword' and 'location' are
# not used as features anywhere in this notebook.
# errors='ignore' makes the cell safe to re-run: without it a second run raises
# KeyError because the columns have already been removed.
train_df = train_df.drop(columns=['id', 'keyword', 'location'], errors='ignore')

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Column labels currently present in the training frame.
train_df.columns

In [None]:
# Per-column dtype and non-null count.
train_df.info()

In [None]:
# Summary statistics of the training frame's numeric columns.
train_df.describe()

In [None]:
# First tweet whose target is 1.
train_df.loc[train_df["target"] == 1, "text"].iloc[0]

In [None]:
# Second tweet whose target is 1.
train_df.loc[train_df["target"] == 1, "text"].iloc[1]

In [None]:
# duplicated() flags every row that repeats an earlier row exactly; summing the
# boolean flags gives the number of such rows.
print("Number of duplicates in data : {}".format(train_df.duplicated().sum()))

In [None]:
# keep=False flags every member of a duplicate group, and sorting by text puts
# the copies next to each other.
print("Duplicated rows before remove them : ")
train_df.loc[train_df.duplicated(keep=False)].sort_values("text").head(8)

In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each.
train_df = train_df.drop_duplicates()

In [None]:
# Re-count exact duplicate rows to confirm the removal.
print("Number of duplicates in data : {}".format(train_df.duplicated().sum()))

In [None]:
# Class balance: number of tweets for each target value.
train_df['target'].value_counts()

In [None]:
# Bar chart of the number of tweets in each target class.
plt.figure(figsize=(10, 6))
plt.title("Frequencies of tweets for Disaster")
sns.countplot(x='target', data=train_df)
plt.xlabel('Disaster Type')
# Render explicitly, as the other plotting cells do, so the cell shows only the
# figure and not the repr of the Text object returned by plt.xlabel.
plt.show()

In [None]:
# Subset of tweets whose target is 1.
Real_Disaster_df = train_df.loc[train_df['target'].eq(1)]
Real_Disaster_df.head()

In [None]:
# Subset of tweets whose target is 0.
Not_Real_Disaster_df = train_df.loc[train_df['target'].eq(0)]
Not_Real_Disaster_df.head()

In [None]:
# Concatenate every target-1 tweet into one space-separated string; this is
# the input for the word cloud.
Real_Disaster_text = ' '.join(Real_Disaster_df['text'])

In [None]:
# Word cloud generated from the concatenated target-1 tweets.
wordcloud_true = WordCloud().generate(Real_Disaster_text)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud_true)
ax.axis('off')  # the image needs no ticks or frame
ax.set_title("Word Cloud of Real Disaster news")
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Concatenate every target-0 tweet into one space-separated string; this is
# the input for the word cloud.
Not_Real_Disaster_text = ' '.join(Not_Real_Disaster_df['text'])

In [None]:
# Word cloud generated from the concatenated target-0 tweets.
# It gets its own name so that `wordcloud_true` keeps referring to the
# real-disaster cloud instead of being silently overwritten by this one.
wordcloud_false = WordCloud().generate(Not_Real_Disaster_text)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud_false)
plt.axis('off')  # the image needs no ticks or frame
# Title corrected: it was misspelled ("Not RealDisaster twittes").
plt.title("Word Cloud of Not Real Disaster tweets")
plt.tight_layout(pad=0)
plt.show()


### Text Preprocessing

In [None]:
# Normalise one tweet: tokenize it, then drop stop words (a, the, and, ...),
# punctuation tokens and tokens shorter than 3 characters.
def clean_text(text):
    """
        Tokenize ``text`` with ``simple_preprocess`` and keep only the tokens
        that are not in ``STOPWORDS``, are not a punctuation character and
        have at least 3 characters. Kept tokens are lower-cased and joined
        with single spaces.

        text: a string; ``None`` or a float NaN (how pandas represents an
              empty cell) is treated as empty text
        return: cleaned string ("" when no token is kept)
    """
    # A missing value is not text; return an empty result instead of handing
    # it to the tokenizer. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Compare against the individual punctuation characters. Testing
    # `token not in punctation` on the raw string is a substring search,
    # which is not the membership test the filter is meant to be.
    punctuation_chars = set(punctation)

    kept = [
        token.lower()
        for token in simple_preprocess(text)
        if token not in STOPWORDS
        and token not in punctuation_chars
        and len(token) >= 3
    ]
    return " ".join(kept)

In [None]:
# Overwrite each tweet with its cleaned form, then preview the result.
train_df['text'] = train_df['text'].apply(clean_text)
train_df.head()

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed. An unseeded shuffle produces a different
# row order on every run, which makes the seeded train/test split that follows
# (and every score derived from it) non-reproducible.
train_df_shuffled = shuffle(train_df, random_state=42)
train_df_shuffled.head()

In [None]:
# Features are the tweet texts, labels are the target column.
X, y = train_df_shuffled['text'], train_df_shuffled['target']

# Hold out 20% of the rows for evaluation; stratify=y keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Inspect the held-out tweets.
X_test

In [None]:
from sklearn.model_selection import cross_val_score

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes.
nb_classifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

nb_classifier.fit(X_train, y_train)

# Score on the held-out split. accuracy_score is declared as
# (y_true, y_pred), so the true labels go first; accuracy itself is symmetric,
# so the printed value is unchanged.
y_pred = nb_classifier.predict(X_test)
print('accuracy {}'.format(accuracy_score(y_test, y_pred)))

In [None]:
# Token counts -> TF-IDF weighting -> linear model trained with SGD.
# NOTE(review): scikit-learn documents 'epsilon_insensitive' as a loss designed
# for regression (it is accepted for classification) -- confirm it is intended
# here rather than the default 'hinge'.
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='epsilon_insensitive',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=1000,
        tol=None,  # no early stopping: always run the full max_iter epochs
    )),
])

sgd.fit(X_train, y_train)

# Score on the held-out split. accuracy_score is declared as
# (y_true, y_pred), so the true labels go first; accuracy itself is symmetric,
# so the printed value is unchanged.
y_pred = sgd.predict(X_test)
print('accuracy {}'.format(accuracy_score(y_test, y_pred)))

In [None]:
# Reduce the test frame to the tweet text, mirroring the training frame.
# errors='ignore' makes the cell safe to re-run: without it a second run raises
# KeyError because the columns have already been removed.
test_df = test_df.drop(columns=['id', 'keyword', 'location'], errors='ignore')

In [None]:
# Clean the test tweets with the same function used on the training tweets,
# then preview the result.
test_df['text'] = test_df['text'].apply(clean_text)
test_df.head()

In [None]:
# Predict the competition test set with the Naive Bayes pipeline.
# Note: y_pred is reused; from here on it holds these test-set predictions,
# not the held-out-split predictions from the evaluation cells.
y_pred = nb_classifier.predict(test_df['text'])

In [None]:
# Load the sample submission file; its "target" column is overwritten with
# the model's predictions before saving.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [None]:
# Replace the target column with the predictions held in y_pred.
sample_submission["target"] = y_pred

In [None]:
# Preview the submission before writing it out.
sample_submission.head()

In [None]:
# Write the submission to the working directory, without the index column.
sample_submission.to_csv("submission.csv", index=False)

Now, in the viewer, you can submit the above file to the competition! Good luck!