# SIMPLE TEXT CLASSIFICATION STEPS
*****************

* **SUMMARY OF DATA**

    - Total Samples
    - Total Features
    - Check Null Values
    - Check the balance of the target classes
**********
* **CLEANING** 

    - Drop Duplicates
    - Drop Null Values
    - Resampling for imbalanced classes
***************
* **TEXT PREPROCESSING**

    * Removing irrelevant words such as @mentions or http links etc.
    * Remove Punctuations
    * Lowercase 
*****************
* **VECTORIZATION**

    * Convert text into numerical features using Tf-idf
*************
* **MODEL TRAINING & EVALUATION**

    * Creating pipeline of simple models
****************
* **CONCLUSION**

    * Choosing model with best F1 score
*******************
* **PREDICTION**

    * Predicting on test dataset
******************
****************

# LOADING DATA

In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy

In [None]:
# Read the training CSV into a DataFrame and display it.
TRAIN_CSV_PATH = '../input/nlp-getting-started/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df

# SUMMARY OF DATA

In [None]:
# DataFrame.shape is (rows, columns): rows are samples, columns are features.
n_samples, n_features = train_df.shape
print("Total Number of Samples ", n_samples)
print("\nTotal Number of Features ", n_features)

### Check for null data in the "text" column

In [None]:
# True if at least one entry in the "text" column is missing.
train_df['text'].isnull().any()

In [None]:
# Keep only the columns used for modelling. Take an explicit copy so that the
# cleaning steps operate on an independent frame and never on a selection of
# train_df (this avoids pandas' SettingWithCopy ambiguity).
df = train_df[['id', 'text', 'target']].copy()

In [None]:
# Number of samples per target class.
df['target'].value_counts()

### Check whether the target class is balanced or imbalanced

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# value_counts() orders its result by frequency, not by class label, so the
# hard-coded bar labels could end up on the wrong bars. Sorting by label makes
# the order deterministic.
# NOTE(review): assumes the lower label (0) means "Normal" and the higher
# label (1) means "Disaster" - confirm against the dataset description.
class_counts = df.target.value_counts().sort_index()

sns.barplot(x=['Normal', 'Disaster'], y=class_counts.values)
plt.show()

The two classes differ only slightly in size, so the dataset can be treated as balanced and no resampling is required.

# CLEANING
******************

* Drop Duplicates
* Drop Null Values

### Dropping Duplicates

In [None]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including any emitted by later cells. It is kept here only because later
# cells may rely on the quieter output.
warnings.filterwarnings(action='ignore')

# Remove rows whose tweet text is an exact duplicate, keeping the first
# occurrence. Reassigning the result (instead of inplace=True) avoids mutating
# a frame that was derived from train_df by column selection.
rows_before = len(df)
df = df.drop_duplicates(subset='text')
print("Total Duplicates ", rows_before - len(df))

In [None]:
# Number of rows remaining in df.
df.shape[0]

### Dropping NaN Values

***************

There are no NaN values in the df

In [None]:
# Select the "text" entries that are missing; an empty result means there is
# nothing to drop.
missing_text_mask = df['text'].isna()
null_rows = df.loc[missing_text_mask, 'text']
null_rows

# TEXT PREPROCESSING
*************

* Removing irrelevant words such as @mentions or http links etc.
* Remove Punctuations
* Lowercase 

In [None]:
import re

In [None]:
# helper function

# Characters that are replaced by a single space at the end of cleaning.
_PUNCT_FILTERS = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
# Built once at definition time; clean_text is called once per document.
_PUNCT_TO_SPACE = str.maketrans(_PUNCT_FILTERS, " " * len(_PUNCT_FILTERS))


def clean_text(text):
    """Normalise a raw tweet into lowercase text ready for tokenisation.

    Steps, in order:
      1. Drop every non-ASCII character.
      2. Remove retweet prefixes ("RT @user:"), @mentions, t.co links,
         other "https://host.tld/path" links, '#' signs, square brackets,
         backslashes and quote characters.
      3. Replace "&amp" with "and".
      4. Strip surrounding whitespace and lowercase.
      5. Replace remaining punctuation, tabs and newlines with spaces.

    Args:
        text: The raw text. Any object is accepted; it is converted with
            ``str()`` first.

    Returns:
        The cleaned string. It may contain runs of spaces where
        punctuation was removed.
    """
    te = str(text).encode('ascii', 'ignore').decode('ascii')

    # The retweet prefix must go *before* bare @mentions are stripped,
    # otherwise "RT @user:" has already become "RT :" and never matches.
    te = re.sub(r"RT @\w+:", '', te)
    te = re.sub(r'@\w+', '', te)

    # Dots are escaped so they only match a literal '.' in the host name.
    te = re.sub(r'https?://t\.co/\w+', '', te)
    te = re.sub(r'#', '', te)
    # Replace a standalone "RT" with one space so its neighbours stay
    # separate words instead of being glued together.
    te = re.sub(r" RT ", ' ', te)
    te = re.sub(r"https://\w+\.\w+/\w+", '', te)
    te = re.sub(r"[\[\]]", '', te)
    te = re.sub(r"&amp", "and", te)

    # remove the characters [\], ['] and ["]
    te = re.sub(r"\\", "", te)
    te = re.sub(r"\'", "", te)
    te = re.sub(r"\"", "", te)

    # convert text to lowercase
    te = te.strip().lower()

    # replace punctuation characters with spaces
    return te.translate(_PUNCT_TO_SPACE)

# VECTORIZATION
*****************
* Convert the text into numerical features using Tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Turn each tweet into a TF-IDF weighted vector over unigrams and bigrams.
# clean_text replaces the vectorizer's built-in preprocessing step.
# NOTE(review): the vectorizer is fitted on the whole training set before
# cross-validation, so IDF statistics include the validation folds - confirm
# this is acceptable for the reported scores.
vectorizer = TfidfVectorizer(
    preprocessor=clean_text,
    stop_words="english",
    ngram_range=(1, 2),
)

training_features = vectorizer.fit_transform(df['text'])

# MODEL TRAINING & EVALUATION

In [None]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Candidate classifiers as (display name, single-step pipeline) pairs.
est = [
    ('LogisticRegression', Pipeline([('LR', LogisticRegression())])),
    ('BernoulliNB', Pipeline([('BNB', BernoulliNB())])),
    ('MultinomialNB', Pipeline([('MNB', MultinomialNB())])),
    ('LinearSVC', Pipeline([('LNB', LinearSVC())])),
]

In [None]:
%%time

# Training
model_scores = {}

p_scorer = make_scorer(precision_score)
r_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
a_scorer = make_scorer(accuracy_score)

for i in est:
    kfold = KFold(n_splits=7, shuffle=True, random_state=4)
    p_scores = cross_val_score(i[1], training_features, df.target, cv=kfold, scoring=p_scorer)
    r_scores = cross_val_score(i[1], training_features, df.target, cv=kfold, scoring=r_scorer)
    f1_scores = cross_val_score(i[1], training_features, df.target, cv=kfold, scoring=f1_scorer)
    a_scores = cross_val_score(i[1], training_features, df.target, cv=kfold, scoring=a_scorer)
    
    model_scores.update({ i[0]:{'accuracy': a_scores.mean(), 'f1_score':f1_scores.mean(), 'precision': p_scores.mean(), 'recall':r_scores.mean()} })

In [None]:
# Print each model's name followed by its mean cross-validation metrics.
for model_name, metrics in model_scores.items():
    print('\n', model_name)
    print('\n', metrics)

# CONCLUSION
********************

* We will choose the model with the best F1 score, which is the harmonic mean of precision and recall.

In [None]:
# Rank the models by mean F1 score, best first, and show the winner.

def _mean_f1(entry):
    """Return the F1 score from a (model name, metrics dict) pair."""
    return entry[1]['f1_score']

top_models_score = sorted(model_scores.items(), key=_mean_f1, reverse=True)
top_models_score[0]

In [None]:
# Look up the best-ranked pipeline by name and refit it on all training data.
best_model_name = top_models_score[0][0]
top_model = dict(est)[best_model_name]
top_model.fit(training_features, df.target)

# PREDICTION

In [None]:
# Read the test CSV and preview its first rows.
TEST_CSV_PATH = '../input/nlp-getting-started/test.csv'
test_df = pd.read_csv(TEST_CSV_PATH)
test_df.head()

In [None]:
# Reuse the vectorizer fitted on the training text (transform only, no
# refit), then predict a label for every test tweet.
test_features = vectorizer.transform(test_df['text'])
predictions = top_model.predict(test_features)

In [None]:
# Pair each test id with its predicted label and write the result to disk
# without the DataFrame index.
submission = pd.DataFrame({
    'id': test_df['id'],
    'target': predictions,
})

submission.to_csv('submission.csv', index=False)