In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.corpus import words

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

from nltk.stem import WordNetLemmatizer

In [None]:
# Load the Kaggle train and test CSVs (ISO-8859-1 encoded) and stack them into
# a single frame; the notebook performs its own train/test split further down.
train = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding="ISO-8859-1", low_memory=False)
test = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv', encoding="ISO-8859-1", low_memory=False)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent. As before, the row index of each input is kept as-is.
df = pd.concat([train, test], sort=False)

In [None]:
# Preview the first 20 rows of the combined train+test frame.
df.head(20)

In [None]:
# List the distinct values of the Sentiment column, i.e. the strings that the
# label mapping has to cover.
df['Sentiment'].unique()

In [None]:
# Collapse the five sentiment strings into three integer classes:
# -1 (negative), 0 (neutral) and 1 (positive).
mapping = {
    'Extremely Negative': -1,
    'Negative': -1,
    'Neutral': 0,
    'Positive': 1,
    'Extremely Positive': 1,
}
df['label'] = df['Sentiment'].map(mapping)

In [None]:
# Preview the first 20 rows again, now including the numeric `label` column.
df.head(20)

In [None]:
# Keep only the tweet text and its numeric label; every other column is dropped.
columns_to_keep = ['OriginalTweet','label']
df = df[columns_to_keep]

In [None]:
# Preview the first 20 rows of the two-column (text, label) frame.
df.head(20)

In [None]:
# Remove rows with a missing value in any remaining column. This mutates `df`
# in place rather than returning a new frame.
df.dropna(inplace=True)

### Removing URLs

In [None]:
def url_cleaning(tweet):
    """Replace every URL in ``tweet`` with the literal token ``link``.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.

    Args:
        tweet: The text to clean.

    Returns:
        The text with each URL substituted by ``link``.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'link', tweet)

# Replace URLs in every tweet (overwrites the OriginalTweet column), then show
# the first five cleaned tweets as a spot check.
df['OriginalTweet'] = df['OriginalTweet'].apply(url_cleaning)
display(df['OriginalTweet'].head(5))

In [None]:

def text_cleaning_1(tweet):
    """Normalise country and virus names in ``tweet`` and strip punctuation.

    Steps, applied in this order (all matching is case-sensitive):

    1. ``usa`` / ``USA`` / ``u s`` -> ``America`` and ``uk`` / ``UK`` ->
       ``England``, as whole words only.
    2. ``USAgov`` -> ``USA government``.
    3. ``the US`` -> ``America``, as a whole phrase only.
    4. ``Coronavirus`` / ``Covid19`` -> `` covid `` (padded with spaces).
    5. Every non-word character and every underscore -> one space each.

    Args:
        tweet: The text to clean.

    Returns:
        The cleaned text. Runs of spaces are not collapsed.
    """
    # Word boundaries rather than literal surrounding spaces: a space-delimited
    # pattern misses the token at the start or end of the string, next to
    # punctuation ("USA.", "#UK") and in back-to-back repeats ("usa usa").
    tweet = re.sub(r"\b(?:usa|USA|u s)\b", "America", tweet)
    tweet = re.sub(r"\b(?:uk|UK)\b", "England", tweet)
    tweet = re.sub(r"USAgov", "USA government", tweet)
    # The trailing \b stops "the US" from matching the prefix of "the USA" or
    # "the USB", which previously produced "AmericaA" / "AmericaB".
    tweet = re.sub(r"\bthe US\b", "America", tweet)
    tweet = re.sub(r"Coronavirus|Covid19", " covid ", tweet)
    # \W does not cover "_" (it is a word character), hence the explicit class.
    tweet = re.sub(r"[\W_]", " ", tweet)
    return tweet

In [None]:
# Run the name normalisation / punctuation stripping over every tweet
# (overwrites the OriginalTweet column), then show the first five results.
df['OriginalTweet'] = df['OriginalTweet'].apply(text_cleaning_1)
display(df['OriginalTweet'].head(5))

In [None]:
# Preview the first 20 rows after URL and punctuation cleaning.
df.head(20)

### Lower Case

In [None]:
# Lower-case all tweet text. This runs after text_cleaning_1, whose patterns
# are case-sensitive.
df['OriginalTweet'] = df['OriginalTweet'].str.lower()

### Removing Stop words

In [None]:
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, building it once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def stop_word(tweet):
    """Remove English stop words from ``tweet``.

    The text is tokenised with ``word_tokenize``; tokens that appear in NLTK's
    English stop-word list are dropped and the rest are re-joined with single
    spaces. The membership test is case-sensitive.

    Args:
        tweet: The text to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The stop-word set is cached so it is not rebuilt for every tweet, and the
    # tokens are filtered once (the original filtered twice and threw the first
    # result away).
    stop_words = _english_stop_words()
    return ' '.join(w for w in word_tokenize(tweet) if w not in stop_words)

In [None]:
# Strip stop words from every tweet (overwrites the OriginalTweet column).
df['OriginalTweet'] = df['OriginalTweet'].apply(stop_word)

In [None]:
# Preview the first 20 tweets after stop-word removal.
df['OriginalTweet'].head(20)

In [None]:
# NOTE(review): `spell` is never defined because the line below is commented
# out, so correct_spellings raises NameError if called as-is. It presumably
# expects a pyspellchecker SpellChecker instance - confirm before re-enabling.
#spell = SpellChecker()
def correct_spellings(text):
    """Replace words that ``spell`` does not recognise with its best correction.

    The text is split on whitespace; each word reported by ``spell.unknown`` is
    swapped for ``spell.correction(word)`` and all other words are kept.

    Args:
        text: Whitespace-separated text.

    Returns:
        The words, corrected where applicable, joined by single spaces.
    """
    tokens = text.split()
    misspelled_words = spell.unknown(tokens)
    # correction() may return None when it has no candidate; keep the original
    # word in that case so " ".join() is not handed a None.
    return " ".join(
        (spell.correction(word) or word) if word in misspelled_words else word
        for word in tokens
    )

In [None]:
#df['OriginalTweet'] = df['OriginalTweet'].apply(correct_spellings)

### Tokenization and Lemmatization

In [None]:
# Shared lemmatizer instance used by lemma_wordnet below.
lem = WordNetLemmatizer()


def lemma_wordnet(input):
    """Return a list with the WordNet lemma of each token in ``input``."""
    lemmas = []
    for token in input:
        lemmas.append(lem.lemmatize(token))
    return lemmas


# Tokenise each tweet, then lemmatise its tokens; the column now holds lists of
# strings instead of plain strings.
df['OriginalTweet'] = df['OriginalTweet'].apply(word_tokenize).apply(lemma_wordnet)

In [None]:
def combine_word(tweet):
    """Join the tokens in ``tweet`` back into one space-separated string."""
    separator = " "
    return separator.join(tweet)
# Turn each tweet's token list back into a single string, which is the input
# form the vectorizers below are fitted on.
df['OriginalTweet'] = df['OriginalTweet'].apply(combine_word)

#### Split data into Train and Test data sets 

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports are all in the
# first cell.
from sklearn.model_selection import train_test_split


# Split the cleaned tweets and their labels into train and evaluation parts.
# No test_size/train_size is passed, so scikit-learn's default ratio applies;
# random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(df['OriginalTweet'], 
                                                    df['label'], 
                                                    random_state=0)

## Vectorization with CountVectorizer

In [None]:
# Bag of character n-grams (lengths 1-4, built within word boundaries), keeping
# only n-grams seen in at least 5 training tweets. The vocabulary is learned
# from the training split only and then applied to both splits.
# ngram_range is passed as a tuple: recent scikit-learn releases validate the
# parameter type and reject a list.
vect = CountVectorizer(min_df=5, ngram_range=(1, 4), analyzer='char_wb').fit(X_train)
X_train_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)

## Vectorization with TfidfVectorizer (TF-IDF)

In [None]:
# TF-IDF over word n-grams (1 to 4 words), keeping only n-grams seen in at
# least 3 training tweets. Fitted on the training split only.
# NOTE: this rebinds `vect`, so the CountVectorizer fitted earlier is no longer
# reachable under that name.
# ngram_range is passed as a tuple: recent scikit-learn releases validate the
# parameter type and reject a list.
vect = TfidfVectorizer(min_df=3, ngram_range=(1, 4)).fit(X_train)
X_train_vect_TFID = vect.transform(X_train)
X_test_vect_TFID = vect.transform(X_test)

## Applying ML

In [None]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Compute a ROC AUC score for multiclass labels via label binarisation.

    A ``LabelBinarizer`` is fitted on ``y_test``; both the true and the
    predicted labels are converted to indicator form with it and passed to
    ``roc_auc_score``.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels (labels, not probabilities or scores).
        average: Averaging mode forwarded to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    binarizer = preprocessing.LabelBinarizer()
    true_indicator = binarizer.fit_transform(y_test)
    pred_indicator = binarizer.transform(y_pred)
    return roc_auc_score(true_indicator, pred_indicator, average=average)

#### MultinomialNB

In [None]:
# Sweep MultinomialNB's smoothing parameter on the CountVectorizer features and
# report AUC and accuracy on the held-out split for each setting.
print("MultinomialNB with CountVectorizer\n")
alpha = [0.01, 0.1, 1.0, 5.0, 10.0, 20.0, 100.0]
for value in alpha:
    model = MultinomialNB(alpha=value)
    model.fit(X_train_vect, y_train)
    y_predicted = model.predict(X_test_vect)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With alpha set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

In [None]:
# Same smoothing sweep as for the count features, but on the TF-IDF features.
print("MultinomialNB with Tfid Vectorizer\n")
alpha = [0.01, 0.1, 1.0, 5.0, 10.0, 20.0, 100.0]
for value in alpha:
    model = MultinomialNB(alpha=value)
    model.fit(X_train_vect_TFID, y_train)
    y_predicted = model.predict(X_test_vect_TFID)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With alpha set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

####  Decision Tree Classifier

In [None]:
# Sweep the maximum tree depth on the CountVectorizer features and report AUC
# and accuracy on the held-out split for each setting.
print("DecisionTreeClassifier with CountVectorizer\n")
depth = [3,6,9,12,15]
for value in depth:
    # random_state is fixed (same seed as the train/test split) so that the
    # tree, and therefore the printed scores, are reproducible across runs.
    model = DecisionTreeClassifier(max_depth = value, random_state=0).fit(X_train_vect, y_train)
    y_predicted = model.predict(X_test_vect)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With max_depth set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

In [None]:
# Sweep the maximum tree depth on the TF-IDF features and report AUC and
# accuracy on the held-out split for each setting.
# The title previously said "CountVectorizer" although this cell trains on the
# TF-IDF matrices.
print("DecisionTreeClassifier with Tfid Vectorizer\n")
depth = [3,6,9,12,15]
for value in depth:
    # random_state is fixed (same seed as the train/test split) so that the
    # tree, and therefore the printed scores, are reproducible across runs.
    model = DecisionTreeClassifier(max_depth = value, random_state=0).fit(X_train_vect_TFID, y_train)
    y_predicted = model.predict(X_test_vect_TFID)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With max_depth set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

### Logistic Regression

In [None]:
# Sweep the inverse regularisation strength C of logistic regression on the
# CountVectorizer features and report AUC and accuracy for each setting.
print("Logistic Regression with CountVectorizer\n")
C = [ 1, 5, 10, 20, 100, 1000]
for value in C:
    model = LogisticRegression(C=value, solver='lbfgs')
    model.fit(X_train_vect, y_train)
    y_predicted = model.predict(X_test_vect)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With C set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

In [None]:
# Sweep the inverse regularisation strength C of logistic regression on the
# TF-IDF features and report AUC and accuracy for each setting.
print("Logistic Regression with Tfid Vectorizer\n")
C = [ 1, 5, 10, 20, 100, 1000]
for value in C:
    model = LogisticRegression(C=value, solver='lbfgs')
    model.fit(X_train_vect_TFID, y_train)
    y_predicted = model.predict(X_test_vect_TFID)
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With C set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")

#### Ridge Classifier

In [None]:
# Sweep RidgeClassifier's regularisation strength (alpha) on the TF-IDF
# features and report AUC and accuracy on the held-out split for each setting.
print("Ridge with Tfid Vectorizer\n")
# The list keeps the name `C` used by the original cell; its values are passed
# to RidgeClassifier as `alpha`.
C = [ 1, 5, 10, 20, 100, 1000]
for value in C:
    model = RidgeClassifier(alpha = value).fit(X_train_vect_TFID, y_train)
    y_predicted = model.predict(X_test_vect_TFID)
    # These two assignments were swapped before, so the accuracy was printed as
    # "AUC score" and the AUC as "Accuracy Score".
    score = multiclass_roc_auc_score(y_test, y_predicted)
    acc_score = accuracy_score(y_test, y_predicted)
    print(f"With alpha set to {value}, AUC score of model is {score} and Accuracy Score is {acc_score}\n")