In [None]:
# Install the tweet-cleaning dependency imported below as `preprocessor`.
# NOTE(review): the `preprocessor` module used in this notebook (clean,
# set_options, OPT) is presumably the one shipped by tweet-preprocessor;
# confirm the separate `preprocessor` package is actually required.
!pip install preprocessor
!pip install tweet-preprocessor

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Data Cleaning
import nltk
from nltk.tokenize import TweetTokenizer

import preprocessor as tweet_preprocess
import re
import string

def _ensure_nltk_resource(resource_path, package):
    """Download the NLTK data `package` only when `resource_path` is missing."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package, quiet=True)


def download_necessary_functions():
    """Make the NLTK data available and initialise the text-cleaning globals.

    Sets these module-level names, which the cleaning helpers read:
      * stopwords       -- NLTK English stop-word list
      * wn              -- WordNet lemmatizer instance
      * wcorp           -- set of words from the NLTK `words` corpus
      * tweet_tokenizer -- TweetTokenizer instance
    It also configures `tweet_preprocess` to strip URLs, mentions and numbers.
    """
    global stopwords, wn, wcorp, tweet_tokenizer
    # Without this guard a fresh environment fails with LookupError on the
    # first corpus access below.
    for resource_path, package in (('corpora/stopwords', 'stopwords'),
                                   ('corpora/wordnet', 'wordnet'),
                                   ('corpora/words', 'words')):
        _ensure_nltk_resource(resource_path, package)

    stopwords = nltk.corpus.stopwords.words('english')
    wn = nltk.WordNetLemmatizer()
    wcorp = set(nltk.corpus.words.words())
    tweet_preprocess.set_options(
        tweet_preprocess.OPT.URL,
        tweet_preprocess.OPT.MENTION,
        tweet_preprocess.OPT.NUMBER)
    tweet_tokenizer = TweetTokenizer()

def hashtag_segmentation(word):
    """Split a token into pieces at its capital letters.

    If the token starts with '#', every '#' in it is removed first.
    Returns a list of pieces (empty when nothing is left).
    """
    stripped = word.replace('#', '') if word.startswith('#') else word
    # Pass 1: put a space in front of every run of capital letters.
    spaced_caps = re.sub('([A-Z]+)', r' \1', stripped)
    # Pass 2: put a space in front of every capital followed by lower-case
    # letters, which separates an acronym from the word that follows it.
    spaced_words = re.sub('([A-Z][a-z]+)', r' \1', spaced_caps)
    return spaced_words.split()

def remove_number(word):
    """Return '' if the token contains any digit, otherwise the token itself."""
    for character in word:
        if character.isdigit():
            return ''
    return word

def lemmatize_english_words(word):
    """Lemmatize the token with `wn` when it is in `wcorp`; otherwise return it unchanged.

    Reads the globals `wcorp` and `wn` set by download_necessary_functions().
    """
    return wn.lemmatize(word) if word in wcorp else word

def remove_short_words(word):
    """Drop tokens of three characters or fewer unless they contain a symbol.

    Returns the token unchanged when it is longer than three characters or
    contains one of the listed special characters; otherwise returns ''.
    """
    if len(word) > 3:
        return word

    # Raw string: the pattern contains `\|`, which is an invalid escape
    # sequence in a normal string literal. The compiled regex is unchanged.
    if re.search(r'[@_!#$%^&*()<>?/\|}{~:]', word) is not None:
        return word
    return ''

def split_by_dot(word):
    """Split the token on every '.' and return the list of parts."""
    parts = word.split(".")
    return parts

def clean_text(text):
    """Turn one raw tweet into a list of cleaned, lower-cased tokens.

    Steps: strip URLs/mentions/numbers via `tweet_preprocess`, tokenize,
    split tokens at capitals and at dots, blank out tokens containing
    digits, lemmatize, drop short words, stop words and punctuation, then
    lower-case what remains.

    Reads the globals created by download_necessary_functions(). Returns an
    empty list when nothing survives cleaning.
    """
    text = tweet_preprocess.clean(text)
    tokens = tweet_tokenizer.tokenize(text)
    # Flatten with comprehensions rather than np.hstack: hstack raises
    # ValueError when there are no tokens at all (e.g. a tweet that was
    # only a URL).
    tokens = [piece for word in tokens for piece in hashtag_segmentation(word)]
    tokens = [piece for word in tokens for piece in split_by_dot(word)]
    tokens = [remove_number(word) for word in tokens]
    tokens = [lemmatize_english_words(word) for word in tokens]
    tokens = [remove_short_words(word) for word in tokens]
    # Tokens are only lower-cased at the very end, so compare in lower case
    # here; otherwise capitalised stop words such as "The" slip through.
    tokens = [word for word in tokens if word.lower() not in stopwords]
    tokens = [word for word in tokens if word not in string.punctuation]
    tokens = [word for word in tokens if not word.isdigit()]
    return [word.lower() for word in tokens if word]

In [None]:
# Training & Testing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, multilabel_confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
import scikitplot as skplt

def set_train_test(xtrain, xtest, ytrain, ytest):
    """Publish a train/test split as the globals X_train, X_test, y_train, y_test."""
    global X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = xtrain, xtest, ytrain, ytest


def train_test_model(classifier):
    """Cross-validate, fit and predict with a freshly built estimator.

    `classifier` is called with no arguments to create the estimator.
    Prints the 5-fold accuracy scores on the global X_train / y_train, fits
    on the same data, stores the predictions for the global X_test in the
    global `y_pred`, and returns the fitted model.
    """
    global y_pred
    folds = KFold(n_splits=5)
    estimator = classifier()
    fold_scores = cross_val_score(
        estimator, X_train, y_train,
        cv=folds, scoring='accuracy', n_jobs=-1)
    print(fold_scores)

    model = estimator.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model


def get_model_performance():
    """Score the global `y_pred` against the global `y_test`.

    Returns (accuracy, confusion matrix, precision, recall), with the three
    scalar scores rounded to 3 decimals.
    """
    n_correct = (y_pred == y_test).sum()
    accuracy = round(n_correct / len(y_pred), 3)
    cf_matrix = confusion_matrix(y_test, y_pred)
    # Precision and recall are computed with class 0 as the positive label.
    # NOTE(review): make_confusion_matrix labels class 1 as "positive";
    # confirm pos_label=0 is intended.
    binary_kwargs = {'pos_label': 0, 'average': 'binary'}
    precision = round(precision_score(y_test, y_pred, **binary_kwargs), 3)
    recall = round(recall_score(y_test, y_pred, **binary_kwargs), 3)
    return accuracy, cf_matrix, precision, recall

In [None]:
# Converts tokens into vectors
def convert_to_tfidf_vector(df):
    """Vectorise the 'merge' column of `df` with TF-IDF.

    Uses clean_text as the analyzer, so each document is tokenized and
    cleaned by it. Returns a dense DataFrame with one row per document and
    one integer-named column per vocabulary term.
    """
    tfidf_vec = TfidfVectorizer(analyzer=clean_text)
    # fit_transform already fits the vocabulary; a separate fit() beforehand
    # would run the expensive clean_text pass over the corpus a second time.
    xtfidf = tfidf_vec.fit_transform(df['merge'])
    return pd.DataFrame(xtfidf.toarray())

In [None]:
# Load Dataset
# NOTE(review): latin-1 never fails to decode; confirm the CSVs are not
# actually UTF-8, in which case non-ASCII characters are being garbled.
# Drop unnecessary columns/features ('keyword') right after loading.
train = pd.read_csv('../input/nlp-getting-started/train.csv',
                    encoding='latin-1').drop(columns=['keyword'])
test = pd.read_csv('../input/nlp-getting-started/test.csv',
                   encoding='latin-1').drop(columns=['keyword'])

# Stack train and test so both are vectorised with one vocabulary.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([train, test], ignore_index=True)

In [None]:
# Feature Engineering using TF-IDF
# Initialise the tokenizer / stop-word / lemmatizer globals that clean_text reads.
download_necessary_functions()
# convert_to_tfidf_vector reads the 'merge' column, so expose the tweet text under that name.
data['merge'] = data['text'] 
X_data = convert_to_tfidf_vector(data)
data_vec = pd.DataFrame(X_data)
# Keep the label next to the features; later cells split the rows on whether
# 'target' is present (notna -> training rows, isna -> rows to predict).
data_vec['target'] = data['target'] 


In [None]:
# Separating training data
# Keep only the rows that have a label, then split off the feature columns.
train_vec = data_vec[data_vec['target'].notna()]
X_train_vec = train_vec.drop(columns=['target'])
X_train_vec

In [None]:
# Divide training data into training and testing
# A fixed random_state makes the split (and every metric below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_vec, train_vec['target'], test_size=0.2, random_state=139)

# Handle Data imbalance using SMOTE Tomek
from imblearn.combine import SMOTETomek
SMOTE = SMOTETomek(random_state=139)
# Resample the training split only. The held-out split must keep its real
# class distribution and contain no synthetic rows, otherwise the reported
# accuracy/precision/recall do not reflect performance on real data.
X_train, y_train = SMOTE.fit_resample(X_train, y_train)

In [None]:
# LOGISTIC REGRESSION
# Model Building, Cross Validation and Prediction
# Publish the split as the globals that train_test_model and
# get_model_performance read.
set_train_test(X_train, X_test, y_train, y_test)
# train_test_model calls its argument with no arguments to build the
# estimator, hence the lambda wrapper.
model = train_test_model(lambda: LogisticRegression(solver='lbfgs',max_iter=500))

In [None]:
# Check Model Performance
# get_model_performance scores the global y_pred (set by train_test_model)
# against the global y_test (set by set_train_test).
accuracy, cf_matrix, precision, recall = get_model_performance()
print('Accuracy: {} Precision: {} Recall: {}'.format(accuracy, precision, recall))

In [None]:
# Predict data based on test data
# Rows without a target are the unlabelled rows to predict.
test_vec = data_vec[data_vec['target'].isna()]
X_test_vec = test_vec.drop(columns=['target'])

y_test_pred = model.predict(X_test_vec)
tmp = data[data['target'].isna()]
test_res = pd.DataFrame(tmp['id'], columns=['id'])
# 'target' became a float column once the unlabelled (NaN) rows were stacked
# in, so the predicted labels are 0.0/1.0; write them as integer 0/1.
test_res['target'] = y_test_pred.astype(int)
# index=False: do not write the DataFrame index as an extra unnamed column,
# so the file contains only the id and target columns.
test_res.to_csv(r'./Submission.csv', index=False)

In [None]:
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

def IsDataBalance(ydata, title):
    """Show a bar chart of how many rows of `ydata` have target 0 and target 1.

    `ydata` must have a 'target' column; `title` is used as the plot title.
    """
    rows = [[len(ydata[ydata['target'] == label]), label] for label in (0, 1)]
    df = pd.DataFrame(rows, columns=['count', 'target'])

    axes = sns.barplot(x='target', y='count', data=df, palette='YlGn')
    axes.set(title=title)
    plt.show()
    
def make_confusion_matrix(cf_matrix):
    """Draw a 2x2 confusion matrix as an annotated heatmap.

    Each cell is annotated with its name, its count, and its share of all
    samples. Rows are the true class and columns the predicted class, with
    class 0 shown as "negative" and class 1 as "positive".
    """
    group_names = ['True Negative','False Positive','False Negative','True Positive']
    categories = ['0', '1']
    flat = cf_matrix.flatten()
    group_counts = ["{0:0.0f}".format(value) for value in flat]
    group_percentages = ["{0:.2%}".format(value) for value in
                         flat/np.sum(cf_matrix)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
              zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    # fmt='' because the annotations are already formatted strings.
    ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='YlGn')
    ax.set_xticklabels(categories)
    ax.set_yticklabels(categories)
    ax.set(ylabel="True", xlabel="Predicted")
    ax.set(title="Confusion Matrix")

make_confusion_matrix(cf_matrix)


In [None]:
# Plot the class balance of the training labels, then of the testing labels.
for labels, title in ((y_train, "Training Data"), (y_test, "Testing Data")):
    ydata = pd.DataFrame(labels, columns=['target'])
    IsDataBalance(ydata, title)