In [1]:
import pandas as pd
import numpy as np
import copy
from tqdm import tqdm

In [2]:
print("reading data set....")
training_data_set = pd.read_csv("/Users/prajwalkrishn/Desktop/My_Computer/project - Dsci 601/Offensive_Tweet_Detection/Dataset/MOLID.csv")
print("Done reading....")


reading data set....
Done reading....


In [3]:
tweets = training_data_set[["tweet"]]
level_A_labels = training_data_set[["subtask_a"]]
level_B_labels = training_data_set.query("subtask_a == 'Offensive'")[["subtask_b"]]
level_C_labels = training_data_set.query("subtask_b == 'TIN'")[["subtask_c"]]

All_Cleaned_tweets = copy.deepcopy(tweets)

In [4]:
##Data Cleaning and Pre-Processing

In [5]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer,WordNetLemmatizer
lancaster = LancasterStemmer()
wordNet = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prajwalkrishn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prajwalkrishn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def remove_webTags_UserNames_Noise(tweet):
    things_to_be_removed_from_tweets = ['URL','@USER','\'ve','n\'t','\'s','\'m']
    
    for things in things_to_be_removed_from_tweets:
        tweet = tweet.replace(things,'')
    
    return re.sub(r'[^a-zA-Z]', ' ', tweet)

def tokenize(tweet):
    lower_cased_tweet = tweet.lower()
    return word_tokenize(lower_cased_tweet)

def stop_words_removal(tokens):
    cleaned_tokens = []
    stop = set(stopwords.words('english'))
    for token in tokens:
        if token not in stop:
            if token.replace(' ','') != '':
                if len(token) > 1:
                    cleaned_tokens.append(token)
    return cleaned_tokens

def stemming(tokens):
    cleaned_tokens = []
    for token in tokens:
        token = lancaster.stem(token)
        if len(token) > 1:
            cleaned_tokens.append(token)
    return cleaned_tokens

def lemmatization(tokens):
    cleaned_tokens = []
    for token in tokens:
        token = wordNet.lemmatize(token)
        if len(token) > 1:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [7]:
tqdm.pandas(desc = "clean...")
All_Cleaned_tweets['tweet'] = tweets['tweet'].progress_apply(remove_webTags_UserNames_Noise)

tqdm.pandas(desc="Tokenize..")
All_Cleaned_tweets['tokens'] = All_Cleaned_tweets['tweet'].progress_apply(tokenize)

tqdm.pandas(desc="remove STOPWORDS...")
All_Cleaned_tweets['tokens'] = All_Cleaned_tweets['tokens'].progress_apply(stop_words_removal)

tqdm.pandas(desc="Stemming...")
All_Cleaned_tweets['tokens'] = All_Cleaned_tweets['tokens'].progress_apply(stemming)

tqdm.pandas(desc="Lemmatize...")
All_Cleaned_tweets['tokens'] = All_Cleaned_tweets['tokens'].progress_apply(lemmatization)

text_vector = All_Cleaned_tweets['tokens'].tolist()

  from pandas import Panel
clean...: 100%|██████████| 2499/2499 [00:00<00:00, 134651.80it/s]
Tokenize..: 100%|██████████| 2499/2499 [00:00<00:00, 10317.36it/s]
remove STOPWORDS...: 100%|██████████| 2499/2499 [00:00<00:00, 11886.92it/s]
Stemming...: 100%|██████████| 2499/2499 [00:00<00:00, 11870.32it/s]
Lemmatize...: 100%|██████████| 2499/2499 [00:00<00:00, 2566.42it/s]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfid(text_vector):
    vectorizer = TfidfVectorizer()
    untokenized_data =[' '.join(tweet) for tweet in tqdm(text_vector, "Vectorizing...")]
    vectorizer = vectorizer.fit(untokenized_data)
    vectors = vectorizer.transform(untokenized_data).toarray()
    return vectors
  
def get_vectors(vectors, labels, keyword):
    if len(vectors) != len(labels):
        print("Unmatching sizes!")
        return
    result = list()
    for vector, label in zip(vectors, labels):
        if label == keyword:
            result.append(vector)
    return result

In [9]:
vectors_level_a = tfid(text_vector) # Numerical Vectors A
labels_level_a = level_A_labels['subtask_a'].values.tolist() # Subtask A Labels

vectors_level_b = get_vectors(vectors_level_a, labels_level_a, "Offensive") # Numerical Vectors B
labels_level_b = level_B_labels['subtask_b'].values.tolist() # Subtask B Labels

vectors_level_c = get_vectors(vectors_level_b, labels_level_b, "TIN") # Numerical Vectors C
labels_level_c = level_C_labels['subtask_c'].values.tolist() # Subtask C Labels

Vectorizing...: 100%|██████████| 2499/2499 [00:00<00:00, 577146.95it/s]


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import warnings
# print("splitting and fitting on level A annotations....")
# train_vectors, test_vectors, train_labels, test_labels = train_test_split(vectors_level_a[:], labels_level_a[:], train_size=0.70)
# print("split done...")

train_vectors_b, test_vectors_b, train_labels_b, test_labels_b = train_test_split(vectors_level_c[:], labels_level_c[:], train_size=0.70)

print("fit begins...")
warnings.filterwarnings(action='ignore')
classifier = DecisionTreeClassifier(max_depth=800, min_samples_split=5)
params = {'criterion':['gini','entropy']}
classifier = GridSearchCV(classifier, params, cv=3, n_jobs=4)
classifier.fit(train_vectors_b, train_labels_b)
classifier = classifier.best_estimator_
print("fit complete....")

print("calculating accuracy....")
accuracy = accuracy_score(train_labels_b, classifier.predict(train_vectors_b))
print("Training Accuracy:", accuracy)
test_predictions = classifier.predict(test_vectors_b)
accuracy = accuracy_score(test_labels_b, test_predictions)
print("Test Accuracy:", accuracy)
print("Confusion Matrix:", )
print(confusion_matrix(test_labels_b, test_predictions))
print(classification_report(test_labels_b,test_predictions))

fit begins...
fit complete....
calculating accuracy....
Training Accuracy: 0.9536585365853658
Test Accuracy: 0.6497175141242938
Confusion Matrix:
[[ 10  25   3]
 [ 17 102   3]
 [  3  11   3]]
              precision    recall  f1-score   support

         GRP       0.33      0.26      0.29        38
         IND       0.74      0.84      0.78       122
         OTH       0.33      0.18      0.23        17

    accuracy                           0.65       177
   macro avg       0.47      0.43      0.44       177
weighted avg       0.61      0.65      0.63       177

