In [2]:
import re
import pandas as pd
import pickle
from pprint import pprint
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn import neighbors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
train_data = pd.read_csv('offenseval-training-v1.tsv', delimiter='\t')

In [3]:
train_data.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [5]:
tweets = train_data[["tweet"]]
subtask_a_labels = train_data[["subtask_a"]]
subtask_b_labels = train_data.query("subtask_a == 'OFF'")[["subtask_b"]]
subtask_c_labels = train_data.query("subtask_b == 'TIN'")[["subtask_c"]]

clean_tweets = copy.deepcopy(tweets)

# Preprocessing tweets of our training data

In [6]:
import re
import nltk
nltk.download('punkt', 'stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to stopwords...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [7]:

def take_data_to_shower(tweet):
    noises = ['URL', '@USER', '\'ve', 'n\'t', '\'s', '\'m']

    for noise in noises:
        tweet = tweet.replace(noise, '')

    return re.sub(r'[^a-zA-Z]', ' ', tweet)


def tokenize(tweet):
    lower_tweet = tweet.lower()
    return word_tokenize(lower_tweet)


def remove_stop_words(tokens):
    clean_tokens = []
    stopWords = set(stopwords.words('english'))
    for token in tokens:
        if token not in stopWords:
            if token.replace(' ', '') != '':
                if len(token) > 1:
                    clean_tokens.append(token)
    return clean_tokens


def stem_and_lem(tokens):
    clean_tokens = []
    for token in tokens:
        token = wordnet_lemmatizer.lemmatize(token)
        token = lancaster_stemmer.stem(token)
        if len(token) > 1:
            clean_tokens.append(token)
    return clean_tokens

In [8]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srini\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srini\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\srini\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:

tqdm.pandas(desc="Cleaning Data Phase I...")
clean_tweets['tweet'] = tweets['tweet'].progress_apply(take_data_to_shower)

tqdm.pandas(desc="Tokenizing Data...")
clean_tweets['tokens'] = clean_tweets['tweet'].progress_apply(tokenize)

tqdm.pandas(desc="Cleaning Data Phase II...")
clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(remove_stop_words)

tqdm.pandas(desc="Stemming And Lemmatizing")
clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(stem_and_lem)

text_vector = clean_tweets['tokens'].tolist()

Cleaning Data Phase I...: 100%|██████████| 13240/13240 [00:00<00:00, 93863.07it/s]
Tokenizing Data...: 100%|██████████| 13240/13240 [00:01<00:00, 7967.89it/s]
Cleaning Data Phase II...: 100%|██████████| 13240/13240 [00:03<00:00, 4393.81it/s]
Stemming And Lemmatizing: 100%|██████████| 13240/13240 [00:04<00:00, 2923.31it/s]


In [26]:
text_vector

[['ask', 'nat', 'am', 'tak'],
 ['go', 'hom', 'drunk', 'mag', 'trump'],
 ['amazon',
  'investig',
  'chines',
  'employ',
  'sel',
  'intern',
  'dat',
  'third',
  'party',
  'sel',
  'look',
  'edg',
  'competit',
  'marketplac',
  'amazon',
  'mag',
  'kag',
  'chin',
  'tcot'],
 ['someon', 'shouldtak', 'piec', 'shit', 'volcano'],
 ['obam', 'want', 'lib', 'amp', 'illeg', 'mov', 'red', 'stat'],
 ['lib', 'kookoo'],
 ['oh', 'no', 'tough', 'shit'],
 ['lit',
  'talk',
  'lol',
  'mass',
  'shoot',
  'lik',
  'set',
  'up',
  'propagand',
  'us',
  'divid',
  'maj',
  'issu',
  'lik',
  'gun',
  'control',
  'ter'],
 ['buy', 'icecream'],
 ['canad',
  'nee',
  'anoth',
  'cuck',
  'already',
  'enough',
  'looneyleft',
  'lib',
  'king',
  'gre',
  'country',
  'qproofs',
  'trudeaumustgo'],
 ['fault', 'support', 'gun', 'control'],
 ['diff',
  'kavanaugh',
  'on',
  'men',
  'admit',
  'grop',
  'year',
  'old',
  'girl',
  'year',
  'ago',
  'going',
  'confirm',
  'scj',
  'demsarefraud',

# Feature Extraction  ||  Using Tfidf as Feautes .

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfid(text_vector):
    vectorizer = TfidfVectorizer()
    untokenized_data =[' '.join(tweet) for tweet in tqdm(text_vector, "Vectorizing...")]
    vectorizer = vectorizer.fit(untokenized_data)
    vectors = vectorizer.transform(untokenized_data).toarray()
    return vectors
  
def get_vectors(vectors, labels, keyword):
    if len(vectors) != len(labels):
        print("Unmatching sizes!")
        return
    result = list()
    for vector, label in zip(vectors, labels):
        if label == keyword:
            result.append(vector)
    return result

In [11]:

vectors_a = tfid(text_vector) # Numerical Vectors A
labels_a = subtask_a_labels['subtask_a'].values.tolist() # Subtask A Labels


Vectorizing...: 100%|██████████| 13240/13240 [00:00<00:00, 1016596.22it/s]


In [68]:
print (len(vectors_a))
print(len(labels_a))

print(type(vectors_a))
print(vectors_a)


print (len(labels_a))
print(len(labels_a))

print(type(labels_a))
print(labels_a)

13240
13240
<class 'numpy.ndarray'>
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
13240
13240
<class 'list'>
['OFF', 'OFF', 'NOT', 'OFF', 'NOT', 'OFF', 'OFF', 'OFF', 'NOT', 'OFF', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'OFF', 'OFF', 'NOT', 'OFF', 'OFF', 'NOT', 'OFF', 'NOT', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'NOT', 'OFF', 'OFF', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'OFF', 'NOT', 'OFF', 'OFF', 'NOT', 'OFF', 'NOT', 'OFF', 'OFF', 'NOT', 'NOT', 'OFF', 'NOT', 'OFF', 'OFF', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'NOT', 'NOT', 'OFF', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'OFF', 'OFF', 'NOT', 'NOT', 'NOT', 'OF

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

def classify(vectors, labels, type="DT"):
    # Random Splitting With Ratio 3 : 1
    train_vectors, test_vectors, train_labels, test_labels = train_test_split(vectors, labels, test_size=0.25)

    # Initialize Model
    classifier = None
    if(type=="MNB"):
        classifier = MultinomialNB(alpha=0.7)
        classifier.fit(train_vectors, train_labels)
    elif(type=="KNN"):
        classifier = KNeighborsClassifier(n_jobs=4)
        params = {'n_neighbors': [3,5,7,9], 'weights':['uniform', 'distance']}
        classifier = GridSearchCV(classifier, params, cv=3, n_jobs=4)
        classifier.fit(train_vectors, train_labels)
        classifier = classifier.best_estimator_
    elif(type=="SVM"):
        classifier = SVC()
        classifier = GridSearchCV(classifier, {'C':[0.001, 0.01, 0.1, 1, 10]}, cv=3, n_jobs=4)
        classifier.fit(train_vectors, train_labels)
        classifier = classifier.best_estimator_
    elif(type=="DT"):
        classifier = DecisionTreeClassifier(max_depth=800, min_samples_split=5)
        params = {'criterion':['gini','entropy']}
        classifier = GridSearchCV(classifier, params, cv=3, n_jobs=4)
        classifier.fit(train_vectors, train_labels)
        classifier = classifier.best_estimator_
    elif(type=="RF"):
        classifier = RandomForestClassifier(max_depth=800, min_samples_split=5)
        params = {'n_estimators': [n for n in range(50,200,50)], 'criterion':['gini','entropy'], }
        classifier = GridSearchCV(classifier, params, cv=3, n_jobs=4)
        classifier.fit(train_vectors, train_labels)
        classifier = classifier.best_estimator_
    elif(type=="LR"):
        classifier = LogisticRegression(multi_class='auto', solver='newton-cg',)
        classifier = GridSearchCV(classifier, {"C":np.logspace(-3,3,7), "penalty":["l2"]}, cv=3, n_jobs=4)
        classifier.fit(train_vectors, train_labels)
        classifier = classifier.best_estimator_
    else:
        print("Wrong Classifier Type!")
        return

    accuracy = accuracy_score(train_labels, classifier.predict(train_vectors))
    print("Training Accuracy:", accuracy)
    test_predictions = classifier.predict(test_vectors)
    accuracy = accuracy_score(test_labels, test_predictions)
    print("Test Accuracy:", accuracy)
    print("Confusion Matrix:", )
    print(confusion_matrix(test_labels, test_predictions))

In [33]:
# print(type(abc))
# print(type(labels_a))
# abc.shape

# Model Fitting 

In [37]:

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# SVM Classifier 

In [13]:
print("\nBuilding Model Subtask A...")
classify(vectors_a[1000:2000], labels_a[1000:2000], "SVM") # {MNB, KNN, SVM, DT, RF, LR}


Building Model Subtask A...
Training Accuracy: 0.66
Test Accuracy: 0.664
Confusion Matrix:
[[166   0]
 [ 84   0]]


# Multinomial Naive Baiyes

In [14]:
print("\nBuilding Model Subtask A...")
classify(vectors_a[1000:2000], labels_a[1000:2000], "MNB") 


Building Model Subtask A...
Training Accuracy: 0.8106666666666666
Test Accuracy: 0.716
Confusion Matrix:
[[172   1]
 [ 70   7]]


# Logistic Regression Classifier 

In [16]:
print("\nBuilding Model Subtask A...")
classify(vectors_a[1000:2000], labels_a[1000:2000], "LR") 


Building Model Subtask A...
Training Accuracy: 0.9986666666666667
Test Accuracy: 0.696
Confusion Matrix:
[[139  20]
 [ 56  35]]


# Decision Tree 

In [17]:
print("\nBuilding Model Subtask A...")
classify(vectors_a[1000:2000], labels_a[1000:2000], "DT") 


Building Model Subtask A...
Training Accuracy: 0.992
Test Accuracy: 0.688
Confusion Matrix:
[[137  30]
 [ 48  35]]


# Random Forest

In [18]:
print("\nBuilding Model Subtask A...")
classify(vectors_a[1000:2000], labels_a[1000:2000], "RF") 


Building Model Subtask A...
Training Accuracy: 1.0
Test Accuracy: 0.732
Confusion Matrix:
[[159   3]
 [ 64  24]]
