In [166]:
import re

import contractions
import emoji
import en_core_web_sm
import numpy as np
import pandas as pd
import unidecode
from gensim.models import KeyedVectors, Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn import svm, tree
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from spellchecker import SpellChecker

spell = SpellChecker()

nlp = en_core_web_sm.load()


In [2]:
deselect_stop_words = ['no', 'not']

for w in deselect_stop_words: nlp.vocab[w].is_stop = False

In [3]:
URL_REGEX = re.compile(r"(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?", re.UNICODE)
TAGS_REGEX = re.compile(r"(@[A-Za-z0-9]+)", re.UNICODE)

def cleanse_tags_mentions(text):
    return TAGS_REGEX.sub(r" ", text)

def cleanse_links(text):
    return URL_REGEX.sub(r' ',text)

def remove_accented_chars(text):
    # café -> cafe
    return unidecode.unidecode(text)

def expand_contractions(text):
    # don't -> do not
    return contractions.fix(text)

def remove_excess_whitespace(text):
    # '  ' -> ' '
    return " ".join(text.strip().split())

def extract_emojis(sentence):
    return [char for char in sentence if str(char.encode('unicode-escape'))[2] == '\\' ]

def char_is_emoji(character):
    return character in emoji.UNICODE_EMOJI

def spell_correct(text):
    text = text.split()
    wrong = spell.unknown(text)
    return ' '.join(spell.correction(word) if word in wrong and word.isalpha() and not char_is_emoji(word) else word for word in text)

def remove_stop_words(text):
    return ' '.join(word for word in text.split() if not nlp.vocab[word].is_stop)

In [4]:
f1 = ['Final_Dataset_Facebook.csv', 'Final_Dataset_Facebook_Labels.csv']
f2 = ['Final_Dataset_Twitter.csv', 'Final_Dataset_Twitter_Labels.csv']

d11, d12 = pd.read_csv(f1[0]), pd.read_csv(f1[1])  # encoding="ISO-8859-1"
d21, d22 = pd.read_csv(f1[0]), pd.read_csv(f1[1])

df1 = pd.concat([d11, d12], axis=1)
df2 = pd.concat([d21, d22], axis=1)

df = pd.concat([df1, df2])

df.drop(columns=['Unnamed: 1'], inplace=True)

df.columns = ['tweet', 'label']

df

In [5]:
sanitized = []

for tweet in df['tweet']:
    tweet = cleanse_links(tweet)
    tweet = cleanse_tags_mentions(tweet)
    tweet = expand_contractions(tweet)
    tweet = spell_correct(tweet)
    tweet = remove_stop_words(tweet)
    tweet = remove_excess_whitespace(tweet)
    sanitized.append(tweet)

df['sanitized'] = sanitized

df

df.to_csv('cleaned_data.csv', index=False)

In [6]:
# df = pd.read_csv('cleaned_data.csv')
df.dropna(inplace=True)
df

Unnamed: 0,tweet,label,sanitized
0,Sorry No Comments ..I came here to read commen...,1,Sorry Comments ..I came read comments ..😜
1,Hi guys. I'm so happy and proud of myself and ...,1,Hi guys. happy proud thought share you!!! Toda...
2,Hahahaha your intelligence 😜😜😜,1,Hahahaha intelligence 😜😜😜
3,Aqsa Naveed we were proud backbenchers 😜😜😂,1,Aqsa Naveed proud backbenchers 😜😜😂
4,Hemant extraordinary sketcher😝..right Ujjawal??,1,Hemant extraordinary sketcher😝..right Ujjawal??
...,...,...,...
10937,fuck ps3 go get a xbox 360☺,0,fuck ps3 box 360☺
10938,Happy Bithday Windows 7.. I ♥ Windows 7,0,Happy Bithday Windows 7.. ♥ Windows 7
10939,I can't wait I'm soo excited! 😀😀😀,0,not wait soo excited! 😀😀😀
10940,"windows live is very good, I use it more than ...",0,"windows live good, use others. storage nice to..."


In [80]:
data = [word_tokenize(tweet) for tweet in df.sanitized.values]
emoji_list = [[emoji for emoji in extract_emojis(tweet) if char_is_emoji(emoji)] for tweet in df.sanitized.values]

model1 = Word2Vec(
    data, size=300, window=10, min_count=2, workers=10)
model1.train(data, total_examples=len(data), epochs=10)
X1 = [sum(model1[j] for j in i) for i in data]

model2 = KeyedVectors.load_word2vec_format(
		"emoji2vec.bin", binary=True)
X2 = []
for i in emoji_list:
	k = 0
	for j in i:
		try: k += model2[j]
		except: pass
	X2.append(k)
# X2 = [[model2[j] for j in i] for i in emoji_list]

X3 = np.array([x1 * x2 for x1, x2 in zip(X1, X2)])

In [169]:
X = X3
Y = df.label
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

In [177]:
clf1 = SGDClassifier(max_iter=1000, random_state=0, tol=1e-3)
clf1.fit(X_train, y_train)
print(clf1.score(X_test, y_test))

0.9771271729185728


In [178]:
clf2 =  AdaBoostClassifier(n_estimators=100)
clf2.fit(X_train, y_train)
print(clf2.score(X_test, y_test))

0.9890210430009149


In [179]:
clf3 = LogisticRegression(multi_class='multinomial')
clf3.fit(X_train, y_train)
print(clf3.score(X_test, y_test))

0.9835315645013724


In [180]:
eclf1 = VotingClassifier(estimators=[('sgd', clf1), ('ada', clf2), ('lr', clf3)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
print(eclf1.score(X_test, y_test))

0.9839890210430009


In [None]:
# from sklearn.model_selection import ShuffleSplit
# n_samples = X.shape[0]
# cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
# cross_val_score(clf, X, y, cv=cv)

In [229]:
data = [word_tokenize(tweet) for tweet in df.sanitized.values]
idx = [i for i, tokens in enumerate(data) if len(tokens) > 4]
data = [tokens for tokens in data if len(tokens) > 4]
data1 = [i[:len(i) // 2] for i in data]
data2 = [i[len(i) // 2:] for i in data]

model1 = Word2Vec(
    data, size=300, window=10, min_count=2, workers=10)
model1.train(data1, total_examples=len(data1), epochs=10)
X1 = [sum(model1[j] for j in i) for i in data1]

model2 = Word2Vec(
    data2, size=300, window=10, min_count=2, workers=10)
model1.train(data2, total_examples=len(data2), epochs=10)
X2 = [sum(model2[j] for j in i) for i in data2]

In [237]:
preds = []

for X in [X1, X2]:
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

    clf = LogisticRegression(multi_class='multinomial')
    clf.fit(X_train, y_train)
    preds.append(clf.predict(X_test))

print(preds)