In [67]:
import json, re
import nltk
import numpy as np
from sklearn.metrics import accuracy_score
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

filenames = [
    'charliehebdo',
    'ferguson',
    'illary',
    'prince-toronto',
    'sydneysiege',
    'ebola-essien',
    'germanwings-crash',
    'ottawashooting',
    'putinmissing'
]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emiljoswin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
def load_data_from_file(name):
    filename = 'parsed_files/' + name + '.json'
    # print(filename)

    try:
        with open(filename) as f:
            tweets = json.load(f)
    except Exception as e:
        print("Exception")
        print(e.message)

    return tweets

In [69]:
repliesText = []
reply_labels = []
corpus = []

def prepare_data(filenames):
    for name in filenames:
        t = load_data_from_file(name)

        for i in range(len(t[name])):
            thread = t[name][i]
            reply_list = thread['replies']
            for reply in reply_list:
                text = reply["text"]
                label = reply["label"]
                repliesText.append(text)
                reply_labels.append(label)
        # print (reply_labels)

    # Data Cleaning
    ps = PorterStemmer()
    for oneReply in repliesText:
        cleanText = re.sub('@(\S)+', ' ', oneReply)
        cleanText = re.sub('http(\S)+', ' ', cleanText)
        cleanText = re.sub('[^a-zA-Z]', ' ', cleanText) #TODO - What is this? Check with individual tweets
        # print(cleanText)
        cleanText = cleanText.lower()

        splitCleanText = cleanText.split()
        splitCleanText = [word for word in splitCleanText if not word in set(stopwords.words('english'))] # TODO

        str = ""
        for oneWord in splitCleanText:
            oneWord = ps.stem(oneWord) # TODO - stemming is a too aggressive here
            str += (' '+ oneWord)
            # str = ' '.join(oneWord)

        corpus.append(str)
    #print(corpus)

prepare_data(filenames)

  cleanText = re.sub('@(\S)+', ' ', oneReply)
  cleanText = re.sub('http(\S)+', ' ', cleanText)


In [70]:
# print(corpus)


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

cv = TfidfVectorizer(analyzer='word', ngram_range=(1,4), min_df = 0, stop_words = 'english', max_features= 500)
# TODO - this uses English stopwords already
# TODO - ngram_range (1, 3) 1, 2 and 3 grams are used.
# TODO - min_df => Ignore terms that have document frequency lower that 0.
# TODO - max_features =>  build a vocabulary that only consider the top max_features 
#                         ordered by term frequency across the corpus.

X = cv.fit_transform(corpus).toarray()
# TODO - By this point the tf-idf vectorization is already complete. Shouldn't we do this after SMOTE?
y = reply_labels



In [71]:
# from collections import Counter
# from imblearn.over_sampling import SMOTE # NOTE conda install -c glemaitre imbalanced-learn

# # TODO - do not do SMOTE on the test/validation sets
# print("Before SMOTE")
# print(sorted(Counter(y).items()))

# from collections import Counter
# from imblearn.over_sampling import SMOTE
# smote_enn = SMOTE(random_state=0)
# X_resampled, y_resampled = smote_enn.fit_sample(X, y) # TODO - fit_resample(X,y) was not working.

# print("AFTER SMOTE")
# print(sorted(Counter(y_resampled).items()))

In [78]:

# step : cross-validation
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.40, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=0)

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
smote_enn = SMOTE(random_state=2)
X_smote, y_smote = smote_enn.fit_sample(X_train, y_train)


#step: Classifier
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_smote, y_smote)
# clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)



from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_predict)
print(cm)
#print(len(reply_labels)
recall = np.diag(cm) / np.sum(cm, axis=1)
precision = np.diag(cm)/ np.sum(cm, axis=0)

f1_score = 2* (precision*recall)/(precision + recall + 1)
acc = accuracy_score(y_test, y_predict)
print('precision = ',precision)
print('recall = ', recall)
print('f1 score = ',f1_score)
print('RF accuracy = ',acc)

print("\n")

[[905 148 177 168]
 [ 87  35  26  23]
 [105  14  37  17]
 [149  24  27  77]]
precision =  [0.72632424 0.15837104 0.13857678 0.27017544]
recall =  [0.64735336 0.20467836 0.21387283 0.27797834]
f1 score =  [0.39616874 0.04756266 0.04382834 0.09702256]
RF accuracy =  0.5220406141654285




In [73]:
print(X.shape)

(5046, 500)
