In [2]:
import csv
import math
import re
from collections import Counter

import nltk
import numpy as np
import textblob
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


def get_feature_engineered_data():
    inp_filename = "./MigrationTweets.csv"
    out_filename = "./MigrationData.csv"

    keyword_list = ["refugeecrisis", "refugee", "refugeeswelcome", "referendum", "eu",
                    "syrian", "syria", "UN", "syrian refugee", "migrant", "attack", "europe", "children",
                    "state", "voters", "refugeesgr", "found", "invalid", "withrefugees", "germany",
                    "isis", "accept", "nearly", "migrant", "immigr"]
    with \
            open(inp_filename,
                 'r+',
                 encoding="utf-8") as inp, \
            open(out_filename,
                 'w+',
                 encoding="utf-8") as out:
        reader = csv.DictReader(inp)
        my_fields = ['Text',
                     'Final_Migration_label',
                     'Similarity']

        writer = csv.DictWriter(out, fieldnames=my_fields)
        writer.writeheader()

        for row in reader:
            # append 1st feature
            sim = get_avg_cosine_similarity(row['tweet_text'], keyword_list)
            writer.writerow({'Text': row['tweet_text'],
                             'Final_Migration_label': row['migration_relevance'],
                             'Similarity': sim
                             })

        return out_filename


def preprocess_tweets(data):
    snow = nltk.stem.SnowballStemmer('english')
    data['Text'].dropna(inplace=True)
    for index, sentence in enumerate(data['Text']):
        # removing html links
        sentence = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', str(sentence))
        # Removing Punctuations
        sentence = re.sub(r'[?|!|\'|\’|"|#|@|_|:|“|”|-|"|-|-|<|>|{|}.|,|)|(|\|/]', r'', sentence)
        sentence = nltk.word_tokenize(sentence)
        # Stemming and removing stopwords
        words = [snow.stem(word) for word in sentence if word not in stopwords.words('english')]
        data.loc[index, 'Text'] = (" ".join(map(str, words)))
    X = data['Text']
    return data


def main():
    SEED = 4000
    outfile_name = get_feature_engineered_data()
    filename = "./" + outfile_name

    dataf = pd.read_csv(filename)
    df = preprocess_tweets(dataf)

    df['Final_Migration_label'] = df['Final_Migration_label'].map({'yes': 1, 'no': 0})

    Text = df['Text']
    Weight = df['Similarity'].fillna(0)
    label = df['Final_Migration_label']

    print(df.Final_Migration_label.value_counts())

    dataf = pd.DataFrame(np.vstack([Text, Weight]).T, columns=['Text', 'Weight'])

    df_train, df_test, y_train, y_test = train_test_split(dataf, label, random_state=SEED)

    # Either Use Count Vectorizer - gives better results
    vect = CountVectorizer()
    vect.fit(df_train.Text)
    X_text_train = vect.transform(df_train.Text)  # documents-terms matrix of training set
    X_text_test = vect.transform(df_test.Text)

    # Or use TFIDF n-grams
    '''tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2, 3), max_features=5000)
    tfidf_vect_ngram.fit(df_train.Text)
    X_text_train = tfidf_vect_ngram.transform(df_train.Text)
    X_text_test = tfidf_vect_ngram.transform(df_test.Text)'''

    X_Weight_train = np.atleast_2d(df_train.Weight.astype(float)).T
    X_Weight_test = np.atleast_2d(df_test.Weight.astype(float)).T

    X_train = sp.hstack((X_text_train, X_Weight_train))
    X_test = sp.hstack((X_text_test, X_Weight_test))

    print("------------- METRICS --------------")
    print("LOGISTIC REGRESSION")

    log_model = LogisticRegression()

    log_model = log_model.fit(X=X_train, y=y_train)

    y_pred = log_model.predict(X_test)

    sco = sklearn.metrics.accuracy_score(y_test, y_pred)

    balanced_sco = sklearn.metrics.balanced_accuracy_score(y_test, y_pred)

    f1score = sklearn.metrics.f1_score(y_test, y_pred)

    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=['YES', 'NO'],
                             columns=['predicted_YES', 'predicted_NO'])

    precision = sklearn.metrics.precision_score(y_test, y_pred)

    recall = sklearn.metrics.recall_score(y_test, y_pred)
    print("score :", sco)
    print("balanced accuracy :", balanced_sco)

    print("confusion matrix: \n", confusion)
    print("f1score: ", f1score)
    print("precision: ", precision)
    print("recall: ", recall)
    print("------------------------------------")
    print("Multinomial NB")
    from sklearn.naive_bayes import MultinomialNB

    gnb = MultinomialNB()

    gnb = gnb.fit(X=X_train, y=y_train)

    y_pred = gnb.predict(X_test)

    # sco = log_model.score( y_test , y_pred)

    sco = sklearn.metrics.accuracy_score(y_test, y_pred)

    balanced_sco = sklearn.metrics.balanced_accuracy_score(y_test, y_pred)

    f1score = sklearn.metrics.f1_score(y_test, y_pred)

    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=['YES', 'NO'],
                             columns=['predicted_YES', 'predicted_NO'])

    precision = sklearn.metrics.precision_score(y_test, y_pred)

    recall = sklearn.metrics.recall_score(y_test, y_pred)
    print("score :", sco)
    print("balanced accuracy :", balanced_sco)

    print("confusion matrix: \n", confusion)

    print("f1score: ", f1score)
    print("precision: ", precision)
    print("recall: ", recall)
    print("------------------------------------")

    print("DecisionTreeClassifier")
    from sklearn.tree import DecisionTreeClassifier

    dtc = DecisionTreeClassifier()

    dtc = dtc.fit(X=X_train, y=y_train)

    y_pred = dtc.predict(X_test)

    # sco = log_model.score( y_test , y_pred)

    sco = sklearn.metrics.accuracy_score(y_test, y_pred)

    balanced_sco = sklearn.metrics.balanced_accuracy_score(y_test, y_pred)

    f1score = sklearn.metrics.f1_score(y_test, y_pred)

    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=['YES', 'NO'],
                             columns=['predicted_YES', 'predicted_NO'])

    precision = sklearn.metrics.precision_score(y_test, y_pred)

    recall = sklearn.metrics.recall_score(y_test, y_pred)
    print("score :", sco)
    print("balanced accuracy :", balanced_sco)

    print("confusion matrix: \n", confusion)

    print("f1score: ", f1score)
    print("precision: ", precision)
    print("recall: ", recall)
    print("------------------------------------")

    print("RandomForestClassifier")
    from sklearn import svm

    clf_rf = RandomForestClassifier()

    clf_rf = clf_rf.fit(X=X_train, y=y_train)

    y_pred = clf_rf.predict(X_test)

    # sco = log_model.score( y_test , y_pred)

    sco = sklearn.metrics.accuracy_score(y_test, y_pred)

    balanced_sco = sklearn.metrics.balanced_accuracy_score(y_test, y_pred)

    f1score = sklearn.metrics.f1_score(y_test, y_pred)

    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=['YES', 'NO'],
                             columns=['predicted_YES', 'predicted_NO'])

    precision = sklearn.metrics.precision_score(y_test, y_pred)

    recall = sklearn.metrics.recall_score(y_test, y_pred)
    print("score :", sco)
    print("balanced accuracy :", balanced_sco)

    print("confusion matrix: \n", confusion)

    print("f1score: ", f1score)
    print("precision: ", precision)
    print("recall: ", recall)
    print("------------------------------------")

    print("SVMClassifier")

    clf_rf = svm.SVC(kernel='linear')

    clf_rf = clf_rf.fit(X=X_train, y=y_train)

    y_pred = clf_rf.predict(X_test)

    # sco = log_model.score( y_test , y_pred)

    sco = sklearn.metrics.accuracy_score(y_test, y_pred)

    balanced_sco = sklearn.metrics.balanced_accuracy_score(y_test, y_pred)

    f1score = sklearn.metrics.f1_score(y_test, y_pred)

    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=['YES', 'NO'],
                             columns=['predicted_YES', 'predicted_NO'])

    precision = sklearn.metrics.precision_score(y_test, y_pred)

    recall = sklearn.metrics.recall_score(y_test, y_pred)
    print("score :", sco)
    print("balanced accuracy :", balanced_sco)

    print("confusion matrix: \n", confusion)

    print("f1score: ", f1score)
    print("precision: ", precision)
    print("recall: ", recall)
    print("------------------------------------")

    print("KNN Classifier")

    clf_rf = KNeighborsClassifier()

    clf_rf = clf_rf.fit(X=X_train, y=y_train)

    y_pred = clf_rf.predict(X_test)

    # sco = log_model.score( y_test , y_pred)

    sco = sklearn.metrics.accuracy_score(y_test, y_pred)

    balanced_sco = sklearn.metrics.balanced_accuracy_score(y_test, y_pred)

    f1score = sklearn.metrics.f1_score(y_test, y_pred)

    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=['YES', 'NO'],
                             columns=['predicted_YES', 'predicted_NO'])

    precision = sklearn.metrics.precision_score(y_test, y_pred)

    recall = sklearn.metrics.recall_score(y_test, y_pred)
    print("score :", sco)
    print("balanced accuracy :", balanced_sco)

    print("confusion matrix: \n", confusion)

    print("f1score: ", f1score)
    print("precision: ", precision)
    print("recall: ", recall)
    print("------------------------------------")
    # --------------------------------


def compute_similarity(text1, text2):
    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)
    return get_cosine(vector1, vector2)


def text_to_vector(text):
    WORD = re.compile(r'\w+')
    words = WORD.findall(text)
    return Counter(words)


def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
    sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def get_avg_cosine_similarity(data, keyword_list):
    # Split the data into pairs

    cosine_sim = 0.0

    # Compute Average Content Similarity between pairs of tweets over a given range
    # Summation of [similarity(one pair)/(total pairs in list)]
    for keyword in keyword_list:
        cosine_sim = cosine_sim + compute_similarity(data, keyword)

    return cosine_sim


def get_text_based_features(tweet_text):
    char_count = tweet_text.apply(len)
    word_count = tweet_text.apply(lambda x: len(x.split()))
    word_density = char_count / (word_count + 1)

    noun_count = tweet_text.apply(lambda x: check_pos_tag(x, 'noun'))
    verb_count = tweet_text.apply(lambda x: check_pos_tag(x, 'verb'))
    adj_count = tweet_text.apply(lambda x: check_pos_tag(x, 'adj'))
    adv_count = tweet_text.apply(lambda x: check_pos_tag(x, 'adv'))
    pron_count = tweet_text.apply(lambda x: check_pos_tag(x, 'pron'))

    return word_density, noun_count, verb_count, adj_count, adv_count, pron_count


# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    pos_family = {
        'noun': ['NN', 'NNS', 'NNP', 'NNPS'],
        'pron': ['PRP', 'PRP$', 'WP', 'WP$'],
        'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
        'adj': ['JJ', 'JJR', 'JJS'],
        'adv': ['RB', 'RBR', 'RBS', 'WRB']
    }
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt


if __name__ == '__main__':
    main()


0    355
1    293
Name: Final_Migration_label, dtype: int64
------------- METRICS --------------
LOGISTIC REGRESSION
score : 0.8641975308641975
balanced accuracy : 0.864367816091954
confusion matrix: 
      predicted_YES  predicted_NO
YES             65            10
NO              12            75
f1score:  0.8552631578947368
precision:  0.8441558441558441
recall:  0.8666666666666667
------------------------------------
Multinomial NB
score : 0.8271604938271605
balanced accuracy : 0.8298850574712644
confusion matrix: 
      predicted_YES  predicted_NO
YES             65            10
NO              18            69
f1score:  0.8227848101265823
precision:  0.7831325301204819
recall:  0.8666666666666667
------------------------------------
DecisionTreeClassifier
score : 0.8395061728395061
balanced accuracy : 0.8358620689655172
confusion matrix: 
      predicted_YES  predicted_NO
YES             59            16
NO              10            77
f1score:  0.8194444444444444
precision:  

