In [1]:
#!python -m spacy download de_core_news_md --user
#!python -m spacy download en_core_web_lg --user
# nltk.download('vader_lexicon')
#!pip install --user xgboost

In [4]:
en_nlp = spacy.load("en_core_web_lg")
de_nlp = spacy.load("de_core_news_md")

In [3]:
import re
import spacy

#!python -m spacy download de_core_news_md
#!python -m spacy download en_core_web_lg
import nltk

# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download('vader_lexicon')
# nltk.download("averaged_perceptron_tagger")
##from nltk import pos_tag, pos_tag_sents, word_tokenize, sent_tokenize
##from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load

import warnings

warnings.filterwarnings("ignore")

In [5]:
df_deutsch = pd.read_csv("deutsch_stances.csv", index_col=0)
df_deutsch.reset_index(inplace=True, drop=True)

In [8]:
df_english = pd.read_csv("english_stances.csv", index_col=0)
df_english.reset_index(inplace=True, drop=True)

In [9]:
def stemmer(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        if item.isdigit():
            continue
        elif item.isalnum():
            stems.append(PorterStemmer().stem(item))
    return stems


def clean_text(text):
    website_pattern = re.compile(r"\((.*?)\)")
    slash_pattern = re.compile(r"[\[\]]")
    text = re.sub(website_pattern, "", text)
    text = re.sub(slash_pattern, "", text)
    return text


def generate_base(df, column, language, model="glove"):
    lang = "en" if language == "english" else "de"

    if model == "glove":
        nlp = en_nlp if language == "english" else de_nlp
        embeddings = np.array([nlp(x).vector for x in list(df[column].values)])
        shape = embeddings.shape[1]
        columns = ["{}_dimension_{}".format(column, i) for i in range(shape)]
        ff = pd.DataFrame(data=embeddings, columns=columns)

    elif model == "tfidf-stemmed":
        model = load("tfidf_" + lang + "_stemmed.sav")
        data = model.transform(df[column])
        columns = [column + ":" + col for col in model.get_feature_names()]
        ff = pd.SparseDataFrame(data=data, columns=columns).fillna(0)

    elif model == "tfidf-unstemmed":
        model = load("tfidf_" + lang + "_unstemmed.sav")
        data = model.transform(df[column])
        columns = [column + ":" + col for col in model.get_feature_names()]
        ff = pd.SparseDataFrame(data=data, columns=columns).fillna(0)

    return ff


def generate_additional(df, column, language, modes=["pos", "ner", "sentiment"]):
    nlp = en_nlp if language == "english" else de_nlp
    docs = [nlp(x) for x in list(df[column].values)]
    n = len(df)
    dfs = []

    # use pos tags
    if "pos" in modes:
        # research tag by using spacy.explain: spacy.explain("ADP")
        pos_tags = {
            "PRON": [0] * n,
            "ADV": [0] * n,
            "ADJ": [0] * n,
            "ADP": [0] * n,
            "DET": [0] * n,
            "AUX": [0] * n,
            "VERB": [0] * n,
            "NOUN": [0] * n,
            # "PUNCT": [0]*n,
            "NUM": [0] * n,
        }

        for i, doc in enumerate(docs):
            for token in doc:
                if token.pos_ in pos_tags.keys():
                    pos_tags[token.pos_][i] += 1
        tf = pd.DataFrame.from_dict(pos_tags)
        tf.columns = [column + ":" + col for col in tf.columns]
        dfs.append(tf)

    # use sentiment tas: negative, neutral, positive and compound
    if "sentiment" in modes:
        sentiment = [sid.polarity_scores(x) for x in list(df[column].values)]
        tf = pd.DataFrame(data=sentiment)
        tf.columns = [column + ":" + col for col in tf.columns]
        dfs.append(tf)

    # use named entity recognition:
    if "ner" in modes:
        ner_types = {
            "PERSON": [0] * n,
            "NORP": [0] * n,
            "FAC": [0] * n,
            "ORG": [0] * n,
            "GPE": [0] * n,
            "LOC": [0] * n,
            "PRODUCT": [0] * n,
            "EVENT": [0] * n,
            "WORK_OF_ART": [0] * n,
            "LAW": [0] * n,
            "LANGUAGE": [0] * n,
            "QUANITY": [0] * n,
            "ORDINAL": [0] * n,
            "CARDINAL": [0] * n,
        }
        for i, doc in enumerate(docs):
            for entity in doc.ents:
                if entity.label_ in ner_types.keys():
                    ner_types[entity.label_][i] += 1
        tf = pd.DataFrame.from_dict(ner_types)
        tf.columns = [column + ":" + col for col in tf.columns]
        dfs.append(tf)
    if "structure" in modes:
        pass

    return pd.concat(dfs, axis=1)


def prep_dataset(df, model, language, modes=[]):
    dfs = []
    df_stance = pd.concat([df["stance"]], axis=1)
    df_stance["stance"] = df_stance.stance.apply(lambda x: 1 if x == "RA" else 0)
    dfs.append(df_stance)
    dfs.append(generate_base(df, "child_text", model=model, language=language))
    dfs.append(generate_base(df, "parent_text", model=model, language=language))
    if modes != []:
        dfs.append(
            generate_additional(df, "child_text", language=language, modes=modes)
        )
        dfs.append(
            generate_additional(df, "parent_text", language=language, modes=modes)
        )
    return pd.concat(dfs, axis=1)

In [10]:
%%time
df = prep_dataset(
    df_english, model="glove", language="english", modes=["ner", "sentiment", "pos"]
)
df.to_csv("english_features.csv")
df

CPU times: user 1h 49min 53s, sys: 25.7 s, total: 1h 50min 19s
Wall time: 1h 50min 22s


Unnamed: 0,stance,child_text_dimension_0,child_text_dimension_1,child_text_dimension_2,child_text_dimension_3,child_text_dimension_4,child_text_dimension_5,child_text_dimension_6,child_text_dimension_7,child_text_dimension_8,...,parent_text:GPE,parent_text:LOC,parent_text:PRODUCT,parent_text:EVENT,parent_text:WORK_OF_ART,parent_text:LAW,parent_text:LANGUAGE,parent_text:QUANITY,parent_text:ORDINAL,parent_text:CARDINAL
0,1,0.130623,0.123188,-0.145908,-0.063913,0.054993,-0.070628,0.079733,-0.099298,-0.040390,...,0,0,0,1,0,0,0,0,0,0
1,1,-0.026860,0.073878,-0.106161,-0.020275,0.068989,-0.097539,0.033404,-0.013669,0.008755,...,0,0,0,0,0,0,0,0,0,0
2,0,-0.045085,0.203527,-0.013752,-0.014270,0.108635,-0.053340,0.006194,-0.019230,-0.018006,...,0,0,0,1,0,0,0,0,0,0
3,1,-0.051589,0.114056,0.036334,0.101815,0.059305,0.004370,0.114249,0.052089,-0.070493,...,0,0,0,1,0,0,0,0,0,0
4,1,-0.017841,0.077363,0.012894,0.025585,0.057311,0.079908,0.061698,0.027671,0.042127,...,0,0,0,1,0,0,0,0,0,0
5,1,-0.072473,0.264192,-0.021959,0.058564,0.143847,0.037288,0.083770,0.097194,-0.134935,...,0,0,0,1,0,0,0,0,0,0
6,1,0.032646,0.210633,-0.051666,-0.017457,0.073150,0.030412,0.055569,0.028691,-0.090199,...,0,0,0,1,0,0,0,0,0,0
7,1,-0.016048,0.193543,-0.047200,0.041230,0.072811,0.004104,0.081252,0.016777,-0.073764,...,0,0,0,1,0,0,0,0,0,0
8,1,0.032561,0.212251,-0.077250,-0.098701,0.041422,-0.010742,0.056119,0.031253,-0.038585,...,0,0,0,0,0,0,0,0,0,0
9,1,0.028982,0.188044,-0.062851,-0.015534,0.099448,0.019276,0.138586,0.075137,-0.077126,...,0,0,0,0,0,0,0,0,0,0


In [11]:
%%time
df_de = prep_dataset(
    df_deutsch, model="glove", language="german", modes=["ner", "sentiment", "pos"]
)
df_de.to_csv("german_features.csv")
df_de

CPU times: user 1h 51min 45s, sys: 24.3 s, total: 1h 52min 9s
Wall time: 1h 52min 11s


Unnamed: 0,stance,child_text_dimension_0,child_text_dimension_1,child_text_dimension_2,child_text_dimension_3,child_text_dimension_4,child_text_dimension_5,child_text_dimension_6,child_text_dimension_7,child_text_dimension_8,...,parent_text:GPE,parent_text:LOC,parent_text:PRODUCT,parent_text:EVENT,parent_text:WORK_OF_ART,parent_text:LAW,parent_text:LANGUAGE,parent_text:QUANITY,parent_text:ORDINAL,parent_text:CARDINAL
0,1,0.124905,0.053605,0.196905,0.090121,-0.083347,0.238456,-0.111795,-0.248336,0.162668,...,0,0,0,0,0,0,0,0,0,0
1,1,0.117420,0.080640,0.213117,0.061254,-0.097151,0.257877,-0.168999,-0.199807,0.145528,...,0,0,0,0,0,0,0,0,0,0
2,0,0.099127,0.093631,0.142833,0.124882,-0.091612,0.346630,-0.038201,-0.216618,0.182457,...,0,0,0,0,0,0,0,0,0,0
3,1,0.117486,0.038216,0.138340,0.054034,-0.029202,0.304484,-0.104435,-0.231509,0.096917,...,0,0,0,0,0,0,0,0,0,0
4,1,0.128663,0.039507,0.151170,0.106024,0.025180,0.306350,-0.045709,-0.201753,0.073362,...,0,0,0,0,0,0,0,0,0,0
5,1,0.198094,0.021125,0.180409,0.109653,0.092722,0.417828,-0.048576,-0.268687,0.000954,...,0,0,0,0,0,0,0,0,0,0
6,1,0.099752,0.056971,0.139890,0.097432,0.009549,0.345779,-0.116704,-0.176844,0.072133,...,0,0,0,0,0,0,0,0,0,0
7,1,0.202727,0.023417,0.127931,0.079199,0.082009,0.366207,-0.085083,-0.222480,0.057391,...,0,0,0,0,0,0,0,0,0,0
8,1,0.116264,0.035508,0.157297,0.106570,-0.059643,0.260873,-0.112695,-0.194009,0.178929,...,0,0,0,0,0,0,0,0,0,0
9,1,0.178199,0.023235,0.168791,0.079819,0.005933,0.292294,-0.100062,-0.213439,0.029970,...,0,1,0,0,0,0,0,0,0,0


In [16]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb

# random state value
rsv = 42
# cpus used for training
n_jobs = -1

models = {  # "SVG": SVC(probability=True,random_state=rsv),
    "LogReg": LogisticRegression(random_state=rsv, n_jobs=n_jobs),
    "RanFor": RandomForestClassifier(random_state=rsv, n_jobs=n_jobs),
    # "GausNB": GaussianNB(),
    # "LDA": LinearDiscriminantAnalysis(),
    "KNN": KNeighborsClassifier(n_jobs=n_jobs),
    "XGBOOST": xgb.XGBClassifier(n_jobs=n_jobs, random_state=rsv),
}


# split in training data matrix X and target y
def generate_cv_sets(df: pd.DataFrame):
    X = df.loc[:, df.columns != "stance"]
    y = df[["stance"]].values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    return X_train, X_test, y_train, y_test

In [24]:
%%time
X_train, X_test, y_train, y_test = generate_cv_sets(df.iloc[:, :-54])
results = {}
for name, model in models.items():
    model.fit(X_train.fillna(0), y_train)
    score = model.score(X_test.fillna(0), y_test)
    results[name] = {"score": score}
    print(name, score)

LogReg 0.6637283846478279
RanFor 0.6475959510754956
KNN 0.6297975537747785
XGBOOST 0.6749683677773092
CPU times: user 1d 10h 38min 23s, sys: 1min 3s, total: 1d 10h 39min 27s
Wall time: 1h 10min 50s


In [25]:
%%time
X_train, X_test, y_train, y_test = generate_cv_sets(df_de.iloc[:, :-54])
results_de = {}
for name, model in models_de.items():
    model.fit(X_train.fillna(0), y_train)
    score = model.score(X_test.fillna(0), y_test)
    results_de[name] = {"score": score}
    print(name, score)

LogReg 0.6602066638549136
RanFor 0.6410586250527204
KNN 0.6152045550400674
XGBOOST 0.6679038380430198
CPU times: user 1d 11h 34min 45s, sys: 2min 6s, total: 1d 11h 36min 51s
Wall time: 1h 13min 16s
