In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
from joblib import dump, load

In [9]:
df_deutsch = pd.read_csv("deutsch_stances.csv", index_col=0)
df_deutsch.sample(1)

Unnamed: 0,child_text,parent_text,stance
27,Das ist ganz normal für die gesamte Gesundheit...,Die Ärzte haben vielleicht keine wirkliche Mög...,CA


In [10]:
df_english = pd.read_csv("english_stances.csv", index_col=0)
df_english.sample(1)

Unnamed: 0,child_text,parent_text,stance
52,If there would be overpopulation humankind wou...,There is no overpopulation.,RA


In [11]:
def stemmer(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        if item.isdigit():
            continue
        elif item.isalnum():
            stems.append(PorterStemmer().stem(item))
    return stems


def tfidf(data: pd.Series, lang, use_stemming=False):
    tokenizer = stemmer if use_stemming else None

    model = TfidfVectorizer(
        min_df=30,
        tokenizer=tokenizer,
        stop_words=None,
        token_pattern=r"(?u)\b[A-Za-z]+\b",
    )

    model.fit(data)

    return model

In [12]:
# prep data
data_deutsch = df_deutsch["child_text"] + df_deutsch["parent_text"]

# train german models
## tfidf unstemmed, stop words included
dump(
    tfidf(data=data_deutsch, lang="german", use_stemming=False),
    "tfidf_de_unstemmed.sav",
)

## tfidf stemmed, stop words included
dump(tfidf(data=data_deutsch, lang="german", use_stemming=True), "tfidf_de_stemmed.sav")

['tfidf_de_stemmed.sav']

In [13]:
# prep data
data_english = df_english["child_text"] + df_english["parent_text"]

# train english models
## tfidf unstemmed, stop words included
dump(
    tfidf(data=data_english, lang="english", use_stemming=False),
    "tfidf_en_unstemmed.sav",
)

## tfidf stemmed, stop words included
dump(
    tfidf(data=data_english, lang="english", use_stemming=True), "tfidf_en_stemmed.sav"
)

['tfidf_en_stemmed.sav']

In [14]:
# pred test
model = load("tfidf_de_stemmed.sav")
x = model.transform(df_deutsch["child_text"])

In [18]:
columns = model.get_feature_names()
sparse = pd.SparseDataFrame(data=x, columns=columns).fillna(0)

sparse.sample(5)

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  sparse = pd.SparseDataFrame(data = x, columns=columns).fillna(0)
Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  sdict[columns[col]] = SparseSeries(
Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  column: SparseSeries(
Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return self._constructor(new_data).__finalize__(self)
Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html

Unnamed: 0,1920er,1930er,1940er,1950er,1960er,1970er,1980er,1990er,20a,20and,...,überzeugungen,üblich,üblichen,üblicher,üblicherweis,übrig,übrigen,übt,übung,übungen
4538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
columns

['1920er',
 '1930er',
 '1940er',
 '1950er',
 '1960er',
 '1970er',
 '1980er',
 '1990er',
 '20a',
 '20and',
 '20bi',
 '20conflict',
 '20death',
 '20der',
 '20die',
 '20disaggreg',
 '20i',
 '20in',
 '20new',
 '20of',
 '20on',
 '20problem',
 '20the',
 '20to',
 '20und',
 '20von',
 '2c',
 '2f',
 '3a',
 '3d',
 '3dihub',
 '3famp',
 '70er',
 'a',
 'ab',
 'abbau',
 'abbauen',
 'abbruch',
 'abdecken',
 'abdeckt',
 'abdeckung',
 'aber',
 'aberglauben',
 'abfal',
 'abfäll',
 'abfällig',
 'abgab',
 'abgebaut',
 'abgeben',
 'abgebrochen',
 'abgedeckt',
 'abgegeben',
 'abgehalten',
 'abgelegenen',
 'abgelehnt',
 'abgeleitet',
 'abgelenkt',
 'abgemildert',
 'abgenommen',
 'abgeordnet',
 'abgeordneten',
 'abgerissen',
 'abgeschafft',
 'abgeschlossen',
 'abgeschnitten',
 'abgeschoben',
 'abgeschreckt',
 'abgeschwächt',
 'abgesehen',
 'abgestimmt',
 'abgetan',
 'abgetrieben',
 'abgewiesen',
 'abgewogen',
 'abgeworfen',
 'abgewählt',
 'abgezogen',
 'abgleiten',
 'abgrenzung',
 'abhalten',
 'abhang',
 'abhi