### Machine Learning classifier baselines
Run some basic ML classifiers (Random Forest, SVC, Multi Layer Perceptron and regression) over our data as a basline.

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [4]:
# Provide functions for running our classifiers.

def RF(X_train, X_test, y_train):
    clf = RandomForestClassifier(random_state=1, class_weight="balanced").fit(
        X_train, y_train
    )
    preds = clf.predict(X_test)
    return preds, clf.feature_importances_


def SVM(X_train, X_test, y_train):
    clf = SVC(random_state=1, class_weight="balanced").fit(X_train, y_train)
    preds = clf.predict(X_test)
    weights = clf.class_weight_
    return preds, weights


def MLP(X_train, X_test, y_train):
    clf = MLPClassifier(random_state=1, solver="lbfgs", max_iter=5000).fit(
        X_train, y_train
    )
    preds = clf.predict(X_test)
    probas = clf.predict_proba(X_test)
    return preds, probas


def regression(X_train, X_test, y_train):
    clf = LogisticRegression(
        random_state=0, max_iter=5000, class_weight="balanced", solver="lbfgs"
    )
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    return preds, clf.coef_

In [11]:
# Setup train and test data. 
# Copied over from models.ipynb so we have the same train/test splits. 
df = pd.read_csv('edos_labelled_aggregated.csv')
df = df.drop(columns=["rewire_id", "label_category", "label_vector", "split"])
df = df.rename(columns={"label_sexist": "label"})
df['label'] = df['label'].map({'not sexist': 0, 'sexist': 1})

train_size = 0.8
train_data = df.sample(frac=train_size, random_state=200).reset_index(drop=True)
test_data = df.drop(train_data.index).reset_index(drop=True)
print(train_data.shape, test_data.shape)
print("TRAIN SPLIT:")
print("not sexist: ", len(train_data[train_data["label"] == 0]))
print("sexist: ", len(train_data[train_data["label"] == 1]))
print("TEST SPLIT:")
print("not sexist: ", len(test_data[test_data["label"] == 0]))
print("sexist: ", len(test_data[test_data["label"] == 1]))

(16000, 2) (4000, 2)
TRAIN SPLIT:
not sexist:  12113
sexist:  3887
TEST SPLIT:
not sexist:  3141
sexist:  859


In [None]:
# Run classifier baslines. 

# X_train -> Sätze aus Train
# X_test -> Nur sentences vom Testset
# Y_train -> Preds aus Train

X_train = train_data["text"]
X_test = test_data["text"]
Y_train = train_data["label"]

rf_result = RF(X_train, X_test, Y_train)
rf_result.preds

ValueError: could not convert string to float: 'I was never on Twitter, but they still hate me.'

In [None]:
svg_results = 
mlp_results = 
regression_results = 