In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

import final_metric

In [None]:
def load_data(file_path, threshold=0.5):
    """Read a CSV file and binarise its ``target`` column.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.
    threshold : float, default 0.5
        Values strictly greater than this become ``1``; everything else
        (including missing values, which never compare greater) becomes ``0``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``target`` replaced by 0/1 integers.

    Raises
    ------
    KeyError
        If the file has no ``target`` column.
    """
    data = pd.read_csv(file_path)
    # Vectorised comparison instead of a per-row Python lambda: same result,
    # but it does not call back into the interpreter for every row.
    data["target"] = data["target"].gt(threshold).astype("int64")
    return data

In [None]:
def preprocess(text_data, vectorizer=None, return_vectorizer=False):
    """Convert raw documents into a TF-IDF feature matrix.

    Parameters
    ----------
    text_data : iterable of str
        The documents to vectorise.
    vectorizer : object with a ``transform`` method, optional
        An already-fitted vectoriser. When given, ``text_data`` is only
        transformed, so it is projected onto the vocabulary that vectoriser
        learned earlier (needed for validation / inference data). When
        omitted, a new ``TfidfVectorizer`` (English stop words, at most
        10000 features) is fitted on ``text_data``.
    return_vectorizer : bool, default False
        If true, return ``(features, vectorizer)`` so the vectoriser can be
        reused; otherwise return only the feature matrix.

    Returns
    -------
    features or (features, vectorizer)
        Whatever the vectoriser's ``fit_transform`` / ``transform`` returns,
        optionally paired with the vectoriser itself.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
        features = vectorizer.fit_transform(text_data)
    else:
        # Reuse the existing vocabulary; refitting would yield columns that do
        # not line up with features produced by earlier calls.
        features = vectorizer.transform(text_data)

    if return_vectorizer:
        return features, vectorizer
    return features

In [None]:
def _positive_class_scores(model, features):
    """Return continuous scores for the positive class from a fitted model."""
    if hasattr(model, "predict_proba"):
        # Column 1 corresponds to classes_[1], the greater label.
        return model.predict_proba(features)[:, 1]
    if hasattr(model, "decision_function"):
        # e.g. LinearSVC, which has no predict_proba.
        return model.decision_function(features)
    # Last resort: hard labels (AUC is then computed from a single threshold).
    return model.predict(features)


def train_models(data, test_size=0.2, random_state=42, model_path='best_model.bin'):
    """Train several classifiers on TF-IDF features and save the best one.

    The frame is split into stratified train/validation parts, a TF-IDF
    vectoriser is fitted on the training text only, each candidate model is
    fitted on the resulting features, and the model with the highest
    validation ROC AUC is pickled together with the fitted vectoriser as a
    single pipeline that accepts raw text.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``comment_text`` column (documents) and a ``target``
        column (class labels).
    test_size : float, default 0.2
        Validation share, forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed for the split and for the candidate models that accept one.
    model_path : str, default 'best_model.bin'
        Where the pickled vectoriser + model pipeline is written.

    Returns
    -------
    None
        Progress and scores are printed; the artefact is written to
        ``model_path``.
    """
    df_train, df_valid = train_test_split(
        data, test_size=test_size, random_state=random_state, stratify=data["target"].values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # The estimators need numeric input, so vectorise the text first. Fit on
    # the training split only so validation vocabulary/IDF statistics do not
    # leak into training. Missing comments are treated as empty documents.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    X_train = vectorizer.fit_transform(df_train["comment_text"].fillna(""))
    X_valid = vectorizer.transform(df_valid["comment_text"].fillna(""))

    y_train = df_train["target"]
    y_valid = df_valid["target"]

    # (display name, estimator) pairs; stochastic estimators get the seed so
    # repeated runs are comparable.
    candidates = [
        ('Logistic Regression', LogisticRegression(max_iter=1000)),
        ('Naive Bayes', MultinomialNB()),
        ('Random Forest', RandomForestClassifier(random_state=random_state)),
        ('Linear SVC', LinearSVC(random_state=random_state)),
        ('KNeighbors', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier(random_state=random_state)),
        ('MLP', MLPClassifier(random_state=random_state)),
    ]

    # Start below any attainable score so a model is always selected.
    best_auc = float('-inf')
    best_model = None
    best_model_name = None

    for name, model in candidates:
        print(f'Training {name}...')
        model.fit(X_train, y_train)
        # ROC AUC is a ranking metric: feed it continuous scores, not the
        # thresholded output of predict().
        y_score = _positive_class_scores(model, X_valid)
        auc = roc_auc_score(y_valid, y_score)
        print(f'{name} AUC: {auc}')

        if auc > best_auc:
            best_auc = auc
            best_model = model
            best_model_name = name

    print(f'Best model: {best_model_name} with AUC: {best_auc}')

    # Persist the fitted vectoriser with the model; both steps are already
    # fitted, so the pipeline is used for inference only and takes raw text.
    best_pipeline = make_pipeline(vectorizer, best_model)
    with open(model_path, 'wb') as f:
        pickle.dump(best_pipeline, f)

    # TODO: make final metric call and print it
    # metric_value = final_metric.get_value(df_valid, valid_outputs, model_name)

In [None]:
def predict(text, model_path='best_model.bin'):
    """Classify a single piece of text with a pickled model.

    The pickled object must accept raw text through ``predict`` - that is, it
    has to carry its own fitted feature extraction (for example a
    scikit-learn ``Pipeline`` whose first step is the vectoriser fitted on the
    training data). Fitting a new vectoriser on the single input text would
    build a vocabulary unrelated to the one the model was trained on, so no
    vectorisation is done here.

    Parameters
    ----------
    text : str
        The document to classify.
    model_path : str, default 'best_model.bin'
        Path of the pickled model.

    Returns
    -------
    The first (and only) element of ``model.predict([text])``.
    """
    # SECURITY: unpickling runs arbitrary code from the file. Only load
    # artefacts that were produced by a trusted source.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    return model.predict([text])[0]

In [None]:
if __name__ == '__main__':
    # Load the labelled comments, then train the candidates and save the winner.
    data = load_data('data/train.csv')
    train_models(data)

    # TODO: add visualizations

    # Demonstrate predict() on a couple of sample comments.
    for example_text in (
        "Immigrants don't get deported. ILLEGAL immigrants do.",
        'You should burn in hell.',
    ):
        prediction = predict(example_text)
        print(f'Prediction for "{example_text}": {prediction}')