In [1]:
# imports
import sys  
sys.path.insert(0, '../../../TwitterCoordinatedInorganicAgents')

import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModel
from transformers import AutoTokenizer
import numpy as np
from tweetcore.tasks.postgres_target import download_data
from data_builders.tasks import collapse_embeddings
import credentials_refactor
import global_settings as gs
import gc

# models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_curve, auc, confusion_matrix

# Getting data ready

In [2]:
# data
# Fetch database credentials, then load every row/column of the
# user-classification feature table into a DataFrame.
conf = credentials_refactor.return_credentials()
features_query = '''
                                                          select *
                                                          from redacted_tables.features_user_classification
                                                          '''
data = download_data.pandas_df_from_postgre_query(configuration=conf, query=features_query)

# User-based classifier

In [5]:
# Keep every column whose name contains the 'uuu' marker; the result must stay
# a plain list because it is concatenated with ["target"] further down.
user_based_features = list(filter(lambda column_name: 'uuu' in column_name, data.columns))

In [10]:
# Work on an independent copy holding only the 'uuu' feature columns plus the label.
df_user = data.loc[:, [*user_based_features, "target"]].copy()

In [14]:
# Replace every missing value with the sentinel -1.
df_user = df_user.fillna(value=-1)


In [22]:
# Columns to convert to an integer dtype.
cast_to_int = ['uuu_year_joined', 'uuu_month_joined', 'uuu_day_of_month_joined', 'uuu_day_of_week_joined', 'uuu_hour_joined']
# Rebind through astype instead of assigning with df_user.loc[:, col] = ...:
# since pandas 2.0 a full-column .loc assignment writes into the existing
# column in place and keeps its previous dtype, so the integer cast would
# silently have no effect. astype with a column -> dtype mapping converts all
# listed columns in one pass and leaves the other columns untouched.
df_user = df_user.astype({column: int for column in cast_to_int})

In [23]:
# Print df_user's index range, column names, non-null counts and dtypes,
# which is a quick check on the missing-value fill and the integer casts.
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 35 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   uuu_year_joined                        1000000 non-null  int32  
 1   uuu_month_joined                       1000000 non-null  int32  
 2   uuu_day_of_month_joined                1000000 non-null  int32  
 3   uuu_day_of_week_joined                 1000000 non-null  int32  
 4   uuu_joined_weekend                     1000000 non-null  int64  
 5   uuu_hour_joined                        1000000 non-null  int32  
 6   uuu_username_length                    1000000 non-null  int64  
 7   uuu_fraction_numbers_username          1000000 non-null  float64
 8   uuu_fraction_emojis_username           1000000 non-null  float64
 9   uuu_fraction_capital_letters_username  1000000 non-null  float64
 10  uuu_fraction_special_char_username     1000

In [27]:
# Separate the predictors from the label column.
X = df_user.drop(columns=["target"]).copy()
y = df_user["target"].copy()

# Hold out 35% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=12)

In [31]:
# test different models

# Seed the stochastic estimators so the cross-validation scores are
# reproducible between runs (12 matches the seed of the train/test split).
rf = RandomForestClassifier(random_state=12)
# LogisticRegression is left disabled; the recorded output of an earlier run
# shows it stopping at the solver's iteration limit.
#lr = LogisticRegression(max_iter=1000)
kn = KNeighborsClassifier()
xgb = XGBClassifier(random_state=12)

all_models = {"RandomForest":rf, "K-Neighbors":kn, "XGBoost":xgb}

# 5-fold cross-validation on the training split only; report the mean and the
# standard deviation of the per-fold F1 scores for each candidate model.
for name, model in all_models.items():
    cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=["f1"])
    fold_f1 = cv_results["test_f1"]
    print(name + " with mean f1 over 5 folds of: " + str(round(np.mean(fold_f1),4)) + " and sigma accross test metrics of: " + str(round(np.std(fold_f1),4)))

RandomForest with mean accuracy over 5 folds of: 0.2522 and sigma accross test metrics of: 0.0021


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression with mean accuracy over 5 folds of: 0.0206 and sigma accross test metrics of: 0.0037
K-Neighbors with mean accuracy over 5 folds of: 0.2452 and sigma accross test metrics of: 0.0023
XGBoost with mean accuracy over 5 folds of: 0.2789 and sigma accross test metrics of: 0.0014
