In [50]:
import pandas as pd
import numpy as np

In [51]:
import warnings
warnings.filterwarnings("ignore")

In [52]:
df = pd.read_csv('files/cyberbullying_tweets.csv')

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
...,...,...
47687,"Black ppl aren't expected to do anything, depe...",ethnicity
47688,Turner did not withhold his disappointment. Tu...,ethnicity
47689,I swear to God. This dumb nigger bitch. I have...,ethnicity
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity


In [53]:
df = pd.get_dummies(df, columns = ['cyberbullying_type'])

In [54]:
new_cols = ['tweet_text', 'age','ethnicity','gender','not_cyberbullying','other_cyberbullying','religion']
df.columns = new_cols

## For a binary classifier, we just need to know what is cyberbullying, and what's not cyberbullying

In [55]:
df = df.drop(columns = ['age','ethnicity','gender','other_cyberbullying','religion'])

## I'd prefer to have my target be something that evaluates True for potentially harmful, not for 'not cyberbullying', so I'm going to switch that up.

In [56]:
df['potentially_harmful'] = [int(not val) for val in df['not_cyberbullying']]
df = df.drop(columns = ['not_cyberbullying'])

## I'm going to see if I can get away with not lemmatizing when I preprocess the text, because it would take ages to run this, and I'm doing this all on my local machine.

In [57]:
from data_preprocessing.preprocess_text import pre_process_text
import tqdm

df['tweet_text'] = [pre_process_text(text, lemmatizer = 'False') for text in tqdm.tqdm(df['tweet_text'])]

100%|██████████| 47692/47692 [00:02<00:00, 16210.65it/s]


In [58]:
x = df['tweet_text']
y = df['potentially_harmful']

In [59]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=0)

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import set_config

set_config(display='diagram')


In [61]:
from sklearn.preprocessing import FunctionTransformer

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

In [62]:
pipe.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'clf': LogisticRegression(),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'clf__C': 1.0,
 'clf__class_weight': None,
 'clf__dual': False,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__l1_ratio': None,
 'clf__max_iter': 100,
 'clf__multi_class': 'auto',
 'clf__n_jobs': None,
 'clf_

In [63]:
grid_params = {
    "tfidf__ngram_range" : [(1,1),(1,2)],
    "tfidf__max_df" : [0.1,1],
    "tfidf__use_idf" : [True, False],
    "tfidf__norm": ['l1','l2'],
    "clf__C" : [0.1,1],
    "clf__solver" : ['liblinear','lbfgs','sag'],
    "clf__penalty": ['l1','l2','elasticnet','none'],
}


In [64]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV


gs = GridSearchCV(pipe, grid_params, scoring = 'accuracy', cv = StratifiedKFold(n_splits = 5), verbose = 1, n_jobs = -1)

In [65]:
gs.fit(x,y)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


In [None]:
best_params = gs.best_params_
best_score = gs.best_score_

In [None]:
best_params

{'clf__C': 0.1,
 'clf__penalty': 'l1',
 'clf__solver': 'liblinear',
 'tfidf__max_df': 0.1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l1',
 'tfidf__use_idf': False}

In [None]:
best_score

0.8416086555397131

In [None]:
best_pipe = gs.best_estimator_

In [None]:
x_train = x_train.tolist()
y_train = y_train.tolist()
x_test = x_test.to_list()
y_test = y_test.to_list()

In [None]:
best_pipe.fit(x_train, y_train)

In [None]:
best_pipe.score(X=x_test, y= y_test)

0.8504036062480343