Importing Libraries

In [13]:
import numpy as np
import pandas as pd
import markov
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score
from markov.api.model import ModelRecorder   # importing Model recorder from markov, used for experiment recording
from markov.api.recording.experiments.integrations.keras.keras_auto_record import auto_record
#from markov.api.recording.integrations.keras.keras_auto_record import auto_record
from markov.api.schemas.model_recording import ModelRecordingConfig, SingleTagInferenceRecord
import scikitplot as skplt
import re
import os
from sklearn.preprocessing import MultiLabelBinarizer
import math

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
pip install scikit-plot

Collecting scikit-plot
  Using cached scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Read the master dataset from ./Datasets and replace every missing value with a
# single space so later string operations never receive NaN.
data1 = pd.read_csv(
    os.path.join(os.path.curdir, "Datasets", "master_dataset.csv")
).fillna(' ')

In [4]:
# Build the working set: every 'sexist' row plus a random sample of up to 2000
# 'not sexist' rows.
df = data1[data1.label_sexist == 'sexist']
ndf = data1[data1.label_sexist == 'not sexist']
# A fixed seed makes the sample identical on every Restart & Run All; capping n at
# len(ndf) avoids the ValueError sample() raises when asked for more rows than exist.
add_df = ndf.sample(n=min(2000, len(ndf)), random_state=42)
frames = [add_df, df]
data = pd.concat(frames)

Data Preprocessing

In [16]:
def clean_text(text):
    """Lower-case the text and strip links, HTML tags and punctuation.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Letters, digits and underscores are kept; every run of
        whitespace is collapsed to one space. Leading/trailing whitespace is
        collapsed to a single space, not stripped.
    """
    text = text.lower()  # lower case
    text = re.sub(r'http\S+', '', text)  # remove http/https links
    text = re.sub(r'www\S+', '', text)  # remove www addresses
    text = re.sub(r'<.*?>', '', text)  # remove html tags (non-greedy, single line)
    text = re.sub(r'[^\w\s]', '', text)  # remove special characters like !,@,#,$,%
    # Raw string: '\s' in a plain literal is an invalid escape sequence and triggers
    # a warning on current Python versions.
    text = re.sub(r'\s+', ' ', text)  # replace multiple whitespace by a single space
    return text

def remove_stopword(text, stopwords):
    """Remove common words such as "the" and "a" from the text.

    Args:
        text: Whitespace-separated string to filter.
        stopwords: Iterable of words to drop. Inside this function the name
            shadows the ``stopwords`` module imported at the top of the notebook.

    Returns:
        The remaining words joined by single spaces.
    """
    # Use the argument: the previous body read the global ``stop_words`` instead,
    # so the parameter was ignored and the function failed outside this notebook.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        # Set membership is O(1) per word instead of scanning a list each time.
        stopword_set = frozenset(stopwords)
    return " ".join(word for word in text.split() if word not in stopword_set)
  
def lemma_text(text, lemmatizer):
    """Lemmatize each token of ``text`` and return the results joined by spaces.

    ``lemmatizer`` must provide a ``lemmatize(word)`` method; tokens come from
    ``tokenize``.
    """
    return " ".join(lemmatizer.lemmatize(token) for token in tokenize(text))

def stem_text(text, stemmer):
    """Stem each token of ``text`` and return the results joined by spaces.

    ``stemmer`` must provide a ``stem(word)`` method; tokens come from ``tokenize``.
    """
    return " ".join(stemmer.stem(token) for token in tokenize(text))

def tokenize(text):
    """Return the list of words obtained by splitting ``text`` on whitespace."""
    words = text.split()
    return words

def process_text(text, lemmatizer, stemmer, stop_words):
    """Clean ``text`` and drop its stop words.

    The text is passed through ``clean_text`` and then ``remove_stopword``.
    ``lemmatizer`` and ``stemmer`` are accepted but currently unused: the
    lemmatization and stemming steps are disabled.

    Returns:
        The processed text as a single string.
    """
    # Disabled steps, kept for reference:
    #   lemma_text(text, lemmatizer)
    #   stem_text(text, stemmer)
    return remove_stopword(clean_text(text), stop_words)
# Shared NLP resources for the preprocessing helpers: a Porter stemmer, a WordNet
# lemmatizer and NLTK's English stop-word list.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [17]:
# Run process_text over every entry of the 'text' column and keep the result in a
# new 'processed_text' column, then preview the first five rows.
data["processed_text"] = data["text"].apply(
    process_text, args=(lemmatizer, stemmer, stop_words)
)
data.head()

Unnamed: 0,text,label_sexist,processed_text
9180,Nobody cares about height if you are taller th...,not sexist,nobody cares height taller girl thats enough f...
18640,More people would favor the women’s movement i...,not sexist,people would favor womens movement knew
18861,MENTION4248 MENTION2511 i love those too much....,not sexist,mention4248 mention2511 love much much
10798,"Ive watched a couple of her videos, and she's ...",not sexist,ive watched couple videos shes kind depressing...
3979,Not even getting into genders Does he realize ...,not sexist,even getting genders realize even amongst anim...


In [7]:
# Stratified 75/25 split of the processed text and its labels. The fixed
# random_state makes the partition identical on every run.
train_X, test_X, train_Y, test_Y = train_test_split(
    data['processed_text'],
    data['label_sexist'],
    stratify=data['label_sexist'],
    test_size=0.25,
    random_state=42,
)

In [8]:
# The training text is used as-is; each training label is stringified and split on
# '/' so that every sample carries a list of label parts.
text = train_X
labels = [str(label).split('/') for label in train_Y]

In [20]:
"""The code creates a Tf-idf vectorizer with a minimum document frequency of 5, 
applies it to a list of texts, then creates a multilabel binarizer and 
fits it to a list of labels. It creates arrays for the input and output features, and 
splits the data into training and test sets"""

# NOTE(review): the vectorizer is fitted on all of `text` before the split below, so
# the held-out rows also shape the vocabulary and IDF weights — confirm this is intended.
tfidfvectorizer = TfidfVectorizer(min_df=5)  # max_features=3000
x_tfidf = tfidfvectorizer.fit_transform(text).toarray()
mlb = MultiLabelBinarizer()
mlb.fit(labels)
Y = mlb.transform(labels)
# .shape[1] gives the column count directly and, unlike len(Y[0]), does not fail
# on an empty matrix.
n_op_features = Y.shape[1]
# These lower-case names differ only by case from train_X/test_X/train_Y/test_Y.
# A fixed random_state keeps this 80/20 split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x_tfidf, Y, test_size=0.2, random_state=42
)
n_ip_features = train_x.shape[1]