In [2]:
%load_ext autoreload

In [39]:
%autoreload 2
import pandas as pd

from sklearn import model_selection
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import metrics
from sklearn import svm
from sklearn import ensemble
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#local package
import text_processing

In [24]:
#Import data
tweets, markers = text_processing.import_data('sexist_tweets.json', 'tweet_sample.json')
#Separate into train and test sets
train_x, validate_x, train_y, validate_y = model_selection.train_test_split(tweets, markers)

In [29]:
#SET UP WORD FREQUENCY FEATURES

# Word count w/CountVectorizer
countVect = CountVectorizer(analyzer='word', tokenizer=text_processing.tokenize_status_text)
countVect.fit(tweets)
# transform the training and validation data using count vectorizer object
xtrain_count =  countVect.transform(train_x)
xvalid_count =  countVect.transform(validate_x)

# word-level w/TfidfVectorizer
tfidfVect = TfidfVectorizer(analyzer='word', max_features=5000,
                            tokenizer=text_processing.tokenize_status_text)
tfidfVect.fit(tweets)
xtrain_tfidf =  tfidfVect.transform(train_x)
xvalid_tfidf =  tfidfVect.transform(validate_x)

# ngram-level w/TfidfVectorizer
tfidfVect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=5000,
                                   tokenizer=text_processing.tokenize_status_text)
tfidfVect_ngram.fit(tweets)
xtrain_tfidf_ngram =  tfidfVect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidfVect_ngram.transform(validate_x)

# char-level w/TfidfVectorizer
tfidfVect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000,
                                         tokenizer=text_processing.tokenize_status_text)
tfidfVect_ngram_chars.fit(tweets)
xtrain_tfidf_ngram_chars =  tfidfVect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidfVect_ngram_chars.transform(validate_x)

In [36]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    return metrics.accuracy_score(predictions, validate_y)

In [38]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)

# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)

LR, Count Vectors:  0.9673684210526315
LR, WordLevel TF-IDF:  0.9596491228070175
LR, N-Gram Vectors:  0.8785964912280702
LR, CharLevel Vectors:  0.9568421052631579
SVM, N-Gram Vectors:  0.7529824561403509
RF, Count Vectors:  0.9385964912280702
RF, WordLevel TF-IDF:  0.9547368421052631


In [44]:
xtrain_count.shape

(8548, 25960)