In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,LeaveOneOut,KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm import tqdm_notebook, tqdm

# Expose the user's Google Drive under /content/drive (Colab-only API).
drive.mount('/content/drive')

# NOTE(review): this path is relative, so it only resolves when the working
# directory is the folder that contains the `drive` mount point -- confirm.
path = 'drive/My Drive/blogtext_preprocessed.csv'

# Number of leading rows of the CSV used for the experiment.
N_ROWS = 10000

# `nrows` stops parsing after N_ROWS rows instead of loading the entire file
# and slicing it afterwards; the result is also a fresh frame rather than a
# slice of a larger one.
blog_data = pd.read_csv(path, nrows=N_ROWS)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
def createBasicFeatures(data):
  """Build TF-IDF features, encoded labels and the vocabulary from blog rows.

  Rows containing any missing value are discarded, and the ``gender`` column
  is encoded with ``'male' -> 1`` and ``'female' -> 0`` (other values are
  passed through unchanged). The DataFrame passed in is NOT modified; all
  cleaning is done on derived objects so the function can be re-run safely.

  Args:
    data: DataFrame providing at least a ``text`` and a ``gender`` column.

  Returns:
    A ``(texts, classes, vocab)`` tuple where
      texts   -- dense 2-D array of TF-IDF weights, one row per kept document;
      classes -- list of encoded gender labels, aligned with the rows of texts;
      vocab   -- list of terms; ``vocab[j]`` names column ``j`` of texts.
  """
  # dropna() without inplace returns a new frame, leaving the caller's intact.
  cleaned = data.dropna()
  temp_text = cleaned['text'].to_list()
  classes = cleaned['gender'].replace({'male': 1, 'female': 0}).to_list()

  # Tokens: optional leading digits, then a letter, then any alphanumerics --
  # i.e. purely numeric tokens are ignored.
  vectorizer = TfidfVectorizer(analyzer='word',token_pattern=r'\d*[a-zA-Z][a-zA-Z0-9]*')
  X = vectorizer.fit_transform(temp_text)

  # get_feature_names() was removed in scikit-learn 1.2; prefer the
  # replacement and fall back only on older releases. tolist() keeps the
  # return type a plain list of str, as before.
  if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
  else:
    vocab = vectorizer.get_feature_names()

  # Densify to keep the historical return type (this can be memory hungry
  # for a large vocabulary).
  texts = X.toarray()
  return texts,classes,vocab

def evaluateModel(X,y,vocab,penalty="l1",termsToTake=20):
  """Cross-validate a logistic regression and list its strongest terms.

  Args:
    X: feature matrix, one row per document.
    y: class labels aligned with the rows of X.
    vocab: sequence of terms; ``vocab[j]`` names column ``j`` of X.
    penalty: regularisation passed to LogisticRegression ("l1" or "l2").
    termsToTake: how many top terms to report per class (default 20).

  Returns:
    ``(avg_score, pos_indicators, neg_indicators)``:
      avg_score -- mean test score over the 10 shuffled folds;
      pos_indicators / neg_indicators -- lists of up to ``termsToTake`` terms.
    When the fitted model's first class is the string "pos", the "pos" list
    holds the terms with the most negative weights. For any other labelling
    (e.g. 0/1) the first list holds the terms with the most positive weights,
    i.e. those pushing towards ``model.classes_[1]``.
  """
  # create the model and score it with a fixed, shuffled 10-fold split
  model = LogisticRegression(penalty=penalty,solver="liblinear")
  folds = KFold(n_splits=10, shuffle=True, random_state=1)
  results = cross_validate(model,X,y,cv=folds)

  # determine the average accuracy
  avg_score = results["test_score"].mean()

  # determine the most informative features
  # this requires us to fit the model to everything, because we need a
  # single model to draw coefficients from, rather than one per fold
  model.fit(X,y)
  weights = model.coef_[0, :]
  class0_weight_sorted = weights.argsort()       # most negative first
  class1_weight_sorted = (-weights).argsort()    # most positive first

  class0_indicators = [vocab[i] for i in class0_weight_sorted[:termsToTake]]
  class1_indicators = [vocab[i] for i in class1_weight_sorted[:termsToTake]]

  # Negative weights point to classes_[0] and positive ones to classes_[1];
  # put whichever class plays the "positive" role first in the result.
  if model.classes_[0] == "pos":
    return avg_score,class0_indicators,class1_indicators
  return avg_score,class1_indicators,class0_indicators

def runEvaluation(X,y,vocab):
  """Evaluate the classifier with L1 and then L2 regularisation and print a report.

  For each penalty a banner is printed, followed by the average
  cross-validated accuracy and the two lists of most informative terms
  returned by evaluateModel.

  Args:
    X: feature matrix, one row per document.
    y: class labels aligned with the rows of X.
    vocab: sequence of terms naming the columns of X.
  """
  # (banner, penalty) pairs, processed in this order.
  runs = (
    ("----------L1 Norm-----------", "l1"),
    ("----------L2 Norm-----------", "l2"),
  )
  for banner, norm in runs:
    print(banner)
    accuracy, pos_terms, neg_terms = evaluateModel(X, y, vocab, norm)
    print("The model's average accuracy is %f"%accuracy)
    print("The most informative terms for pos are: %s"%pos_terms)
    print("The most informative terms for neg are: %s"%neg_terms)

In [9]:
# Vectorise the loaded blog posts, then report L1 and L2 results.
X, y, vocab = createBasicFeatures(data=blog_data)
runEvaluation(X=X, y=y, vocab=vocab)

----------L1 Norm-----------
The model's average accuracy is 0.751522
The most informative terms for pos are: ['duf', 'hal', 'dj', 'f', 'ben', 'panda', 'pandyland', 'stoner', 'chicago', 'agre', 'korean', 'album', 'angi', 'cowork', 'johnathan', 'seoul', 'ash', 'bb', 'sinc', 'film']
The most informative terms for neg are: ['diva', 'ked', 'lar', 'dun', 'lol', 'heart', 'quizilla', 'chantel', 'rachel', 'buddi', 'fall', 'ppl', 'hurt', 'beach', 'eg', 'jame', 'sit', 'sarah', 'theo', 'poem']
----------L2 Norm-----------
The model's average accuracy is 0.764718
The most informative terms for pos are: ['duf', 'dj', 'hal', 'ben', 'panda', 'stoner', 'angi', 'pandyland', 'korean', 'fer', 'agre', 'f', 'johnathan', 'album', 'hey', 'sinc', 'play', 'bush', 'but', 'chicago']
The most informative terms for neg are: ['diva', 'ked', 'lol', 'heart', 'quizilla', 'realli', 'fall', 'urllink', 'hurt', 'i', 'never', 'fuck', 'ya', 'dun', 'lar', 'fun', 'sit', 'till', 'mayb', 'ppl']
