In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,LeaveOneOut,KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import numpy as np
import pandas as pd
from google.colab import drive
from tqdm import tqdm_notebook, tqdm

# Mount Google Drive so the dataset stored there is reachable from the Colab runtime.
drive.mount('/content/drive')
# NOTE: relative path, so it is resolved against the current working directory.
path = 'drive/My Drive/blogtext_preprocessed.csv'
# Parse only the first 10,000 rows rather than loading the whole CSV and
# slicing it afterwards; the result is also a frame in its own right, not a
# slice of a larger one.
blog_data = pd.read_csv(path, nrows=10000)

Mounted at /content/drive


In [2]:
def preprocessing(data):
  """Build TF-IDF features, binary gender labels and the vocabulary from `data`.

  Rows containing any missing value are dropped, `gender` is encoded as
  male -> 1 / female -> 0 (any other value is passed through unchanged) and
  `text` is vectorised with a word-level TF-IDF model. The caller's frame is
  not modified, so the cell calling this function can be re-run safely.

  Note: the vectorizer is fitted on every remaining row, so statistics from
  all documents are baked into the features before any later train/test or
  cross-validation split is made.

  Args:
    data: DataFrame with at least a `text` and a `gender` column.

  Returns:
    A tuple `(texts, classes, vocab)`:
      texts:   dense TF-IDF matrix, one row per kept document.
      classes: list of labels aligned with the rows of `texts`.
      vocab:   list of terms; `vocab[j]` names column `j` of `texts`.
  """
  gender_codes = {'male': 1, 'female': 0}

  # dropna() without inplace returns a new frame and leaves `data` untouched.
  cleaned = data.dropna()
  temp_text = cleaned['text'].to_list()
  # Encode labels in plain Python; unknown labels fall through unchanged.
  classes = [gender_codes.get(label, label) for label in cleaned['gender'].to_list()]

  # Tokens are optional leading digits followed by a letter and alphanumerics.
  vectorizer = TfidfVectorizer(analyzer='word',token_pattern=r'\d*[a-zA-Z][a-zA-Z0-9]*')
  X = vectorizer.fit_transform(temp_text)

  # get_feature_names() was removed in scikit-learn 1.2; prefer the new
  # accessor and fall back to the old one on releases that predate it.
  if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = [str(term) for term in vectorizer.get_feature_names_out()]
  else:
    vocab = vectorizer.get_feature_names()

  # Densify to keep the original return type (can be large for big vocabularies).
  texts = X.toarray()
  return texts,classes,vocab

def evaluate(X,y,vocab,penalty="l2",termsToTake=20):
  """Cross-validate a logistic regression and list its most indicative terms.

  A liblinear `LogisticRegression` is scored with shuffled 10-fold
  cross-validation (fixed `random_state=1`), then fitted once on all of
  `X`, `y` so its coefficients can be inspected.

  Only the first row of `coef_` is read, so this assumes a two-class problem,
  where positive weights favour `classes_[1]` and negative weights favour
  `classes_[0]`.

  Args:
    X: feature matrix, one row per sample and one column per `vocab` entry.
    y: labels aligned with the rows of `X`.
    vocab: sequence of term names; `vocab[j]` names column `j` of `X`.
    penalty: regularisation passed to `LogisticRegression` ("l1" or "l2"
      for the liblinear solver).
    termsToTake: number of top terms to return per class (default 20).

  Returns:
    A tuple `(avg_score, first_indicators, second_indicators)`:
      avg_score: mean test score over the folds, as a float.
      first_indicators: terms for the class labelled "pos" if that label is
        `classes_[0]`, otherwise terms for `classes_[1]`.
      second_indicators: terms for the remaining class.
  """
  model = LogisticRegression(penalty=penalty,solver="liblinear")
  folds = KFold(n_splits=10, shuffle=True, random_state=1)
  results = cross_validate(model,X,y,cv=folds)
  avg_score = float(np.mean(results["test_score"]))

  # Fit on the full data set to obtain a single coefficient vector.
  model.fit(X,y)
  weights = model.coef_[0, :]
  class0_weight_sorted = weights.argsort()      # most negative first -> classes_[0]
  class1_weight_sorted = (-weights).argsort()   # most positive first -> classes_[1]

  class0_indicators = [vocab[i] for i in class0_weight_sorted[:termsToTake]]
  class1_indicators = [vocab[i] for i in class1_weight_sorted[:termsToTake]]

  # Compare as strings so numeric labels (e.g. 0/1) do not trigger a NumPy
  # scalar-vs-str comparison; a "pos" label sorting first flips the order.
  if str(model.classes_[0]) == "pos":
    return avg_score,class0_indicators,class1_indicators
  return avg_score,class1_indicators,class0_indicators



In [4]:
# Vectorise the blog posts, then score the classifier and collect its top terms.
X, y, vocab = preprocessing(blog_data)
avg_score, pos_indicators, neg_indicators = evaluate(X, y, vocab)

# Labels are encoded male -> 1 / female -> 0, so the first list returned by
# evaluate() holds the male-leaning terms and the second the female-leaning ones.
report = (
  ("Accuracy:", avg_score),
  ("Top features for male:", pos_indicators),
  ("Top features for female:", neg_indicators),
)
for heading, value in report:
  print(heading, value)

Accuracy: 0.764718238074155
Top features for male: ['duf', 'dj', 'hal', 'ben', 'panda', 'stoner', 'angi', 'pandyland', 'korean', 'fer', 'agre', 'f', 'johnathan', 'album', 'hey', 'sinc', 'play', 'bush', 'but', 'chicago']
Top features for female: ['diva', 'ked', 'lol', 'heart', 'quizilla', 'realli', 'fall', 'urllink', 'hurt', 'i', 'never', 'fuck', 'ya', 'dun', 'lar', 'fun', 'sit', 'till', 'mayb', 'ppl']
