In [123]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import time
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import os
from glob import glob
import numpy as np
from sklearn.preprocessing import StandardScaler

In [26]:
pan = pd.read_excel('../../gender_prediction/data/pan17/pan17_df.xlsx')

In [27]:
pan.head()

Unnamed: 0,author,lang,text,gender,variety,care,fairness,loyalty,authority,purity,...,trust,surprise,positive,negative,sadness,disgust,joy,sentistrength_pos,sentistrength_neg,sentistrength_neutral
0,1d1df51fab9cd0510664b5d197f8400e,es,#PPKNoMeHagasHablar #Chinchero ni empieza y ya...,male,peru,5.091288e-18,7.344024e-24,2.257225e-35,1.0,9.186418e-11,...,0.074974,0.107709,0.140444,0.118268,0.077086,0.07603,0.111932,3,-3,-1
1,7be9d5588a2c05ce6b0195b64a93d156,es,Asi de simple como suena...por eso es que esta...,female,peru,1.812158e-11,8.199007e-21,4.548514e-33,1.0,0.5273741,...,0.066667,0.10303,0.151515,0.129293,0.078788,0.082828,0.09899,3,-2,1
2,a86b6d14d0e91836de9402d64815aa21,es,"La Real de Eusebio no entiende de remontadas, ...",male,spain,0.005874671,4.247725e-15,3.5435300000000004e-33,1.0,1.0,...,0.069318,0.107955,0.173864,0.123864,0.073864,0.068182,0.105682,2,-4,-1
3,b48c568e51506ffc49cf2f635f7502b3,ar,حادث سير مروع لحافلة مدرسة الحصاد الخاصة على ط...,male,levantine,6.146506e-08,0.0001089126,6.15257e-11,0.877201,1.937382e-07,...,0.1,0.075,0.125,0.1,0.1,0.1,0.125,3,-1,1
4,7631ab6eaf13f7554540efd9c21afd46,pt,Cadê esse sono pow 😭\nQueria só 4 horas de son...,female,brazil,0.9146294,0.8858467,1.217284e-08,0.999995,0.9999853,...,0.109948,0.078534,0.115183,0.104712,0.099476,0.08377,0.099476,2,-2,-1


In [28]:
train_cols = pan.columns[5:]
y = [0 if g=='male' else 1 for g in pan['gender']]
x_train, x_test, y_train, y_test = train_test_split(pan[train_cols], y, test_size=0.30, stratify=y, random_state=0)

In [47]:
train_cols

Index(['care', 'fairness', 'loyalty', 'authority', 'purity', 'non_moral',
       'fear', 'anger', 'anticip', 'trust', 'surprise', 'positive', 'negative',
       'sadness', 'disgust', 'joy', 'sentistrength_pos', 'sentistrength_neg',
       'sentistrength_neutral'],
      dtype='object')

### Modeling gender using sentiments with PAN17 twitter dataset
### Logistic Regression

In [30]:
logit = sm.Logit(y_train, x_train.astype(float))
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.672532
         Iterations 5


In [31]:
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 7980
Model:                          Logit   Df Residuals:                     7961
Method:                           MLE   Df Model:                           18
Date:                Tue, 21 Jul 2020   Pseudo R-squ.:                 0.02968
Time:                        12:46:42   Log-Likelihood:                -5366.8
converged:                       True   LL-Null:                       -5531.0
Covariance Type:            nonrobust   LLR p-value:                 6.844e-59
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
care                      0.2686      0.114      2.351      0.019       0.045       0.492
fairness                  0.3800      0.100      3.794      0.000       0.184       0.576
loyalty 

In [32]:
clf = LogisticRegression(random_state=0, C=1e9).fit(x_train, y_train)
fitted_val = clf.predict(x_train)
print("evaluation on train set:")
print("accuracy ", accuracy_score(y_train, fitted_val))
print(classification_report(y_train, fitted_val))
pred = clf.predict(x_test)
print("evaluation on test set:")
print("accuracy ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

evaluation on train set:
accuracy  0.5903508771929824
              precision    recall  f1-score   support

           0       0.59      0.61      0.60      4026
           1       0.59      0.57      0.58      3954

   micro avg       0.59      0.59      0.59      7980
   macro avg       0.59      0.59      0.59      7980
weighted avg       0.59      0.59      0.59      7980

evaluation on test set:
accuracy  0.577485380116959
              precision    recall  f1-score   support

           0       0.58      0.61      0.59      1726
           1       0.58      0.54      0.56      1694

   micro avg       0.58      0.58      0.58      3420
   macro avg       0.58      0.58      0.58      3420
weighted avg       0.58      0.58      0.58      3420





### Logistic Regression with L2 penalty

In [33]:
start_time = time.time()
tuned_parameters = [{'penalty': ['l2', 'l1'], 'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}]

logistic_model = GridSearchCV(LogisticRegression(random_state=0, multi_class='ovr', tol=1e-5, max_iter = 8000),
                              tuned_parameters, cv=10, scoring='f1', n_jobs=-1)
logistic_model.fit(x_train, y_train)
print("finished tuning penalized LR in ", time.time()-start_time)



finished tuning penalized LR in  18.871705055236816


In [34]:
print(logistic_model.best_params_)
fitted_val = logistic_model.predict(x_train)
print("evaluation on train set:")
print("accuracy ", accuracy_score(y_train, fitted_val))
print(classification_report(y_train, fitted_val))
pred = logistic_model.predict(x_test)
print("evaluation on test set:")
print("accuracy ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

{'C': 1000, 'penalty': 'l1'}
evaluation on train set:
accuracy  0.5902255639097744
              precision    recall  f1-score   support

           0       0.59      0.61      0.60      4026
           1       0.59      0.57      0.58      3954

   micro avg       0.59      0.59      0.59      7980
   macro avg       0.59      0.59      0.59      7980
weighted avg       0.59      0.59      0.59      7980

evaluation on test set:
accuracy  0.577485380116959
              precision    recall  f1-score   support

           0       0.58      0.61      0.59      1726
           1       0.58      0.54      0.56      1694

   micro avg       0.58      0.58      0.58      3420
   macro avg       0.58      0.58      0.58      3420
weighted avg       0.58      0.58      0.58      3420



### SVM

In [11]:
start_time = time.time()
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]

svc_model = GridSearchCV(SVC(random_state=0, tol=1e-5), tuned_parameters, cv=5, scoring='f1')
svc_model.fit(x_train, y_train)
print("finished tuning SVC in ", time.time()-start_time)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


finished tuning SVC in  210.1046531200409


In [12]:
svc_model.best_params_

{'C': 100, 'kernel': 'linear'}

In [13]:
fitted_val = svc_model.predict(x_train)
print("evaluation on train set:")
print("accuracy ", accuracy_score(y_train, fitted_val))
print(classification_report(y_train, fitted_val))
pred = svc_model.predict(x_test)
print("evaluation on test set:")
print("accuracy ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

evaluation on train set:
accuracy  0.5773182957393483
              precision    recall  f1-score   support

           0       0.58      0.57      0.58      4026
           1       0.57      0.59      0.58      3954

   micro avg       0.58      0.58      0.58      7980
   macro avg       0.58      0.58      0.58      7980
weighted avg       0.58      0.58      0.58      7980

evaluation on test set:
accuracy  0.5616959064327486
              precision    recall  f1-score   support

           0       0.57      0.56      0.56      1726
           1       0.56      0.56      0.56      1694

   micro avg       0.56      0.56      0.56      3420
   macro avg       0.56      0.56      0.56      3420
weighted avg       0.56      0.56      0.56      3420



### XGBoost

In [35]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1],
        'reg_alpha': [2],
        'reg_lambda': [4],
        'gamma': [0.5],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6],
        'max_depth': [3, 5],
        'n_estimators': [1000]
        }
xgb = XGBClassifier(learning_rate=0.02, objective='binary:logistic',
                    silent=True, nthread=-1)

In [36]:
xgb_model = GridSearchCV(estimator=xgb, param_grid=params, scoring='f1', n_jobs=-1, cv=5, verbose=1)
start_time = time.time()
xgb_model.fit(x_train, y_train)
print("finished tuning XGBoost in ", time.time()-start_time)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.2min finished


finished tuning XGBoost in  80.93284010887146


In [37]:
xgb_model.best_params_

{'colsample_bytree': 0.6,
 'gamma': 0.5,
 'max_depth': 3,
 'min_child_weight': 1,
 'n_estimators': 1000,
 'reg_alpha': 2,
 'reg_lambda': 4,
 'subsample': 0.6}

In [38]:
fitted_val = xgb_model.predict(x_train)
print("evaluation on train set:")
print("accuracy ", accuracy_score(y_train, fitted_val))
print(classification_report(y_train, fitted_val))
pred = xgb_model.predict(x_test)
print("evaluation on test set:")
print("accuracy ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

evaluation on train set:
accuracy  0.6988721804511279
              precision    recall  f1-score   support

           0       0.69      0.73      0.71      4026
           1       0.71      0.67      0.69      3954

   micro avg       0.70      0.70      0.70      7980
   macro avg       0.70      0.70      0.70      7980
weighted avg       0.70      0.70      0.70      7980

evaluation on test set:
accuracy  0.6274853801169591
              precision    recall  f1-score   support

           0       0.62      0.66      0.64      1726
           1       0.63      0.59      0.61      1694

   micro avg       0.63      0.63      0.63      3420
   macro avg       0.63      0.63      0.63      3420
weighted avg       0.63      0.63      0.63      3420



In [44]:
from sklearn.externals import joblib
joblib.dump(xgb_model.best_estimator_, 'xgb_model.pkl')
model = joblib.load('xgb_model.pkl')
print(type(model))

<class 'xgboost.sklearn.XGBClassifier'>


### Calculate confounding variable

In [141]:
def load_comment(path):
    df = pd.read_csv(path, low_memory=False)
    df['full_text'] = df['body'].astype(str).fillna('')
    return df

def load_post(path):
    df = pd.read_csv(path, low_memory=False)
    df["full_text"] = df["title"].str.cat(df["text"], sep=' ').fillna('')
    return df

def get_confounding_var(model, sl='Business'):
    reddit_dir = '../../rawdata/'
    comment_list = [y for x in os.walk(reddit_dir) for y in glob(os.path.join(x[0], '*.csv')) if 'comment' in y]
    post_list = [y for x in os.walk(reddit_dir) for y in glob(os.path.join(x[0], '*.csv')) if 'comment' not in y]

    subreddit_list = ['Business', 'entrepreneur', 'EntrepreneurRideAlong', 'Entrepreneurship',
                      'Growmybusiness', 'Ladybusiness', 'Smallbusiness', 'Startup_ideas', 'Startups',
                      'Venturecapital']
    col_list = ['fear', 'anger',
       'anticipation', 'trust', 'surprise', 'positive', 'negative', 'sadness',
       'disgust', 'joy', 'care', 'fairness', 'loyalty', 'authority', 'purity',
       'non_moral', 'sentistrength_pos', 'sentistrength_neg', 'sentistrength_neutral']
    train_cols=pd.Index(['care', 'fairness', 'loyalty', 'authority', 'purity', 'non_moral',
    'fear', 'anger', 'anticip', 'trust', 'surprise', 'positive', 'negative',
    'sadness', 'disgust', 'joy', 'sentistrength_pos', 'sentistrength_neg',
    'sentistrength_neutral'])
    slist = [r for r in comment_list if sl in r]
    comment = pd.DataFrame()
    for s in slist:
        df = load_comment(s)
        df = df[col_list + ['predicted_gender', 'textblob_wordcount', 'parent_id', 'id']]
        comment = pd.concat([comment, df], ignore_index=True, sort=False)
    comment.rename(columns={"anticipation": "anticip"}, inplace=True)
    plist = [r for r in post_list if sl in r]
    post = pd.DataFrame()
    for p in plist:
        df = load_post(p)
        post = pd.concat([post, df], ignore_index=True, sort=False)
    merged = pd.merge(comment, post[list(train_cols)+['name', 'predicted_gender']],
             how='inner', left_on='parent_id', right_on='name',
             suffixes=('_comment', '_post'))
#     clist = [c.split('_post')[0] for c in train_cols + ['_post']]
    post_sentiments = merged[train_cols+['_post']]
    post_sentiments.columns = [c.split('_post')[0] for c in train_cols + ['_post']]
    merged['confounding_var'] = [p[0] for p in model.predict_proba(post_sentiments)]
    merged['gender_comment'] = [0 if g=='male' else 1 for g in merged['predicted_gender_comment']]
    merged['gender_post'] = [0 if g=='male' else 1 for g in merged['predicted_gender_post']]
    merged.drop_duplicates(inplace=True)
    return merged

In [92]:
merged = get_confounding_var(model, sl='Business')

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [94]:
merged.head()

Unnamed: 0,fear_comment,anger_comment,anticip_comment,trust_comment,surprise_comment,positive_comment,negative_comment,sadness_comment,disgust_comment,joy_comment,...,disgust_post,joy_post,sentistrength_pos_post,sentistrength_neg_post,sentistrength_neutral_post,name,predicted_gender_post,confounding_var,gender_comment,gender_post
0,0.045455,0.090909,0.090909,0.136364,0.045455,0.227273,0.090909,0.090909,0.090909,0.090909,...,0.097561,0.097561,1,-1,0,t3_e8oi2a,male,0.378897,0,0
3,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.097561,0.097561,1,-1,0,t3_e8oi2a,male,0.378897,0,0
6,0.102041,0.102041,0.102041,0.102041,0.102041,0.122449,0.102041,0.081633,0.081633,0.102041,...,0.097561,0.097561,1,-1,0,t3_e8oi2a,male,0.378897,0,0
9,0.127273,0.090909,0.090909,0.109091,0.072727,0.090909,0.127273,0.109091,0.090909,0.090909,...,0.097561,0.097561,1,-1,0,t3_e8oi2a,male,0.378897,0,0
12,0.090909,0.090909,0.103896,0.090909,0.090909,0.116883,0.12987,0.103896,0.090909,0.090909,...,0.097561,0.097561,1,-1,0,t3_e8oi2a,male,0.378897,0,0


### Modeling comment gender using post gender and post/comment sentiment scores

### Logistic Regression with L1 penalty

In [142]:
def fit_logistic(subreddit = 'Business', save_merged=False):
    merged = get_confounding_var(model, sl=subreddit)
    col_list = pd.Index(['fear', 'anger',
       'anticip', 'trust', 'surprise', 'positive', 'negative', 'sadness',
       'disgust', 'joy', 'care', 'fairness', 'loyalty', 'authority', 'purity',
       'non_moral', 'sentistrength_pos', 'sentistrength_neg', 'sentistrength_neutral'])
    train_cols = list(col_list+'_post') + list(col_list+'_comment') + ['gender_post']
    
    x = merged[train_cols].astype(float)
    y = merged['gender_comment']
    x = (x-x.mean())/x.std()
    
    logit = sm.Logit(y, x, missing='drop')
    result = logit.fit_regularized(alpha=100, method='l1', maxiter=200)
    if save_merged:
        merged.to_csv('../output/merged_'+subreddit+'.csv', index=False, index_label=False)
    print(result.summary())
    return result

In [143]:
subreddit_list = ['Business', 'entrepreneur', 'EntrepreneurRideAlong', 'Entrepreneurship',
                  'Growmybusiness', 'Ladybusiness', 'Smallbusiness', 'Startup_ideas', 'Startups',
                  'Venturecapital']
for s in subreddit_list:
    print(s)
    result=fit_logistic(s)

Business
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6864973960642509
            Iterations: 66
            Function evaluations: 66
            Gradient evaluations: 66
                           Logit Regression Results                           
Dep. Variable:         gender_comment   No. Observations:                34962
Model:                          Logit   Df Residuals:                    34944
Method:                           MLE   Df Model:                           17
Date:                Tue, 21 Jul 2020   Pseudo R-squ.:                 -0.9062
Time:                        15:14:34   Log-Likelihood:                -23918.
converged:                       True   LL-Null:                       -12548.
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                    coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6690223297198438
            Iterations: 79
            Function evaluations: 79
            Gradient evaluations: 79
                           Logit Regression Results                           
Dep. Variable:         gender_comment   No. Observations:                76141
Model:                          Logit   Df Residuals:                    76119
Method:                           MLE   Df Model:                           21
Date:                Tue, 21 Jul 2020   Pseudo R-squ.:                 -0.3239
Time:                        15:14:45   Log-Likelihood:                -50718.
converged:                       True   LL-Null:                       -38311.
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6931471805599452
            Iterations: 1
            Function evaluations: 1
            Gradient evaluations: 1
                           Logit Regression Results                           
Dep. Variable:         gender_comment   No. Observations:                 2338
Model:                          Logit   Df Residuals:                     2338
Method:                           MLE   Df Model:                           -1
Date:                Tue, 21 Jul 2020   Pseudo R-squ.:                 -0.4446
Time:                        15:14:49   Log-Likelihood:                -1620.6
converged:                       True   LL-Null:                       -1121.8
Covariance Type:            nonrobust   LLR p-value:                       nan
                                    coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6899627839709466
            Iterations: 15
            Function evaluations: 15
            Gradient evaluations: 15
                           Logit Regression Results                           
Dep. Variable:         gender_comment   No. Observations:                 1739
Model:                          Logit   Df Residuals:                     1736
Method:                           MLE   Df Model:                            2
Date:                Tue, 21 Jul 2020   Pseudo R-squ.:                 0.02041
Time:                        15:14:53   Log-Likelihood:                -1178.6
converged:                       True   LL-Null:                       -1203.1
Covariance Type:            nonrobust   LLR p-value:                 2.174e-11
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6918465239885396
            Iterations: 34
            Function evaluations: 34
            Gradient evaluations: 34
                           Logit Regression Results                           
Dep. Variable:         gender_comment   No. Observations:                 6252
Model:                          Logit   Df Residuals:                     6245
Method:                           MLE   Df Model:                            6
Date:                Tue, 21 Jul 2020   Pseudo R-squ.:                 -0.9055
Time:                        15:15:02   Log-Likelihood:                -4309.3
converged:                       True   LL-Null:                       -2261.5
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6931469070711562
            Iterations: 4
            Function evaluations: 4
            Gradient evaluations: 4
                           Logit Regression Results                           
Dep. Variable:         gender_comment   No. Observations:                 2005
Model:                          Logit   Df Residuals:                     2004
Method:                           MLE   Df Model:                            0
Date:                Tue, 21 Jul 2020   Pseudo R-squ.:                  -1.073
Time:                        15:15:11   Log-Likelihood:                -1389.6
converged:                       True   LL-Null:                       -670.27
Covariance Type:            nonrobust   LLR p-value:                       nan
                                    coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------

In [136]:
result = fit_logistic('Ladybusiness', save_merged=True)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6899627839709466
            Iterations: 15
            Function evaluations: 15
            Gradient evaluations: 15
                           Logit Regression Results                           
Dep. Variable:         gender_comment   No. Observations:                 1739
Model:                          Logit   Df Residuals:                     1736
Method:                           MLE   Df Model:                            2
Date:                Tue, 21 Jul 2020   Pseudo R-squ.:                 0.02041
Time:                        14:54:59   Log-Likelihood:                -1178.6
converged:                       True   LL-Null:                       -1203.1
Covariance Type:            nonrobust   LLR p-value:                 2.174e-11
                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [137]:
np.diag(result.cov_params())

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan, 0.00259517,
       0.00255768,        nan,        nan, 0.00234267])

In [1]:
# # remove columns with low variance to ensure convergence
# std_list = mediation.describe().loc['std',:][1:]
# high_val_cols = list(std_list[std_list>0.1].index.values)
# logit = sm.Logit(mediation['predicted_gender'], mediation[high_val_cols].astype(float))
# result = logit.fit(method='bfgs', maxiter=70)
# print(result.summary())