### Twitter
from http://help.sentiment140.com/for-students/

The data is a CSV with emoticons removed. Data file format has 6 fields:  
0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)  
1 - the id of the tweet (2087)  
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)  
3 - the query (lyx). If there is no query, then this value is NO_QUERY.  
4 - the user that tweeted (robotickilldozr)  
5 - the text of the tweet (Lyx is cool)  


In [1]:
import sys, os, re, pickle, xlsxwriter
# sys.path.append("/home/jent_so/LM_GenderBias")
# sys.path.append("/home/jent_so/LM_GenderBias/terms")

import pandas as pd
import data_masking as masking
import numpy as np

import logging
# create logger
logger_twitter_prep = logging.getLogger('twitter_prep')
logger_twitter_prep.setLevel(logging.INFO)

INFO:root:imported term dicts: prons_m2f, prons_f2m, terms_m2f, terms_f2m; and sets; all_prons and all_terms
INFO:root:successfully imported the latest version of data_masking.


In [2]:
def clean_text(reviews):
    reviews = [re.sub('@[^\s]+','', line) for line in reviews]
    REPLACE_NO_SPACE = re.compile("[.;:!?,\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|(\')")
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    return reviews

### Split into test and training set
(!) Only once in the initialisation to keep the sets constant

In [8]:
df_train = pd.read_pickle('Twitter_training/Twitter_train')
df_test = pd.read_pickle('Twitter_training/Twitter_test')
df_train

df_train_ = pd.read_pickle("Twitter_l_train")
df_test_ = pd.read_pickle("Twitter_l_test")

In [20]:
df_train.to_pickle('Twitter_training/Twitter_original_train')
df_test.to_pickle('Twitter_training/Twitter_original_test')

In [9]:
df_train

Unnamed: 0,ID,text,label
524045,train_0_2193340125,i really wanna see you in glasgow tomorrow bu...,0
1503517,train_4_2071990209,getting ready to hit the sack 315 comes around...,4
675911,train_0_2248238260,i realized tomorrow is my last dance recital e...,0
1175180,train_4_1981058341,at the salon doing make up for a shoot at tod...,4
639764,train_0_2234803784,school photos chemistry retake not a good day,0
...,...,...,...
1190129,train_4_1983659245,made it to la everyone look for me at the movi...,4
1565965,train_4_2187664366,it s going to be good for sure and the new ...,4
1598193,train_4_2193121036,expand please,4
1020558,train_4_1882388002,#followfriday an awesome mom friend and all a...,4


In [10]:
df_test

Unnamed: 0,ID,text,label
1390936,test_4_2053270644,its a boring day and i need a boring movie to ...,4
1314870,test_4_2013961964,thank god they brought back the deep voice guy...,4
606001,test_0_2222443228,sad news the spice flow came to an end http ...,0
101957,test_0_1794727393,im usually a fighter but its kind of hard to f...,0
656613,test_0_2240575304,my brother and i miss you over at facebook,0
...,...,...,...
912730,test_4_1752384224,round 2,4
675598,test_0_2248143215,red lobster makes me so sick to my stomach ev...,0
121880,test_0_1833604010,i m freaking tired i m nervous about tomorrow ...,0
1453206,test_4_2063209643,i couldnt agree with you more,4


---  
---  
---  


### Step 1: Gender neutral data sets for training

- condition 1: **remove** gender terms - `_N`
- condition 2: **replace** and use both - `_M + _F`  

- three different dicts: (1) all = big dict terms (2) weat = dict only contains WEAT target terms (3) prons = dict only contains pronouns as target terms  
`text_all_M` | `text_all_F` | `text_all_N`  
`text_weat_M` | `text_weat_F` | `text_weat_N`  
`text_pron_M` | `text_pron_F` | `text_pron_N`

In [11]:
# Mask all terms in Data 

df_train_ = df_train.copy()
df_test_ = df_test.copy()

masking.make_all_df(df_train_)
print(df_train_.head(2))

masking.make_all_df(df_test_)
print(df_test_.head(2))

INFO:root: make_all_df: finish counts and length
INFO:root: make_all_df: finish text_all_M
INFO:root: make_all_df: finish text_all_F
INFO:root: make_all_df: finish text_all_N
INFO:root: make_all_df: finish text_weat_M
INFO:root: make_all_df: finish text_weat_F
INFO:root: make_all_df: finish text_weat_N
INFO:root: make_all_df: finish text_pro_M
INFO:root: make_all_df: finish text_pro_F
INFO:root: make_all_df: finish text_pro_N


                         ID  \
524045   train_0_2193340125   
1503517  train_4_2071990209   

                                                      text  label  \
524045    i really wanna see you in glasgow tomorrow bu...      0   
1503517  getting ready to hit the sack 315 comes around...      4   

                                               count_table  count_total  \
524045   {'bondman': 0, 'stewardess': 0, 'girlhood': 0,...            0   
1503517  {'bondman': 0, 'stewardess': 0, 'girlhood': 0,...            0   

                                          count_table_weat  count_weat  \
524045   {'men': 0, 'girl': 0, 'grandmother': 0, 'femin...           0   
1503517  {'men': 0, 'girl': 0, 'grandmother': 0, 'femin...           0   

         count_prons  len                                         text_all_M  \
524045             0   14   i really wanna see you in glasgow tomorrow bu...   
1503517            0   12  getting ready to hit the sack 315 comes around...   

        

INFO:root: make_all_df: finish counts and length
INFO:root: make_all_df: finish text_all_M
INFO:root: make_all_df: finish text_all_F
INFO:root: make_all_df: finish text_all_N
INFO:root: make_all_df: finish text_weat_M
INFO:root: make_all_df: finish text_weat_F
INFO:root: make_all_df: finish text_weat_N
INFO:root: make_all_df: finish text_pro_M
INFO:root: make_all_df: finish text_pro_F
INFO:root: make_all_df: finish text_pro_N


                        ID                                               text  \
1390936  test_4_2053270644  its a boring day and i need a boring movie to ...   
1314870  test_4_2013961964  thank god they brought back the deep voice guy...   

         label                                        count_table  \
1390936      4  {'bondman': 0, 'stewardess': 0, 'girlhood': 0,...   
1314870      4  {'bondman': 0, 'stewardess': 0, 'girlhood': 0,...   

         count_total                                   count_table_weat  \
1390936            0  {'men': 0, 'girl': 0, 'grandmother': 0, 'femin...   
1314870            2  {'men': 0, 'girl': 0, 'grandmother': 0, 'femin...   

         count_weat  count_prons  len  \
1390936           0            0   26   
1314870           0            0   15   

                                                text_all_M  \
1390936  its a boring day and i need a boring movie to ...   
1314870  thank god they brought back the deep voice guy...   

           

In [12]:
masking.check_df(df_test_)
masking.check_df(df_train_)

print(df_train_.shape)
print(df_test_.shape)

# Safe whole table (large)
df_train_.to_pickle("Twitter_l_train")
df_test_.to_pickle("Twitter_l_test")

INFO:root:all tests ok
INFO:root:all tests ok


(800000, 18)
(800000, 18)


In [13]:
df_train.head(2)

Unnamed: 0,ID,text,label
524045,train_0_2193340125,i really wanna see you in glasgow tomorrow bu...,0
1503517,train_4_2071990209,getting ready to hit the sack 315 comes around...,4


In [14]:
df_train_.head(10)

Unnamed: 0,ID,text,label,count_table,count_total,count_table_weat,count_weat,count_prons,len,text_all_M,text_all_F,text_all_N,text_weat_M,text_weat_F,text_weat_N,text_pro_M,text_pro_F,text_pro_N
524045,train_0_2193340125,i really wanna see you in glasgow tomorrow bu...,0,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,14,i really wanna see you in glasgow tomorrow bu...,i really wanna see you in glasgow tomorrow bu...,i really wanna see you in glasgow tomorrow bu...,i really wanna see you in glasgow tomorrow bu...,i really wanna see you in glasgow tomorrow bu...,i really wanna see you in glasgow tomorrow bu...,i really wanna see you in glasgow tomorrow bu...,i really wanna see you in glasgow tomorrow bu...,i really wanna see you in glasgow tomorrow bu...
1503517,train_4_2071990209,getting ready to hit the sack 315 comes around...,4,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,12,getting ready to hit the sack 315 comes around...,getting ready to hit the sack 315 comes around...,getting ready to hit the sack 315 comes around...,getting ready to hit the sack 315 comes around...,getting ready to hit the sack 315 comes around...,getting ready to hit the sack 315 comes around...,getting ready to hit the sack 315 comes around...,getting ready to hit the sack 315 comes around...,getting ready to hit the sack 315 comes around...
675911,train_0_2248238260,i realized tomorrow is my last dance recital e...,0,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,12,i realized tomorrow is my last dance recital e...,i realized tomorrow is my last dance recital e...,i realized tomorrow is my last dance recital e...,i realized tomorrow is my last dance recital e...,i realized tomorrow is my last dance recital e...,i realized tomorrow is my last dance recital e...,i realized tomorrow is my last dance recital e...,i realized tomorrow is my last dance recital e...,i realized tomorrow is my last dance recital e...
1175180,train_4_1981058341,at the salon doing make up for a shoot at tod...,4,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,11,at the salon doing make up for a shoot at tod...,at the salon doing make up for a shoot at tod...,at the salon doing make up for a shoot at tod...,at the salon doing make up for a shoot at tod...,at the salon doing make up for a shoot at tod...,at the salon doing make up for a shoot at tod...,at the salon doing make up for a shoot at tod...,at the salon doing make up for a shoot at tod...,at the salon doing make up for a shoot at tod...
639764,train_0_2234803784,school photos chemistry retake not a good day,0,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,8,school photos chemistry retake not a good day,school photos chemistry retake not a good day,school photos chemistry retake not a good day,school photos chemistry retake not a good day,school photos chemistry retake not a good day,school photos chemistry retake not a good day,school photos chemistry retake not a good day,school photos chemistry retake not a good day,school photos chemistry retake not a good day
1494174,train_4_2069640838,just go to your bank and file a unauthorized ...,4,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,17,just go to your bank and file a unauthorized ...,just go to your bank and file a unauthorized ...,just go to your bank and file a unauthorized ...,just go to your bank and file a unauthorized ...,just go to your bank and file a unauthorized ...,just go to your bank and file a unauthorized ...,just go to your bank and file a unauthorized ...,just go to your bank and file a unauthorized ...,just go to your bank and file a unauthorized ...
958640,train_4_1825796772,i know you ve got a link that will explain wh...,4,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,28,i know you ve got a link that will explain wh...,i know you ve got a link that will explain wh...,i know you ve got a link that will explain wh...,i know you ve got a link that will explain wh...,i know you ve got a link that will explain wh...,i know you ve got a link that will explain wh...,i know you ve got a link that will explain wh...,i know you ve got a link that will explain wh...,i know you ve got a link that will explain wh...
1497279,train_4_2070346069,been listening to your cd a lot while chillin...,4,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,17,been listening to your cd a lot while chillin...,been listening to your cd a lot while chillin...,been listening to your cd a lot while chillin...,been listening to your cd a lot while chillin...,been listening to your cd a lot while chillin...,been listening to your cd a lot while chillin...,been listening to your cd a lot while chillin...,been listening to your cd a lot while chillin...,been listening to your cd a lot while chillin...
1496267,train_4_2070090243,headin 4 de bedactually i m in de bedbut mah e...,4,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",1,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,31,headin 4 de bedactually i m in de bedbut mah e...,headin 4 de bedactually i m in de bedbut mah e...,headin 4 de bedactually i m in de bedbut mah e...,headin 4 de bedactually i m in de bedbut mah e...,headin 4 de bedactually i m in de bedbut mah e...,headin 4 de bedactually i m in de bedbut mah e...,headin 4 de bedactually i m in de bedbut mah e...,headin 4 de bedactually i m in de bedbut mah e...,headin 4 de bedactually i m in de bedbut mah e...
547452,train_0_2202079302,i watched jada s new show hawthorn i wasn t im...,0,"{'bondman': 0, 'stewardess': 0, 'girlhood': 0,...",0,"{'men': 0, 'girl': 0, 'grandmother': 0, 'femin...",0,0,17,i watched jada s new show hawthorn i wasn t im...,i watched jada s new show hawthorn i wasn t im...,i watched jada s new show hawthorn i wasn t im...,i watched jada s new show hawthorn i wasn t im...,i watched jada s new show hawthorn i wasn t im...,i watched jada s new show hawthorn i wasn t im...,i watched jada s new show hawthorn i wasn t im...,i watched jada s new show hawthorn i wasn t im...,i watched jada s new show hawthorn i wasn t im...


In [15]:
# We see that 7/8 of the samples has no term included

print(df_train_[df_train_['count_total']> 0].shape) 
print(df_train_.shape)

(108800, 18)
(800000, 18)


## From the large table, we now produce specific test and  training sets 

Safe training and test dataframes for different training conditions.  
neutral and mixed 

In [16]:
# neutral
for spec in ['_all', '_pro', '_weat']:
    df_train_[['ID', 'text'+spec+'_N', 'label']].to_pickle('Twitter_training/Twitter_N'+spec+'_train')
    df_test_[['ID', 'text'+spec+'_N', 'label']].to_pickle('Twitter_training/Twitter_N'+spec+'_test')

# mixed M+F
for spec in ['_all', '_pro', '_weat']: 
    m_tr = df_train_[['ID', 'text'+spec+'_M', 'label']].rename(columns={'text'+spec+'_M': 'text'})
    f_tr = df_train_[['ID', 'text'+spec+'_F', 'label']].rename(columns={'text'+spec+'_F': 'text'})
    tr = m_tr.append(f_tr)
    tr.to_pickle('Twitter_training/Twitter_mix' + spec + '_train') 
    
    m_te = df_test_[['ID', 'text'+spec+'_M', 'label']].rename(columns={'text'+spec+'_M': 'text'})
    f_te = df_test_[['ID', 'text'+spec+'_F', 'label']].rename(columns={'text'+spec+'_F': 'text'})
    te = m_te.append(f_te)
    te.to_pickle('Twitter_training/Twitter_mix' + spec + '_test') 
    
    print(tr.shape, te.shape)    

(1600000, 3) (1600000, 3)
(1600000, 3) (1600000, 3)
(1600000, 3) (1600000, 3)


### Create Data Sets with no only samples that do not contain any term of the dict
dicts are again pron, weat, alll

In [18]:
# no term sample

df_train_no_pron = df_train_[df_train_['count_total'] == 0][['ID', 'text', 'label']]
print(df_train_no_pron.shape)
df_test_no_pron = df_test_[df_test_['count_total'] == 0][['ID', 'text', 'label']]
print(df_test_no_pron.shape)

df_train_no_weat = df_train_[df_train_['count_weat'] == 0][['ID', 'text', 'label']]
print(df_train_no_weat.shape)
df_test_no_weat = df_test_[df_test_['count_weat'] == 0][['ID', 'text', 'label']]
print(df_test_no_weat.shape)

df_train_no_all = df_train_[df_train_['count_prons'] == 0][['ID', 'text', 'label']]
print(df_train_no_all.shape)
df_test_no_all = df_test_[df_test_['count_prons'] == 0][['ID', 'text', 'label']]
print(df_test_no_all.shape)


df_train_no_pron.to_pickle('Twitter_training/Twitter_no_pron_train')
df_test_no_pron.to_pickle('Twitter_training/Twitter_no_pron_test')
df_train_no_weat.to_pickle('Twitter_training/Twitter_no_weat_train')
df_test_no_weat.to_pickle('Twitter_training/Twitter_no_weat_test')
df_train_no_all.to_pickle('Twitter_training/Twitter_no_all_train')
df_test_no_all.to_pickle('Twitter_training/Twitter_no_all_test')

(691200, 3)
(691347, 3)
(725866, 3)
(725551, 3)
(750334, 3)
(750541, 3)


### Create Data Sets with no only samples that do contain a minimal number of term of the dict
dicts are again pron, weat, all  
minimal number should most likely be 1 for IMDB

In [19]:
min_term_count = 1
df_train__ = df_train_.rename(columns={'count_total': 'count_all', 'count_prons': 'count_pro'})
df_test__ = df_test_.rename(columns={'count_total': 'count_all', 'count_prons': 'count_pro'})


for spec in ['_all', '_pro', '_weat']:
    
    df_train_MIN = df_train__[df_train__['count'+spec] >= min_term_count]
    df_test_MIN = df_test__[df_test__['count'+spec] >= min_term_count]
    
    # all
    df_train_MIN[['ID', 'text', 'label']].to_pickle('Twitter_training/Twitter_MIN' + spec + '_test')
    df_test_MIN[['ID', 'text', 'label']].to_pickle('Twitter_training/Twitter_MIN' + spec + '_train')
    
    # neutral M+F
    df_train_MIN[['ID', 'text'+spec+'_N', 'label']].to_pickle('Twitter_training/Twitter_MIN_N'+spec+'_train')
    df_test_MIN[['ID', 'text'+spec+'_N', 'label']].to_pickle('Twitter_training/Twitter_MIN_N'+spec+'_test')

    # mixed
    m_tr = df_train_MIN[['ID', 'text'+spec+'_M', 'label']].rename(columns={'text'+spec+'_M': 'text'})
    f_tr = df_train_MIN[['ID', 'text'+spec+'_F', 'label']].rename(columns={'text'+spec+'_F': 'text'})
    tr = m_tr.append(f_tr)
    tr.to_pickle('Twitter_training/Twitter_MIN_mix' + spec + '_train') 
    
    m_te = df_test_MIN[['ID', 'text'+spec+'_M', 'label']].rename(columns={'text'+spec+'_M': 'text'})
    f_te = df_test_MIN[['ID', 'text'+spec+'_F', 'label']].rename(columns={'text'+spec+'_F': 'text'})
    te = m_te.append(f_te)
    te.to_pickle('Twitter_training/Twitter_MIN_mix' + spec + '_test') 
    
    print(tr.shape, te.shape)    

(217600, 3) (217306, 3)
(99332, 3) (98918, 3)
(148268, 3) (148898, 3)
