## Dictionary Details 

1. r["title"] tells you the noramlized title
2. r["gender"] tells you the gender (binary for simplicity, determined from the pronouns)3. 
3. r["start_pos"] indicates the length of the first sentence.
4. r["raw"] has the entire bio
5. The field r["bio"] contains a scrubbed version of the bio (with the person's name and obvious gender words (like she/he removed)


## Problem Statement 

So the classification task is to predict r["title"] from r["raw"][r["start_pos"]:]


In [9]:
import pandas as pd

train_data=pd.read_csv('Data/Train.csv')
val_data =pd.read_csv('Data/Val.csv')
test_data =pd.read_csv('Data/Test.csv')


In [12]:
train_data[train_data['gender']=='F']['raw'].iloc[1]


'Muriel Gillick is a physician specializing in the care of elderly patients and adults of all ages who are facing serious, life-threatening illness. She is a staff physician at Harvard Vanguard Medical Associates and a Professor of Population Medicine at Harvard Medical School/Harvard Pilgrim Health Care Institute. She provides consultation to patients at the Brigham and Women Hospital, a major Harvard teaching hospital.'

#### Example Dictionary Element

In [None]:
test_bio = all_bios[0]
test_bio['raw'][test_bio['start_pos']:]

In [None]:
test_bio['raw']

### Distribution of occupation

In [None]:
occupation_dict={}
for bio in all_bios:
    occupation=bio['title']
    try:
        occupation_dict[occupation] += 1
    except KeyError:
        occupation_dict[occupation] = 1


In [None]:
occupation_dict

In [None]:
import matplotlib.pyplot as plt
import numpy as np

occupation_dict={k: v for k, v in sorted(occupation_dict.items(), key=lambda item: item[1], reverse=True)}
keys = occupation_dict.keys()
vals = occupation_dict.values()

plt.figure(figsize=(30,10))

plt.bar(keys, list(vals), label="Real distribution")


plt.ylabel ('Count')
plt.yscale('log')
plt.xlabel ('Occupation')
plt.xticks(list(keys))
plt.legend (bbox_to_anchor=(1, 1), loc="upper right", borderaxespad=0.)
plt.tight_layout()
plt.savefig('occupation_distribution.png')

### Model 2 : Word_Embeddings

In [4]:
from __future__ import print_function
from gensim.models import KeyedVectors
fname='../../../../embeddings/wiki-news-300d-1M.vec'

# Creating the model
## Takes a lot of time depending on the vector file size 
en_model = KeyedVectors.load_word2vec_format(fname)

# # Getting the tokens 
# words = []
# for word in en_model.vocab:
#     words.append(word)

# # Printing out number of tokens available
# print("Number of Tokens: {}".format(len(words)))

# # Printing out the dimension of a word vector 
# print("Dimension of a word vector: {}".format(
#     len(en_model[words[0]])
# ))


In [52]:
import numpy as np
def get_average_embedding(sentence):
    tokens=sentence.lower().split()
    embedding = np.zeros([300,],dtype=np.float32)
    for tok in tokens:
        try:
            embedding+=en_model[tok]
        except KeyError:
            pass
    if(len(tokens)>0):
        return embedding/len(tokens)
    else:
        return embedding

In [53]:
em=get_average_embedding("The sun comes up")

In [62]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_data['title'])

le.classes_
# array([1, 2, 6])
# >>> le.transform([1, 1, 2, 6])
# array([0, 0, 1, 2]...)
# >>> le.inverse_transform([0, 0, 1, 2])
# array([1, 1, 2, 6])

TypeError: argument must be a string or number

In [73]:
from tqdm import tqdm_notebook,tqdm
from sklearn import preprocessing


train_wo_g=[]
train_w_g=[]
val_wo_g=[]
val_w_g=[]
test_wo_g=[]
test_w_g=[]

encoder = preprocessing.LabelEncoder()
encoder.classes_ = np.load('classes.npy')

for index,row in tqdm(train_data.iterrows()):
    try:
        index_to_start=int(row['start_pos'])
    except:
        continue
    tuple1=[get_average_embedding(row['raw'][index_to_start:]),encoder.transform([row['title']])[0],row['gender']]
    tuple2=[get_average_embedding(row['bio'][index_to_start:]),encoder.transform([row['title']])[0],row['gender']]
    train_w_g.append(tuple1)
    train_wo_g.append(tuple2)


for index,row in tqdm(val_data.iterrows()):
    try:
        index_to_start=int(row['start_pos'])
    except:
        continue
    tuple1=[get_average_embedding(row['raw'][index_to_start:]),encoder.transform([row['title']])[0],row['gender']]
    tuple2=[get_average_embedding(row['bio'][index_to_start:]),encoder.transform([row['title']])[0],row['gender']]
    val_w_g.append(tuple1)
    val_wo_g.append(tuple2)

for index,row in tqdm(test_data.iterrows()):
    try:
        index_to_start=int(row['start_pos'])
    except:
        continue
    tuple1=[get_average_embedding(row['raw'][index_to_start:]),encoder.transform([row['title']])[0],row['gender']]
    tuple2=[get_average_embedding(row['bio'][index_to_start:]),encoder.transform([row['title']])[0],row['gender']]
    test_w_g.append(tuple1)
    test_wo_g.append(tuple2)

    



# for bio in tqdm(all_bios):
#     index_to_start=bio['start_pos']
#     tuple1=[get_average_embedding(bio['raw'][index_to_start:]),bio['title'],bio['gender']]
#     tuple2=[get_average_embedding(bio['bio'][index_to_start:]),bio['title'],bio['gender']]
#     data_with_g.append(tuple1)
#     data_without_g.append(tuple2)
    
    
    




0it [00:00, ?it/s][A[A[A


136it [00:00, 1358.99it/s][A[A[A


288it [00:00, 1399.69it/s][A[A[A


430it [00:00, 1395.94it/s][A[A[A


578it [00:00, 1416.37it/s][A[A[A


719it [00:00, 1412.93it/s][A[A[A


873it [00:00, 1447.63it/s][A[A[A


1040it [00:00, 1507.49it/s][A[A[A


1200it [00:00, 1532.57it/s][A[A[A


1351it [00:00, 1522.37it/s][A[A[A


1499it [00:01, 1486.71it/s][A[A[A


1656it [00:01, 1507.33it/s][A[A[A


1807it [00:01, 1505.17it/s][A[A[A


1969it [00:01, 1534.39it/s][A[A[A


2122it [00:01, 1489.97it/s][A[A[A


2271it [00:01, 1455.80it/s][A[A[A


2417it [00:01, 1391.84it/s][A[A[A


2566it [00:01, 1417.14it/s][A[A[A


2709it [00:01, 1326.38it/s][A[A[A


2844it [00:01, 1330.71it/s][A[A[A


3012it [00:02, 1418.53it/s][A[A[A


3178it [00:02, 1482.72it/s][A[A[A


3347it [00:02, 1536.01it/s][A[A[A


3503it [00:02, 1534.02it/s][A[A[A


3658it [00:02, 1535.32it/s][A[A[A


3825it [00:02, 1570.56it/s][A[A[A


3104it [00:02, 1508.19it/s][A[A[A


3262it [00:02, 1528.75it/s][A[A[A


3417it [00:02, 1533.39it/s][A[A[A


3572it [00:02, 1506.37it/s][A[A[A


3726it [00:02, 1512.36it/s][A[A[A


3878it [00:02, 1503.74it/s][A[A[A


4037it [00:02, 1528.53it/s][A[A[A


4202it [00:02, 1561.05it/s][A[A[A


4361it [00:02, 1566.84it/s][A[A[A


4531it [00:03, 1603.70it/s][A[A[A


4702it [00:03, 1630.91it/s][A[A[A


4875it [00:03, 1658.98it/s][A[A[A


5042it [00:03, 1652.58it/s][A[A[A


5220it [00:03, 1688.67it/s][A[A[A


5390it [00:03, 1658.38it/s][A[A[A


5557it [00:03, 1607.22it/s][A[A[A


5734it [00:03, 1651.95it/s][A[A[A


5900it [00:03, 1635.27it/s][A[A[A


6065it [00:03, 1635.18it/s][A[A[A


6229it [00:04, 1615.51it/s][A[A[A


6391it [00:04, 1587.75it/s][A[A[A


6551it [00:04, 1522.84it/s][A[A[A


6718it [00:04, 1563.47it/s][A[A[A


6893it [00:04, 1614.09it/s][A[A[A


7065it [00:04, 1643.57it/s][A[A[A


7231it [00:04, 1644.35it/

### Model with gender present 


In [77]:

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier


train_x= np.array([list(ele[0]) for ele in train_w_g])
train_y=np.array([ele[1] for ele in train_w_g])
val_x=np.array([list(ele[0]) for ele in val_w_g])
val_y=np.array([ele[1] for ele in val_w_g])
test_x=np.array([list(ele[0]) for ele in test_w_g])
test_y=np.array([ele[1] for ele in test_w_g])

model

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [82]:
model=OneVsRestClassifier(LogisticRegression(verbose=1,n_jobs=10))
model.fit(train_x,train_y)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   22.3s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   19.7s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   27.4s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   17.9s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   16.8s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   27.6s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto', n_jobs=10,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=1, warm_start=False),
                    n_jobs=None)

In [95]:
pred_x=model.predict(test_x)
pred_decoded=encoder.inverse_transform(list(pred_x))
gt_decoded=encoder.inverse_transform(list(test_y))

tuples_test=[[ele[2]] for ele in test_w_g]

df=pd.DataFrame(tuples_test,columns=['gender'])
df['pred_title']=pred_decoded
df['title']=gt_decoded

In [96]:
df

Unnamed: 0,gender,pred_title,title
0,M,professor,professor
1,M,professor,architect
2,F,teacher,teacher
3,M,composer,composer
4,M,surgeon,surgeon
...,...,...,...
20836,F,physician,physician
20837,F,attorney,attorney
20838,M,attorney,attorney
20839,M,teacher,teacher


In [97]:
df.to_csv('WE_LR_with_gender.csv',index=False)

In [99]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier


train_x= np.array([list(ele[0]) for ele in train_wo_g])
train_y=np.array([ele[1] for ele in train_wo_g])
val_x=np.array([list(ele[0]) for ele in val_wo_g])
val_y=np.array([ele[1] for ele in val_wo_g])
test_x=np.array([list(ele[0]) for ele in test_wo_g])
test_y=np.array([ele[1] for ele in test_wo_g])

model=OneVsRestClassifier(LogisticRegression(verbose=1,n_jobs=10))
model.fit(train_x,train_y)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:  4.7min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:  2.1min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:  2.4min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   42.9s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:  1.2min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:  1.4min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto', n_jobs=10,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=1, warm_start=False),
                    n_jobs=None)

In [100]:
pred_x=model.predict(test_x)
pred_decoded=encoder.inverse_transform(list(pred_x))
gt_decoded=encoder.inverse_transform(list(test_y))

tuples_test=[[ele[2]] for ele in test_wo_g]

df=pd.DataFrame(tuples_test,columns=['gender'])
df['pred_title']=pred_decoded
df['title']=gt_decoded

In [101]:
df.to_csv('WE_LR_without_gender.csv',index=False)

In [72]:
encoder.transform(['dj'])[0]

8