# Authorship Profiling

### Name: Ruchira Shidhaye


In [1]:
## Installing libraries if not present already:

#!pip install nltk
#!pip install sklearn

In [2]:
## Import the necessary libraries:

import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET                                      ## Extracting the data from xml files

In [3]:

from nltk.corpus import stopwords
from nltk import word_tokenize    
from nltk.tokenize import RegexpTokenizer
from nltk.probability import *
import re
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import  BaggingClassifier

In [4]:
## Read the train_labels and test files into dataframes using pandas:

df=pd.read_csv('train_labels.csv')
df_test=pd.read_csv('test.csv')

In [5]:
## Declare an empty list for storing all the content of our xml files belonging to training set one by one:

text_train=[]

## A loop which iterates through the train_labels dataframe and extracts the data:

for i in df['id']:
    name=i+'.xml'
    name='data/'+name
    tree = ET.parse(name)
    my_text = [item.text for item in tree.iter()]
    text_train.append(my_text)

In [6]:
## Declare an empty list for storing all the content of our xml files belonging to test set one by one:

text_test=[]

## A loop which iterates through the test_labels dataframe and extracts the data:

for i in df_test['id']:
    name=i+'.xml'
    name='data/'+name
    tree = ET.parse(name)
    my_text = [item.text for item in tree.iter()]
    text_test.append(my_text)

In [7]:
## Assigning to different variables:

trainDocs=text_train
testDocs=text_test

In [8]:
## Defining the lemmatizer which will convert the words into their dictionary form:

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl=WordNetLemmatizer()
    def __call__(self,doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

### Pre-processing steps:

- Following are the pre-processing steps which are performed on the text data extracted from xml files in an order:
    
    1. Removing **https** tags, mentions of another person or account that start with **@**, **#** symbols but the words associated with them are retained.
    2. Removing the **emojis** or **emoticons** from the text.
    3. Converting all words to lowercase and then tokenizing them using **RegexpTokenizer**.
    4. Removing **stopwords** from the tokens.
    5. Removing numbers but keeping alphanumeric words.
    6. Removing single character words.

In [9]:
## Joining the data in text into a string:

trainDocs = [" ".join(x) for x in text_train]
testDocs = [" ".join(x) for x in text_test]


In [10]:
## Function to remove emojis:

def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

In [11]:
## Creating a list of stopwords belonging to english language:

stoplist = set(stopwords.words("english"))

In [12]:
## Pre-processing of training texts:

## A tokenizer that splits a string using a regular expression, which matches the tokens as gaps is False:

tokenizer = RegexpTokenizer(r"[A-Za-z]\w+(?:[-'?]\w+)?", gaps=False)  

## Iterating through training set:

for i in range(len(trainDocs)):
    trainDocs[i]=re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',trainDocs[i])   ## Removing the https tags
    trainDocs[i]=re.sub('@[^\s]+','',trainDocs[i])                             ## Removing the mentions of people
    trainDocs[i]=re.sub(r'#([^\s]+)', r'\1',trainDocs[i])                      ## Removing the # symbol but keeping the word attached with it
    trainDocs[i]=deEmojify(trainDocs[i])                                       ## Removing the emojis
    trainDocs[i] = trainDocs[i].lower()                                        ## Converting to lowercase
    trainDocs[i] = tokenizer.tokenize(trainDocs[i])                            ## Split into words
    nst=[]                                                                     ## A list to hold words not in stopwords list
    for j in trainDocs[i]:
        if j not in stoplist:
            nst.append(j)
    trainDocs[i]=nst                                                           ## Finally appending clean tokens
            


In [13]:
## Remove numbers, but not words that contain numbers:

trainDocs = [[token for token in doc if not token.isnumeric()] for doc in trainDocs]

## Remove words that are only one character:

trainDocs = [[token for token in doc if len(token) > 1] for doc in trainDocs]

In [14]:
## Pre-processing of testing texts:

## A tokenizer that splits a string using a regular expression, which matches the tokens as gaps is False:

tokenizer = RegexpTokenizer(r"[A-Za-z]\w+(?:[-'?]\w+)?", gaps=False)  

## Iterating through testing set:

for i in range(len(testDocs)):
    testDocs[i]=re.sub('(www|http:|https:)+[^\s]+[\w]','',testDocs[i])        ## Removing the https tags
    testDocs[i]=re.sub('@.*?\s+','',testDocs[i])                              ## Removing the mentions of people
    testDocs[i]=re.sub(r'#([^\s]+)', r'\1',testDocs[i])                       ## Removing the # symbol but keeping the word attached with it
    testDocs[i]=deEmojify(testDocs[i])                                        ## Removing the emojis
    testDocs[i] = testDocs[i].lower()                                         ## Converting to lowercase
    testDocs[i] = tokenizer.tokenize(testDocs[i])                             ## Split into words.
    nst=[]                                                                    ## A list to hold words not in stopwords list
    for j in testDocs[i]:
        if j not in stoplist:
            nst.append(j)
    testDocs[i]=nst                                                           ## Finally appending clean tokens


In [15]:
## Remove numbers, but not words that contain numbers:

testDocs = [[token for token in doc if not token.isnumeric()] for doc in testDocs]

## Remove words that are only one character:

testDocs = [[token for token in doc if len(token) > 1] for doc in testDocs]

In [16]:
## Transforming text to feature vectors that can be used as input to estimator using TFIDF Vectorizer:

## Setting analyzer to word for outputting words and phrases of ngram_range(unigrams, bigrams and trigrams)
## Setting min_df=3 indicating to not consider words appearing in less than 3 documents and max_df to 3050 meaning not to
## consider words appearing in more than 3050 documents:

word_vector=TfidfVectorizer(analyzer='word',input='content',
                           min_df=3,ngram_range=(1,3),max_df=3050,           
                          tokenizer=LemmaTokenizer()
                           )

## Setting analyzer to char for outputting character ngrams:

char_vector=TfidfVectorizer(analyzer='char',input='content',
                           min_df=0,ngram_range=(2,3),
                          tokenizer=LemmaTokenizer()
                           )                      

In [17]:
## Using FeatureUnion to combine word and character features:

vectorizer = FeatureUnion([("chars",char_vector),("words",word_vector)])

In [18]:
## Converting the data into a format which is suitable for TFIDF Vectorizer for training set:

inputc = [" ".join(x) for x in trainDocs]

In [19]:
## Applying the TFIDF Vectorizer on the training text:

x_train=vectorizer.fit_transform(inputc)
y_train=np.asarray(df['gender'])                                             ## Training labels

In [20]:
len(vectorizer.get_feature_names())

161412

In [21]:
## Converting the data into a format which is suitable for TFIDF Vectorizer for test set:

input_test = [" ".join(x) for x in testDocs]

In [22]:
## Applying the TFIDF Vectorizer on the testing text:

x_test=vectorizer.transform(input_test)

In [23]:
## Actual testing labels:

test_actual=pd.read_csv('test_labels.csv')
y_test=test_actual.gender.tolist()

## Model Building and Evaluation:

In [24]:
np.random.seed(94)                                                  ## Setting the random seed for reproducibility of results

from warnings import simplefilter
## ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

models = [  
    
    ## Define the PassiveAggressiveClassifier model to be used with regularization parameter set to 5.2:

    BaggingClassifier(PassiveAggressiveClassifier(C=5.2))               

]

for clf in models:                                                      ## Run the loop to use one model at each iteration in case of multiple models
    model_name = clf.__class__.__name__
    clf.fit(x_train, y_train)                                           ## Fit the model on the train set
    print(model_name)
    
                                                                        ## Prediction on test set
    y_predict=clf.predict(x_test)   
    print(confusion_matrix(y_test,y_predict))
    recall=recall_score(y_test,y_predict,average='macro')
    precision=precision_score(y_test,y_predict,average='macro')
    f1score=f1_score(y_test,y_predict,average='macro')
    accuracy=accuracy_score(y_test,y_predict)                           ## Accuracy score between true and predicted labels
    matthews = matthews_corrcoef(y_test,y_predict) 
    print('Accuracy: '+ str(accuracy))
    print('Macro Precision: '+ str(precision))
    print('Macro Recall: '+ str(recall))
    print('Macro F1 score:'+ str(f1score))
    print('MCC:'+ str(matthews))

   

BaggingClassifier
[[212  40]
 [ 38 210]]
Accuracy: 0.844
Macro Precision: 0.844
Macro Recall: 0.8440220174091142
Macro F1 score:0.8439975039600633
MCC:0.6880220170568243


In [25]:
## Saving the predicted labels to gender column of test dataframe:

df_test['gender']=y_predict

In [26]:
## Removing the language column from the test dataframe:

del df_test['language']

In [27]:
## Saving the predictions to a csv file:

df_test.to_csv('pred_labels.csv',index=False)

### References
<li>https://stackoverflow.com/questions/41089578/finding-xml-text-content-from-tag-name-in-python
    <li>https://stackoverflow.com/questions/43797500/python-replace-unicode-emojis-with-ascii-characters/43813727#43813727
        <li>https://www.slideshare.net/PyData/authorship-attribution-forensic-linguistics-with-python-scikit-learn-pandas-kostas-perifanos
            <li>Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.